Index: head/sys/alpha/osf1/osf1_misc.c =================================================================== --- head/sys/alpha/osf1/osf1_misc.c (revision 89305) +++ head/sys/alpha/osf1/osf1_misc.c (revision 89306) @@ -1,1822 +1,1821 @@ /* $NetBSD: osf1_misc.c,v 1.14 1998/05/20 16:34:29 chs Exp $ */ /* * Copyright (c) 1994, 1995 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. 
*/ /* * Additional Copyright (c) 1999 by Andrew Gallatin * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/selinfo.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void cvtstat2osf1 __P((struct stat *, struct osf1_stat *)); static int osf2bsd_pathconf __P((int *)); static const char osf1_emul_path[] = "/compat/osf1"; /* * [ taken from the linux emulator ] * Search an alternate path before passing pathname arguments on * to system calls. Useful for keeping a separate 'emulation tree'. * * If cflag is set, we check if an attempt can be made to create * the named file, i.e. we check if the directory it should * be in exists. */ int osf1_emul_find(td, sgp, prefix, path, pbuf, cflag) struct thread *td; caddr_t *sgp; /* Pointer to stackgap memory */ const char *prefix; char *path; char **pbuf; int cflag; { int error; size_t len, sz; char *buf, *cp, *ptr; struct ucred *ucred; struct nameidata nd; struct nameidata ndroot; struct vattr vat; struct vattr vatroot; buf = (char *) malloc(MAXPATHLEN, M_TEMP, M_WAITOK); *pbuf = path; for (ptr = buf; (*ptr = *prefix) != '\0'; ptr++, prefix++) continue; sz = MAXPATHLEN - (ptr - buf); /* * If sgp is not given then the path is already in kernel space */ if (sgp == NULL) error = copystr(path, ptr, sz, &len); else error = copyinstr(path, ptr, sz, &len); if (error) { free(buf, M_TEMP); return error; } if (*ptr != '/') { free(buf, M_TEMP); return EINVAL; } /* * We know that there is a / somewhere in this pathname. 
* Search backwards for it, to find the file's parent dir * to see if it exists in the alternate tree. If it does, * and we want to create a file (cflag is set). We don't * need to worry about the root comparison in this case. */ if (cflag) { for (cp = &ptr[len] - 1; *cp != '/'; cp--) ; *cp = '\0'; NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } *cp = '/'; } else { NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td); if ((error = namei(&nd)) != 0) { free(buf, M_TEMP); return error; } /* * We now compare the vnode of the osf1_root to the one * vnode asked. If they resolve to be the same, then we * ignore the match so that the real root gets used. * This avoids the problem of traversing "../.." to find the * root directory and never finding it, because "/" resolves * to the emulation root directory. This is expensive :-( */ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, osf1_emul_path, td); if ((error = namei(&ndroot)) != 0) { /* Cannot happen! 
*/ free(buf, M_TEMP); vrele(nd.ni_vp); return error; } ucred = td->td_proc->p_ucred; if ((error = VOP_GETATTR(nd.ni_vp, &vat, ucred, td)) != 0) { goto bad; } if ((error = VOP_GETATTR(ndroot.ni_vp, &vatroot, ucred, td)) != 0) { goto bad; } if (vat.va_fsid == vatroot.va_fsid && vat.va_fileid == vatroot.va_fileid) { error = ENOENT; goto bad; } } if (sgp == NULL) *pbuf = buf; else { sz = &ptr[len] - buf; *pbuf = stackgap_alloc(sgp, sz + 1); error = copyout(buf, *pbuf, sz); free(buf, M_TEMP); } vrele(nd.ni_vp); if (!cflag) vrele(ndroot.ni_vp); return error; bad: vrele(ndroot.ni_vp); vrele(nd.ni_vp); free(buf, M_TEMP); return error; } int osf1_open(td, uap) struct thread *td; struct osf1_open_args *uap; { struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ a; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&a, path) = SCARG(uap, path); SCARG(&a, flags) = SCARG(uap, flags); /* XXX translate */ SCARG(&a, mode) = SCARG(uap, mode); return open(td, &a); } extern int totalphysmem; int osf1_getsysinfo(td, uap) struct thread *td; struct osf1_getsysinfo_args *uap; { int error, retval; int ncpus = 1; /* XXX until SMP */ int ophysmem; int unit; long percpu; long proctype; struct osf1_cpu_info cpuinfo; error = retval = 0; switch(uap->op) { case OSF_GET_MAX_UPROCS: error = copyout(&maxprocperuid, uap->buffer, sizeof(maxprocperuid)); retval = 1; break; case OSF_GET_PHYSMEM: ophysmem = totalphysmem * (PAGE_SIZE >> 10); error = copyout(&ophysmem, uap->buffer, sizeof(ophysmem)); retval = 1; break; case OSF_GET_MAX_CPU: case OSF_GET_CPUS_IN_BOX: error = copyout(&ncpus, uap->buffer, sizeof(ncpus)); retval = 1; break; case OSF_GET_IEEE_FP_CONTROL: error = copyout(&td->td_pcb->pcb_fp_control,uap->buffer, sizeof(td->td_pcb->pcb_fp_control)); retval = 1; break; case OSF_GET_CPU_INFO: if (uap->nbytes < sizeof(cpuinfo)) error = EINVAL; else { bzero(&cpuinfo, sizeof(cpuinfo)); unit = alpha_pal_whami(); cpuinfo.current_cpu = 
unit; cpuinfo.cpus_in_box = ncpus; cpuinfo.cpu_type = LOCATE_PCS(hwrpb, unit)->pcs_proc_type; cpuinfo.ncpus = ncpus; cpuinfo.cpus_present = ncpus; cpuinfo.cpus_running = ncpus; cpuinfo.cpu_binding = 1; cpuinfo.cpu_ex_binding = 0; cpuinfo.mhz = hwrpb->rpb_cc_freq / 1000000; error = copyout(&cpuinfo, uap->buffer, sizeof(cpuinfo)); retval = 1; } break; case OSF_GET_PROC_TYPE: if(uap->nbytes < sizeof(proctype)) error = EINVAL; else { unit = alpha_pal_whami(); proctype = LOCATE_PCS(hwrpb, unit)->pcs_proc_type; error = copyout (&proctype, uap->buffer, sizeof(percpu)); retval = 1; } break; case OSF_GET_HWRPB: { /* note -- osf/1 doesn't have rpb_tbhint[8] */ unsigned long rpb_size; rpb_size = (unsigned long)&hwrpb->rpb_tbhint - (unsigned long)hwrpb; if(uap->nbytes < rpb_size){ uprintf("nbytes = %ld, sizeof(struct rpb) = %ld\n", uap->nbytes, rpb_size); error = EINVAL; } else { error = copyout(hwrpb, uap->buffer, rpb_size); retval = 1; } } break; case OSF_GET_PLATFORM_NAME: error = copyout(platform.model, uap->buffer, strlen(platform.model)); retval = 1; break; default: printf("osf1_getsysinfo called with unknown op=%ld\n", uap->op); return EINVAL; } td->td_retval[0] = retval; return(error); } int osf1_setsysinfo(td, uap) struct thread *td; struct osf1_setsysinfo_args *uap; { int error; error = 0; switch(uap->op) { case OSF_SET_IEEE_FP_CONTROL: { u_int64_t temp, *fp_control; if ((error = copyin(uap->buffer, &temp, sizeof(temp)))) break; fp_control = &td->td_pcb->pcb_fp_control; *fp_control = temp & IEEE_TRAP_ENABLE_MASK; break; } default: uprintf("osf1_setsysinfo called with op=%ld\n", uap->op); /*error = EINVAL;*/ } return (error); } int osf1_getrlimit(td, uap) struct thread *td; struct osf1_getrlimit_args *uap; { struct __getrlimit_args /* { syscallarg(u_int) which; syscallarg(struct rlimit *) rlp; } */ a; if (SCARG(uap, which) >= OSF1_RLIMIT_NLIMITS) return (EINVAL); if (SCARG(uap, which) <= OSF1_RLIMIT_LASTCOMMON) SCARG(&a, which) = SCARG(uap, which); else if (SCARG(uap, 
which) == OSF1_RLIMIT_NOFILE) SCARG(&a, which) = RLIMIT_NOFILE; else return (0); SCARG(&a, rlp) = (struct rlimit *)SCARG(uap, rlp); return getrlimit(td, &a); } int osf1_setrlimit(td, uap) struct thread *td; struct osf1_setrlimit_args *uap; { struct __setrlimit_args /* { syscallarg(u_int) which; syscallarg(struct rlimit *) rlp; } */ a; if (SCARG(uap, which) >= OSF1_RLIMIT_NLIMITS) return (EINVAL); if (SCARG(uap, which) <= OSF1_RLIMIT_LASTCOMMON) SCARG(&a, which) = SCARG(uap, which); else if (SCARG(uap, which) == OSF1_RLIMIT_NOFILE) SCARG(&a, which) = RLIMIT_NOFILE; else return (0); SCARG(&a, rlp) = (struct rlimit *)SCARG(uap, rlp); return setrlimit(td, &a); } /* * As linux says, this is a total guess. */ int osf1_set_program_attributes(td, uap) struct thread *td; struct osf1_set_program_attributes_args *uap; { struct vmspace *vm = td->td_proc->p_vmspace; vm->vm_taddr = (caddr_t)uap->text_start; vm->vm_tsize = btoc(round_page(uap->text_len)); vm->vm_daddr = (caddr_t)uap->bss_start; vm->vm_dsize = btoc(round_page(uap->bss_len)); return(KERN_SUCCESS); } int osf1_mmap(td, uap) struct thread *td; struct osf1_mmap_args *uap; { struct mmap_args /* { syscallarg(caddr_t) addr; syscallarg(size_t) len; syscallarg(int) prot; syscallarg(int) flags; syscallarg(int) fd; syscallarg(long) pad; syscallarg(off_t) pos; } */ a; int retval; vm_map_t map; vm_offset_t addr, len, newaddr; GIANT_REQUIRED; SCARG(&a, addr) = SCARG(uap, addr); SCARG(&a, len) = SCARG(uap, len); SCARG(&a, prot) = SCARG(uap, prot); SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, pad) = 0; SCARG(&a, pos) = SCARG(uap, pos); SCARG(&a, flags) = 0; /* * OSF/1's mmap, unlike FreeBSD's, does its best to map memory at the * user's requested address, even if MAP_FIXED is not set. Here we * try to replicate this behaviour as much as we can because some * applications (like /sbin/loader) depend on having things put as * close to where they've requested as possible. 
*/ if (SCARG(uap, addr) != NULL) addr = round_page((vm_offset_t)SCARG(&a,addr)); else /* * Try to use the apparent OSF/1 default placement of 0x10000 for * NULL addrs, this helps to prevent non-64 bit clean binaries from * SEGV'ing. */ addr = round_page((vm_offset_t)0x10000UL); len = (vm_offset_t)SCARG(&a, len); map = &td->td_proc->p_vmspace->vm_map; if (!vm_map_findspace(map, addr, len, &newaddr)) { SCARG(&a,addr) = (caddr_t) newaddr; SCARG(&a, flags) |= (MAP_FIXED); } #ifdef DEBUG else uprintf("osf1_mmap:vm_map_findspace failed for: %p 0x%lx\n", (caddr_t)addr, len); #endif if (SCARG(uap, flags) & OSF1_MAP_SHARED) SCARG(&a, flags) |= MAP_SHARED; if (SCARG(uap, flags) & OSF1_MAP_PRIVATE) SCARG(&a, flags) |= MAP_PRIVATE; switch (SCARG(uap, flags) & OSF1_MAP_TYPE) { case OSF1_MAP_ANONYMOUS: SCARG(&a, flags) |= MAP_ANON; break; case OSF1_MAP_FILE: SCARG(&a, flags) |= MAP_FILE; break; default: return (EINVAL); } if (SCARG(uap, flags) & OSF1_MAP_FIXED) SCARG(&a, flags) |= MAP_FIXED; if (SCARG(uap, flags) & OSF1_MAP_HASSEMAPHORE) SCARG(&a, flags) |= MAP_HASSEMAPHORE; if (SCARG(uap, flags) & OSF1_MAP_INHERIT) return (EINVAL); if (SCARG(uap, flags) & OSF1_MAP_UNALIGNED) return (EINVAL); /* * Emulate an osf/1 bug: Apparently, mmap'ed segments are always * readable even if the user doesn't or in PROT_READ. This causes * some buggy programs to segv. 
*/ SCARG(&a, prot) |= PROT_READ; retval = mmap(td, &a); #ifdef DEBUG uprintf( "\nosf1_mmap: addr=%p (%p), len = 0x%lx, prot=0x%x, fd=%d, pad=0, pos=0x%lx", SCARG(uap, addr), SCARG(&a, addr),SCARG(uap, len), SCARG(uap, prot), SCARG(uap, fd), SCARG(uap, pos)); printf(" flags = 0x%x\n",SCARG(uap, flags)); #endif return (retval); } int osf1_msync(td, uap) struct thread *td; struct osf1_msync_args *uap; { struct msync_args a; a.addr = SCARG(uap, addr); a.len = SCARG(uap, len); a.flags = 0; if(SCARG(uap, flags) & OSF1_MS_ASYNC) SCARG(&a, flags) |= MS_ASYNC; if(SCARG(uap, flags) & OSF1_MS_SYNC) SCARG(&a, flags) |= MS_SYNC; if(SCARG(uap, flags) & OSF1_MS_INVALIDATE) SCARG(&a, flags) |= MS_INVALIDATE; return(msync(td, &a)); } struct osf1_stat { int32_t st_dev; u_int32_t st_ino; u_int32_t st_mode; u_int16_t st_nlink; u_int32_t st_uid; u_int32_t st_gid; int32_t st_rdev; u_int64_t st_size; int32_t st_atime_sec; int32_t st_spare1; int32_t st_mtime_sec; int32_t st_spare2; int32_t st_ctime_sec; int32_t st_spare3; u_int32_t st_blksize; int32_t st_blocks; u_int32_t st_flags; u_int32_t st_gen; }; /* * Get file status; this version follows links. */ /* ARGSUSED */ int osf1_stat(td, uap) struct thread *td; struct osf1_stat_args *uap; { int error; struct stat sb; struct osf1_stat osb; struct nameidata nd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd))) return (error); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat2osf1(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. 
*/ /* ARGSUSED */ int osf1_lstat(td, uap) struct thread *td; register struct osf1_lstat_args *uap; { struct stat sb; struct osf1_stat osb; int error; struct nameidata nd; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd))) return (error); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat2osf1(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Return status information about a file descriptor. */ int osf1_fstat(td, uap) struct thread *td; register struct osf1_fstat_args *uap; { - register struct filedesc *fdp; register struct file *fp; struct stat ub; struct osf1_stat oub; int error; - fdp = td->td_proc->p_fd; - if ((unsigned)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return (EBADF); error = fo_stat(fp, &ub, td); + fdrop(fp, td); cvtstat2osf1(&ub, &oub); if (error == 0) error = copyout((caddr_t)&oub, (caddr_t)SCARG(uap, sb), sizeof (oub)); return (error); } #if 1 #define bsd2osf_dev(dev) (umajor(dev) << 20 | uminor(dev)) #define osf2bsd_dev(dev) umakedev((umajor(dev) >> 20) & 0xfff, uminor(dev) & 0xfffff) #else #define minor(x) ((int)((x)&0xffff00ff)) #define major(x) ((int)(((u_int)(x) >> 8)&0xff)) #define makedev(x,y) ((dev_t)(((x) << 8) | (y))) #define bsd2osf_dev(dev) (major(dev) << 20 | minor(dev)) #define osf2bsd_dev(dev) makedev(((dev) >> 20) & 0xfff, (dev) & 0xfffff) #endif /* * Convert from a stat structure to an osf1 stat structure. */ static void cvtstat2osf1(st, ost) struct stat *st; struct osf1_stat *ost; { ost->st_dev = bsd2osf_dev(st->st_dev); ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid == -2 ? (u_int16_t) -2 : st->st_uid; ost->st_gid = st->st_gid == -2 ? 
(u_int16_t) -2 : st->st_gid; ost->st_rdev = bsd2osf_dev(st->st_rdev); ost->st_size = st->st_size; ost->st_atime_sec = st->st_atime; ost->st_spare1 = 0; ost->st_mtime_sec = st->st_mtime; ost->st_spare2 = 0; ost->st_ctime_sec = st->st_ctime; ost->st_spare3 = 0; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } int osf1_mknod(td, uap) struct thread *td; struct osf1_mknod_args *uap; { #if notanymore struct mknod_args a; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&a, path) = SCARG(uap, path); SCARG(&a, mode) = SCARG(uap, mode); SCARG(&a, dev) = osf2bsd_dev(SCARG(uap, dev)); return mknod(td, &a); #endif printf("osf1_mknod no longer implemented\n"); return ENOSYS; } int osf1_access(td, uap) struct thread *td; struct osf1_access_args *uap; { caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); return access(td, (struct access_args *)uap); } struct osf1_flock { short l_type; short l_whence; off_t l_start; off_t l_len; pid_t l_pid; }; int osf1_fcntl(td, uap) struct thread *td; struct osf1_fcntl_args *uap; { int error; long tmp; caddr_t oarg, sg; struct fcntl_args a; struct osf1_flock osf_flock; struct flock bsd_flock; struct flock *nflock; error = 0; switch (SCARG(uap, cmd)) { case F_SETFL: SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, cmd) = F_SETFL; /* need to translate flags here */ tmp = 0; if ((long)SCARG(uap, arg) & OSF1_FNONBLOCK) tmp |= FNONBLOCK; if ((long)SCARG(uap, arg) & OSF1_FAPPEND) tmp |= FAPPEND; if ((long)SCARG(uap, arg) & OSF1_FDEFER) tmp |= FDEFER; if ((long)SCARG(uap, arg) & OSF1_FASYNC) tmp |= FASYNC; if ((long)SCARG(uap, arg) & OSF1_FCREAT) tmp |= O_CREAT; if ((long)SCARG(uap, arg) & OSF1_FTRUNC) tmp |= O_TRUNC; if ((long)SCARG(uap, arg) & OSF1_FEXCL) tmp |= O_EXCL; if ((long)SCARG(uap, arg) & OSF1_FNDELAY) tmp |= FNDELAY; if ((long)SCARG(uap, arg) & OSF1_FSYNC) tmp |= FFSYNC; SCARG(&a, arg) = tmp; error = fcntl(td, &a); break; case 
F_SETLK: case F_SETLKW: case F_GETLK: /* * The OSF/1 flock stucture has a different order than * the BSD one, but all else is the same. We must * reorder the one we've gotten so that flock() groks it. */ if ((error = copyin(uap->arg, &osf_flock, sizeof(osf_flock)))) return error; bsd_flock.l_type = osf_flock.l_type; bsd_flock.l_whence = osf_flock.l_whence; bsd_flock.l_start = osf_flock.l_start; bsd_flock.l_len = osf_flock.l_len; bsd_flock.l_pid = osf_flock.l_pid; sg = stackgap_init(); nflock = stackgap_alloc(&sg, sizeof(struct flock)); if ((error = copyout(&bsd_flock, nflock, sizeof(bsd_flock))) != 0) return error; oarg = uap->arg; uap->arg = nflock; error = fcntl(td, (struct fcntl_args *) uap); /* if (error) { printf("fcntl called with cmd=%d, args=0x%lx\n returns %d\n",uap->cmd,(long)uap->arg,error); printf("bsd_flock.l_type = 0x%x\n", bsd_flock.l_type); printf("bsd_flock.l_whence = 0x%x\n", bsd_flock.l_whence); printf("bsd_flock.l_start = 0x%lx\n", bsd_flock.l_start); printf("bsd_flock.l_len = 0x%lx\n", bsd_flock.l_len); printf("bsd_flock.l_pid = 0x%x\n", bsd_flock.l_pid); } */ if ((uap->cmd == F_GETLK) && !error) { osf_flock.l_type = F_UNLCK; if ((error = copyout(&osf_flock, oarg, sizeof(osf_flock)))) return error; } break; default: error = fcntl(td, (struct fcntl_args *) uap); if ((uap->cmd == OSF1_F_GETFL) && !error ) { tmp = td->td_retval[0] & O_ACCMODE; if (td->td_retval[0] & FNONBLOCK) tmp |= OSF1_FNONBLOCK; if (td->td_retval[0] & FAPPEND) tmp |= OSF1_FAPPEND; if (td->td_retval[0] & FDEFER) tmp |= OSF1_FDEFER; if (td->td_retval[0] & FASYNC) tmp |= OSF1_FASYNC; if (td->td_retval[0] & O_CREAT) tmp |= OSF1_FCREAT; if (td->td_retval[0] & O_TRUNC) tmp |= OSF1_FTRUNC; if (td->td_retval[0] & O_EXCL) tmp |= OSF1_FEXCL; if (td->td_retval[0] & FNDELAY) tmp |= OSF1_FNDELAY; if (td->td_retval[0] & FFSYNC) tmp |= OSF1_FSYNC; td->td_retval[0] = tmp; } } return (error); } #if 0 int osf1_fcntl(td, uap) struct thread *td; struct osf1_fcntl_args *uap; { struct fcntl_args a; 
long tmp; int error; SCARG(&a, fd) = SCARG(uap, fd); switch (SCARG(uap, cmd)) { case OSF1_F_DUPFD: SCARG(&a, cmd) = F_DUPFD; SCARG(&a, arg) = (long)SCARG(uap, arg); break; case OSF1_F_GETFD: SCARG(&a, cmd) = F_GETFD; SCARG(&a, arg) = (long)SCARG(uap, arg); break; case OSF1_F_SETFD: SCARG(&a, cmd) = F_SETFD; SCARG(&a, arg) = (long)SCARG(uap, arg); break; case OSF1_F_GETFL: SCARG(&a, cmd) = F_GETFL; SCARG(&a, arg) = (long)SCARG(uap, arg); /* ignored */ break; case OSF1_F_SETFL: SCARG(&a, cmd) = F_SETFL; tmp = 0; if ((long)SCARG(uap, arg) & OSF1_FAPPEND) tmp |= FAPPEND; if ((long)SCARG(uap, arg) & OSF1_FNONBLOCK) tmp |= FNONBLOCK; if ((long)SCARG(uap, arg) & OSF1_FASYNC) tmp |= FASYNC; if ((long)SCARG(uap, arg) & OSF1_FSYNC) tmp |= FFSYNC; SCARG(&a, arg) = tmp; break; default: /* XXX other cases */ return (EINVAL); } error = fcntl(td, &a); if (error) return error; switch (SCARG(uap, cmd)) { case OSF1_F_GETFL: /* XXX */ break; } return error; } #endif int osf1_socket(td, uap) struct thread *td; struct osf1_socket_args *uap; { struct socket_args a; if (SCARG(uap, type) > AF_LINK) return (EINVAL); /* XXX After AF_LINK, divergence. 
*/ SCARG(&a, domain) = SCARG(uap, domain); SCARG(&a, type) = SCARG(uap, type); SCARG(&a, protocol) = SCARG(uap, protocol); return socket(td, &a); } int osf1_sendto(td, uap) struct thread *td; register struct osf1_sendto_args *uap; { struct sendto_args a; if (SCARG(uap, flags) & ~0x7f) /* unsupported flags */ return (EINVAL); SCARG(&a, s) = SCARG(uap, s); SCARG(&a, buf) = SCARG(uap, buf); SCARG(&a, len) = SCARG(uap, len); SCARG(&a, flags) = SCARG(uap, flags); SCARG(&a, to) = (caddr_t)SCARG(uap, to); SCARG(&a, tolen) = SCARG(uap, tolen); return sendto(td, &a); } int osf1_reboot(td, uap) struct thread *td; struct osf1_reboot_args *uap; { struct reboot_args a; if (SCARG(uap, opt) & ~OSF1_RB_ALLFLAGS && SCARG(uap, opt) & (OSF1_RB_ALTBOOT|OSF1_RB_UNIPROC)) return (EINVAL); SCARG(&a, opt) = 0; if (SCARG(uap, opt) & OSF1_RB_ASKNAME) SCARG(&a, opt) |= RB_ASKNAME; if (SCARG(uap, opt) & OSF1_RB_SINGLE) SCARG(&a, opt) |= RB_SINGLE; if (SCARG(uap, opt) & OSF1_RB_NOSYNC) SCARG(&a, opt) |= RB_NOSYNC; if (SCARG(uap, opt) & OSF1_RB_HALT) SCARG(&a, opt) |= RB_HALT; if (SCARG(uap, opt) & OSF1_RB_INITNAME) SCARG(&a, opt) |= RB_INITNAME; if (SCARG(uap, opt) & OSF1_RB_DFLTROOT) SCARG(&a, opt) |= RB_DFLTROOT; return reboot(td, &a); } int osf1_lseek(td, uap) struct thread *td; struct osf1_lseek_args *uap; { struct lseek_args a; SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, pad) = 0; SCARG(&a, offset) = SCARG(uap, offset); SCARG(&a, whence) = SCARG(uap, whence); return lseek(td, &a); } /* * OSF/1 defines _POSIX_SAVED_IDS, which means that our normal * setuid() won't work. * * Instead, by P1003.1b-1993, setuid() is supposed to work like: * If the process has appropriate [super-user] priviledges, the * setuid() function sets the real user ID, effective user * ID, and the saved set-user-ID to uid. 
* If the process does not have appropriate priviledges, but uid * is equal to the real user ID or the saved set-user-ID, the * setuid() function sets the effective user ID to uid; the * real user ID and saved set-user-ID remain unchanged by * this function call. */ int osf1_setuid(td, uap) struct thread *td; struct osf1_setuid_args *uap; { struct proc *p; int error; uid_t uid; struct ucred *newcred, *oldcred; p = td->td_proc; uid = SCARG(uap, uid); oldcred = p->p_ucred; if ((error = suser_xxx(p->p_ucred, NULL, PRISON_ROOT)) != 0 && uid != oldcred->cr_ruid && uid != oldcred->cr_svuid) return (error); newcred = crdup(oldcred); if (error == 0) { if (uid != oldcred->cr_ruid) { change_ruid(newcred, uid); setsugid(p); } if (oldcred->cr_svuid != uid) { change_svuid(newcred, uid); setsugid(p); } } if (newcred->cr_uid != uid) { change_euid(newcred, uid); setsugid(p); } p->p_ucred = newcred; crfree(oldcred); return (0); } /* * OSF/1 defines _POSIX_SAVED_IDS, which means that our normal * setgid() won't work. * * If you change "uid" to "gid" in the discussion, above, about * setuid(), you'll get a correct description of setgid(). */ int osf1_setgid(td, uap) struct thread *td; struct osf1_setgid_args *uap; { struct proc *p; int error; gid_t gid; struct ucred *newcred, *oldcred; p = td->td_proc; gid = SCARG(uap, gid); oldcred = p->p_ucred; if (((error = suser_xxx(p->p_ucred, NULL, PRISON_ROOT)) != 0 ) && gid != oldcred->cr_rgid && gid != oldcred->cr_svgid) return (error); newcred = crdup(oldcred); if (error == 0) { if (gid != oldcred->cr_rgid) { change_rgid(newcred, gid); setsugid(p); } if (oldcred->cr_svgid != gid) { change_svgid(newcred, gid); setsugid(p); } } if (newcred->cr_groups[0] != gid) { change_egid(newcred, gid); setsugid(p); } p->p_ucred = newcred; crfree(oldcred); return (0); } /* * The structures end up being the same... but we can't be sure that * the other word of our iov_len is zero! 
*/ struct osf1_iovec { char *iov_base; int iov_len; }; #define STACKGAPLEN 400 int osf1_readv(td, uap) struct thread *td; struct osf1_readv_args *uap; { int error, osize, nsize, i; caddr_t sg; struct readv_args /* { syscallarg(int) fd; syscallarg(struct iovec *) iovp; syscallarg(u_int) iovcnt; } */ a; struct osf1_iovec *oio; struct iovec *nio; sg = stackgap_init(); if (SCARG(uap, iovcnt) > (STACKGAPLEN / sizeof (struct iovec))) return (EINVAL); osize = SCARG(uap, iovcnt) * sizeof (struct osf1_iovec); nsize = SCARG(uap, iovcnt) * sizeof (struct iovec); oio = malloc(osize, M_TEMP, M_WAITOK); nio = malloc(nsize, M_TEMP, M_WAITOK); error = 0; if ((error = copyin(SCARG(uap, iovp), oio, osize))) goto punt; for (i = 0; i < SCARG(uap, iovcnt); i++) { nio[i].iov_base = oio[i].iov_base; nio[i].iov_len = oio[i].iov_len; } SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, iovp) = stackgap_alloc(&sg, nsize); SCARG(&a, iovcnt) = SCARG(uap, iovcnt); if ((error = copyout(nio, (caddr_t)SCARG(&a, iovp), nsize))) goto punt; error = readv(td, &a); punt: free(oio, M_TEMP); free(nio, M_TEMP); return (error); } int osf1_writev(td, uap) struct thread *td; struct osf1_writev_args *uap; { int error, i, nsize, osize; caddr_t sg; struct writev_args /* { syscallarg(int) fd; syscallarg(struct iovec *) iovp; syscallarg(u_int) iovcnt; } */ a; struct osf1_iovec *oio; struct iovec *nio; sg = stackgap_init(); if (SCARG(uap, iovcnt) > (STACKGAPLEN / sizeof (struct iovec))) return (EINVAL); osize = SCARG(uap, iovcnt) * sizeof (struct osf1_iovec); nsize = SCARG(uap, iovcnt) * sizeof (struct iovec); oio = malloc(osize, M_TEMP, M_WAITOK); nio = malloc(nsize, M_TEMP, M_WAITOK); error = 0; if ((error = copyin(SCARG(uap, iovp), oio, osize))) goto punt; for (i = 0; i < SCARG(uap, iovcnt); i++) { nio[i].iov_base = oio[i].iov_base; nio[i].iov_len = oio[i].iov_len; } SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, iovp) = stackgap_alloc(&sg, nsize); SCARG(&a, iovcnt) = SCARG(uap, iovcnt); if ((error = copyout(nio, 
(caddr_t)SCARG(&a, iovp), nsize))) goto punt; error = writev(td, &a); punt: free(oio, M_TEMP); free(nio, M_TEMP); return (error); } /* * More of the stupid off_t padding! */ int osf1_truncate(td, uap) struct thread *td; struct osf1_truncate_args *uap; { caddr_t sg; struct truncate_args a; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&a, path) = SCARG(uap, path); SCARG(&a, pad) = 0; SCARG(&a, length) = SCARG(uap, length); return truncate(td, &a); } int osf1_ftruncate(td, uap) struct thread *td; struct osf1_ftruncate_args *uap; { struct ftruncate_args a; SCARG(&a, fd) = SCARG(uap, fd); SCARG(&a, pad) = 0; SCARG(&a, length) = SCARG(uap, length); return ftruncate(td, &a); } static int osf2bsd_pathconf(name) int *name; { switch (*name) { case _OSF1_PC_LINK_MAX: case _OSF1_PC_MAX_CANON: case _OSF1_PC_MAX_INPUT: case _OSF1_PC_NAME_MAX: *name -= 10; break; case _OSF1_PC_PATH_MAX: case _OSF1_PC_PIPE_BUF: *name -= 9; case _OSF1_PC_NO_TRUNC: *name = _PC_NO_TRUNC; break; case _OSF1_PC_CHOWN_RESTRICTED: *name = _PC_CHOWN_RESTRICTED; break; case _OSF1_PC_VDISABLE: *name = _PC_VDISABLE; break; default: return (EINVAL); } return 0; } int osf1_pathconf(td, uap) struct thread *td; struct osf1_pathconf_args *uap; { if (osf2bsd_pathconf(&uap->name)) return (EINVAL); else return (pathconf(td, (void *)uap)); } int osf1_fpathconf(td, uap) struct thread *td; struct osf1_fpathconf_args *uap; { if (osf2bsd_pathconf(&uap->name)) return (EINVAL); else return (fpathconf(td, (void *)uap)); } int osf1_getrusage(td, uap) struct thread *td; struct osf1_getrusage_args *uap; { struct proc *p; struct rusage *rup; struct osf1_rusage oru; p = td->td_proc; switch (uap->who) { case RUSAGE_SELF: rup = &p->p_stats->p_ru; mtx_lock_spin(&sched_lock); calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); mtx_unlock_spin(&sched_lock); break; case RUSAGE_CHILDREN: rup = &p->p_stats->p_cru; break; default: return (EINVAL); } TV_CP(rup->ru_utime, oru.ru_utime); TV_CP(rup->ru_stime, oru.ru_stime); 
bcopy(&(rup->ru_first), &(oru.ru_first), (&(oru.ru_last) - &(oru.ru_first))); return (copyout((caddr_t)&oru, (caddr_t)uap->rusage, sizeof (struct osf1_rusage))); } int osf1_wait4(td, uap) struct thread *td; struct osf1_wait4_args *uap; { int error; caddr_t sg; struct osf1_rusage *orusage, oru; struct rusage *rusage = NULL, ru; orusage = SCARG(uap, rusage); if (orusage) { sg = stackgap_init(); rusage = stackgap_alloc(&sg, sizeof(struct rusage)); SCARG(uap, rusage) = (struct osf1_rusage *)rusage; } if ((error = wait4(td, (struct wait_args *)uap))) return error; if (orusage && (error = copyin(rusage, &ru, sizeof(ru)) == 0)){ TV_CP(ru.ru_utime, oru.ru_utime); TV_CP(ru.ru_stime, oru.ru_stime); bcopy(&ru.ru_first, &oru.ru_first, (&(oru.ru_last) - &(oru.ru_first))); copyout(&oru, orusage, sizeof (struct osf1_rusage)); } return (0); } int osf1_madvise(td, uap) struct thread *td; struct osf1_madvise_args *uap; { /* XXX */ return EINVAL; } int osf1_execve(td, uap) struct thread *td; struct osf1_execve_args *uap; { caddr_t sg; struct execve_args ap; sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&ap, fname) = SCARG(uap, path); SCARG(&ap, argv) = SCARG(uap, argp); SCARG(&ap, envv) = SCARG(uap, envp); return execve(td, &ap); } int osf1_usleep_thread(td, uap) struct thread *td; struct osf1_usleep_thread_args *uap; { int error, s, timo; struct osf1_timeval time; struct timeval difftv, endtv, sleeptv, tv; if ((error = copyin(SCARG(uap, sleep), &time, sizeof time))) return (error); sleeptv.tv_sec = (u_long)time.tv_sec; sleeptv.tv_usec = (u_long)time.tv_usec; timo = tvtohz(&sleeptv); /* * Some callers use usleep(0) as a sort of thread-yield so make * sure that the timeout is non-zero. 
*/ if (timo == 0) timo = 1; s = splclock(); microtime(&tv); splx(s); tsleep(td, PUSER|PCATCH, "OSF/1", timo); if (SCARG(uap, slept) != NULL) { s = splclock(); microtime(&endtv); timersub(&time, &endtv, &difftv); splx(s); if (tv.tv_sec < 0 || tv.tv_usec < 0) tv.tv_sec = tv.tv_usec = 0; TV_CP(difftv, time) error = copyout(&time, SCARG(uap, slept), sizeof time); } return (error); } int osf1_gettimeofday(td, uap) struct thread *td; register struct osf1_gettimeofday_args *uap; { int error; struct timeval atv; struct osf1_timeval otv; error = 0; if (uap->tp) { microtime(&atv); otv.tv_sec = atv.tv_sec; otv.tv_usec = atv.tv_usec; if ((error = copyout((caddr_t)&otv, (caddr_t)uap->tp, sizeof (otv)))) return (error); } if (uap->tzp) error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, sizeof (tz)); return (error); } int osf1_select(td, uap) struct thread *td; register struct osf1_select_args *uap; { if (uap->tv) { int error; caddr_t sg; struct osf1_timeval otv; struct timeval tv; sg = stackgap_init(); if ((error=copyin((caddr_t)uap->tv,(caddr_t)&otv,sizeof(otv)))) return(error); TV_CP(otv,tv); uap->tv = stackgap_alloc(&sg, sizeof(struct timeval)); if ((error=copyout((caddr_t)&tv, (caddr_t)uap->tv,sizeof(tv)))) return(error); } return(select(td, (struct select_args *)uap)); } int osf1_setitimer(td, uap) struct thread *td; struct osf1_setitimer_args *uap; { int error; caddr_t old_oitv, sg; struct itimerval itv; struct osf1_itimerval otv; error = 0; old_oitv = (caddr_t)uap->oitv; sg = stackgap_init(); if ((error = copyin((caddr_t)uap->itv,(caddr_t)&otv,sizeof(otv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } TV_CP(otv.it_interval,itv.it_interval); TV_CP(otv.it_value,itv.it_value); uap->itv = stackgap_alloc(&sg, sizeof(struct itimerval)); if ((error = copyout((caddr_t)&itv,(caddr_t)uap->itv,sizeof(itv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } uap->oitv = stackgap_alloc(&sg, sizeof(struct itimerval)); if ((error 
= setitimer(td, (struct setitimer_args *)uap))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } if ((error = copyin((caddr_t)uap->oitv,(caddr_t)&itv,sizeof(itv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } TV_CP(itv.it_interval,otv.it_interval); TV_CP(itv.it_value,otv.it_value); if (old_oitv && (error = copyout((caddr_t)&otv, old_oitv, sizeof(otv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); } return error; } int osf1_getitimer(td, uap) struct thread *td; struct osf1_getitimer_args *uap; { int error; caddr_t old_itv, sg; struct itimerval itv; struct osf1_itimerval otv; error = 0; old_itv = (caddr_t)uap->itv; sg = stackgap_init(); uap->itv = stackgap_alloc(&sg, sizeof(struct itimerval)); if ((error = getitimer(td, (struct getitimer_args *)uap))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } if ((error = copyin((caddr_t)uap->itv,(caddr_t)&itv,sizeof(itv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); return error; } TV_CP(itv.it_interval,otv.it_interval); TV_CP(itv.it_value,otv.it_value); if ((error = copyout((caddr_t)&otv, old_itv, sizeof(otv)))) { printf("%s(%d): error = %d\n", __FILE__, __LINE__, error); } return error; } int osf1_proplist_syscall(td, uap) struct thread *td; struct osf1_proplist_syscall_args *uap; { return(EOPNOTSUPP); } int osf1_ntpgettime(td, uap) struct thread *td; struct osf1_ntpgettime_args *uap; { return(ENOSYS); } int osf1_ntpadjtime(td, uap) struct thread *td; struct osf1_ntpadjtime_args *uap; { return(ENOSYS); } int osf1_setpgrp(td, uap) struct thread *td; struct osf1_setpgrp_args *uap; { return(setpgid(td, (struct setpgid_args *)uap)); } int osf1_uswitch(td, uap) struct thread *td; struct osf1_uswitch_args *uap; { struct proc *p; int rv; vm_map_entry_t entry; vm_offset_t zero; GIANT_REQUIRED; p = td->td_proc; zero = 0; if (uap->cmd == OSF1_USC_GET) { if (vm_map_lookup_entry(&(p->p_vmspace->vm_map), 0, 
&entry)) td->td_retval[0] = OSF1_USW_NULLP; else td->td_retval[0] = 0; return(KERN_SUCCESS); } else if (uap->cmd == OSF1_USC_SET) if (uap->mask & OSF1_USW_NULLP) { rv = vm_mmap(&(p->p_vmspace->vm_map), &zero, PAGE_SIZE, VM_PROT_READ, VM_PROT_ALL, MAP_PRIVATE | MAP_FIXED | MAP_ANON, NULL, 0); if (!rv) return(KERN_SUCCESS); else { printf( "osf1_uswitch:vm_mmap of zero page failed with status %d\n", rv); return(rv); } } return(EINVAL); } int osf1_classcntl(td, uap) struct thread *td; struct osf1_classcntl_args *uap; { return(EACCES); /* class scheduling not enabled */ } struct osf1_tbl_loadavg { union { long l[3]; double d[3]; } tl_avenrun; int tl_lscale; long tl_mach_factor[3]; /* ???? */ }; struct osf1_tbl_sysinfo { long si_user; long si_nice; long si_sys; long si_idle; long si_hz; long si_phz; long si_boottime; long wait; }; #define TBL_LOADAVG 3 #define TBL_SYSINFO 12 int osf1_table(td, uap) struct thread *td; struct osf1_table_args /*{ long id; long index; void *addr; long nel; u_long lel; }*/ *uap; { int retval; struct osf1_tbl_loadavg ld; struct osf1_tbl_sysinfo si; retval = 0; switch(uap->id) { case TBL_LOADAVG: /* xemacs wants this */ if ((uap->index != 0) || (uap->nel != 1)) retval = EINVAL; bcopy(&averunnable, &ld, sizeof(averunnable)); ld.tl_lscale = (u_int)averunnable.fscale; retval = copyout(&ld, uap->addr, sizeof(ld)); break; case TBL_SYSINFO: if ((uap->index != 0) || (uap->nel != 1)) retval = EINVAL; bzero(&si, sizeof(si)); #if 0 si.si_user = cp_time[CP_USER]; si.si_nice = cp_time[CP_NICE]; si.si_sys = cp_time[CP_SYS]; si.si_idle = cp_time[CP_IDLE]; si.wait = cp_time[CP_INTR]; #endif si.si_hz = hz; si.si_phz = profhz; si.si_boottime = boottime.tv_sec; retval = copyout(&si, uap->addr, sizeof(si)); break; default: printf("osf1_table: %ld, %ld, %p, %ld %ld\n", uap->id, uap->index, uap->addr, uap->nel, uap->lel); retval = EINVAL; } return retval; } int osf1_sysinfo(td, uap) struct thread *td; struct osf1_sysinfo_args /*{ int cmd; char *buf; long count; }*/ 
*uap; { int name[2], retval; size_t bytes, len; char *string; string = NULL; switch(uap->cmd) { case 1: /* OS */ string = "OSF1"; break; case 2: /* hostname, from ogethostname */ len = uap->count; name[0] = CTL_KERN; name[1] = KERN_HOSTNAME; retval = userland_sysctl(td, name, 2, uap->buf, &len, 1, 0, 0, &bytes); td->td_retval[0] = bytes; return(retval); break; case 3: /* release of osf1 */ string = "V4.0"; break; case 4: /* minor version of osf1 */ string = "878"; break; case 5: /* machine or arch */ case 6: string = "alpha"; break; case 7: /* serial number, real osf1 returns 0! */ string = "0"; break; case 8: /* HW vendor */ string = "Digital"; break; case 9: /* dunno, this is what du does.. */ return(ENOSYS); break; default: return(EINVAL); } bytes = min(uap->count, strlen(string)+1); copyout(string, uap->buf, bytes); td->td_retval[0] = bytes; return(0); } Index: head/sys/alpha/osf1/osf1_mount.c =================================================================== --- head/sys/alpha/osf1/osf1_mount.c (revision 89305) +++ head/sys/alpha/osf1/osf1_mount.c (revision 89306) @@ -1,383 +1,385 @@ /* $NetBSD: osf1_mount.c,v 1.7 1998/05/20 16:34:29 chs Exp $ */ /* * Copyright (c) 1994, 1995 Carnegie-Mellon University. * All rights reserved. * * Author: Chris G. Demetriou * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
* * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * Additional Copyright (c) 1999 by Andrew Gallatin * $FreeBSD$ */ #include "opt_nfs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void bsd2osf_statfs __P((struct statfs *, struct osf1_statfs *)); int osf1_mount_mfs __P((struct thread *, struct osf1_mount_args *, struct mount_args *)); int osf1_mount_nfs __P((struct thread *, struct osf1_mount_args *, struct mount_args *)); #ifdef notanymore static const char *fsnames[OSF1_MOUNT_MAXTYPE+2] = INITMOUNTNAMES; #endif void bsd2osf_statfs(bsfs, osfs) struct statfs *bsfs; struct osf1_statfs *osfs; { #ifdef notanymore bzero(osfs, sizeof (struct osf1_statfs)); if (!strncmp(fsnames[MOUNT_UFS], bsfs->f_fstypename, MFSNAMELEN)) osfs->f_type = OSF1_MOUNT_UFS; else if (!strncmp(fsnames[MOUNT_NFS], bsfs->f_fstypename, MFSNAMELEN)) osfs->f_type = OSF1_MOUNT_NFS; else if (!strncmp(fsnames[MOUNT_MFS], bsfs->f_fstypename, MFSNAMELEN)) osfs->f_type = OSF1_MOUNT_MFS; else /* uh oh... XXX = PC, CDFS, PROCFS, etc. 
*/ osfs->f_type = OSF1_MOUNT_ADDON; osfs->f_flags = bsfs->f_flags; /* XXX translate */ osfs->f_fsize = bsfs->f_bsize; osfs->f_bsize = bsfs->f_iosize; osfs->f_blocks = bsfs->f_blocks; osfs->f_bfree = bsfs->f_bfree; osfs->f_bavail = bsfs->f_bavail; osfs->f_files = bsfs->f_files; osfs->f_ffree = bsfs->f_ffree; bcopy(&bsfs->f_fsid, &osfs->f_fsid, max(sizeof bsfs->f_fsid, sizeof osfs->f_fsid)); /* osfs->f_spare zeroed above */ bcopy(bsfs->f_mntonname, osfs->f_mntonname, max(sizeof bsfs->f_mntonname, sizeof osfs->f_mntonname)); bcopy(bsfs->f_mntfromname, osfs->f_mntfromname, max(sizeof bsfs->f_mntfromname, sizeof osfs->f_mntfromname)); /* XXX osfs->f_xxx should be filled in... */ #endif } int osf1_statfs(td, uap) struct thread *td; struct osf1_statfs_args *uap; { int error; struct mount *mp; struct statfs *sp; struct osf1_statfs osfs; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd))) return (error); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; vrele(nd.ni_vp); if ((error = VFS_STATFS(mp, sp, td))) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; bsd2osf_statfs(sp, &osfs); return copyout(&osfs, SCARG(uap, buf), min(sizeof osfs, SCARG(uap, len))); } int osf1_fstatfs(td, uap) struct thread *td; struct osf1_fstatfs_args *uap; { int error; struct file *fp; struct mount *mp; struct statfs *sp; struct osf1_statfs osfs; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp))) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; sp = &mp->mnt_stat; - if ((error = VFS_STATFS(mp, sp, td))) + error = VFS_STATFS(mp, sp, td); + fdrop(fp, td); + if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; bsd2osf_statfs(sp, &osfs); return copyout(&osfs, SCARG(uap, buf), min(sizeof osfs, SCARG(uap, len))); } int osf1_getfsstat(td, uap) struct thread *td; register struct osf1_getfsstat_args *uap; { long count, error, maxcount; caddr_t osf_sfsp; struct mount *mp, *nmp; struct statfs *sp; 
struct osf1_statfs osfs; if (SCARG(uap, flags) & ~OSF1_GETFSSTAT_FLAGS) return (EINVAL); maxcount = SCARG(uap, bufsize) / sizeof(struct osf1_statfs); osf_sfsp = (caddr_t)SCARG(uap, buf); for (count = 0, mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { nmp = TAILQ_NEXT(mp, mnt_list); if (osf_sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * If OSF1_MNT_NOWAIT is specified, do not refresh the * fsstat cache. OSF1_MNT_WAIT overrides * OSF1_MNT_NOWAIT. */ if (((SCARG(uap, flags) & OSF1_MNT_NOWAIT) == 0 || (SCARG(uap, flags) & OSF1_MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) continue; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; bsd2osf_statfs(sp, &osfs); if ((error = copyout(&osfs, osf_sfsp, sizeof (struct osf1_statfs)))) return (error); osf_sfsp += sizeof (struct osf1_statfs); } count++; } if (osf_sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } int osf1_unmount(td, uap) struct thread *td; struct osf1_unmount_args *uap; { struct unmount_args a; SCARG(&a, path) = SCARG(uap, path); if (SCARG(uap, flags) & ~OSF1_UNMOUNT_FLAGS) return (EINVAL); SCARG(&a, flags) = 0; if ((SCARG(uap, flags) & OSF1_MNT_FORCE) && (SCARG(uap, flags) & OSF1_MNT_NOFORCE) == 0) SCARG(&a, flags) |= MNT_FORCE; return unmount(td, &a); } int osf1_mount(td, uap) struct thread *td; struct osf1_mount_args *uap; { int error; struct mount_args a; SCARG(&a, path) = SCARG(uap, path); if (SCARG(uap, flags) & ~OSF1_MOUNT_FLAGS) return (EINVAL); SCARG(&a, flags) = SCARG(uap, flags); /* XXX - xlate */ switch (SCARG(uap, type)) { case OSF1_MOUNT_UFS: /* XXX */ return (EINVAL); break; case OSF1_MOUNT_NFS: /* XXX */ if ((error = osf1_mount_nfs(td, uap, &a))) return error; break; case OSF1_MOUNT_MFS: /* XXX */ #ifdef notyet if ((error = osf1_mount_mfs(td, uap, &a))) return error; #endif return EINVAL; break; case OSF1_MOUNT_CDFS: /* XXX */ return (EINVAL); break; case OSF1_MOUNT_PROCFS: /* XXX */ return (EINVAL); break; case OSF1_MOUNT_NONE: case 
OSF1_MOUNT_PC: case OSF1_MOUNT_S5FS: case OSF1_MOUNT_DFS: case OSF1_MOUNT_EFS: case OSF1_MOUNT_MSFS: case OSF1_MOUNT_FFM: case OSF1_MOUNT_FDFS: case OSF1_MOUNT_ADDON: default: return (EINVAL); } return mount(td, &a); } int osf1_mount_mfs(td, osf_argp, bsd_argp) struct thread *td; struct osf1_mount_args *osf_argp; struct mount_args *bsd_argp; { #ifdef notyet int error, len; caddr_t sg; static const char mfs_name[] = "mfs"; struct osf1_mfs_args osf_ma; struct mfs_args bsd_ma; sg = stackgap_init(); if ((error = copyin(SCARG(osf_argp, data), &osf_ma, sizeof osf_ma))) return error; bzero(&bsd_ma, sizeof bsd_ma); bsd_ma.fspec = osf_ma.name; /* XXX export args */ bsd_ma.base = osf_ma.base; bsd_ma.size = osf_ma.size; SCARG(bsd_argp, data) = stackgap_alloc(&sg, sizeof bsd_ma); if ((error = copyout(&bsd_ma, SCARG(bsd_argp, data), sizeof bsd_ma))) return error; len = strlen(mfs_name) + 1; SCARG(bsd_argp, type) = stackgap_alloc(&sg, len); if ((error = copyout(mfs_name, (void *)SCARG(bsd_argp, type), len))) return error; #endif return 0; } int osf1_mount_nfs(td, osf_argp, bsd_argp) struct thread *td; struct osf1_mount_args *osf_argp; struct mount_args *bsd_argp; { int error, len; caddr_t sg; static const char nfs_name[] = "nfs"; struct osf1_nfs_args osf_na; struct nfs_args bsd_na; sg = stackgap_init(); if ((error = copyin(SCARG(osf_argp, data), &osf_na, sizeof osf_na))) return error; bzero(&bsd_na, sizeof bsd_na); bsd_na.addr = (struct sockaddr *)osf_na.addr; bsd_na.addrlen = sizeof (struct sockaddr_in); bsd_na.sotype = SOCK_DGRAM; bsd_na.proto = 0; bsd_na.fh = osf_na.fh; if (osf_na.flags & ~OSF1_NFSMNT_FLAGS) return EINVAL; if (osf_na.flags & OSF1_NFSMNT_SOFT) bsd_na.flags |= NFSMNT_SOFT; if (osf_na.flags & OSF1_NFSMNT_WSIZE) { bsd_na.wsize = osf_na.wsize; bsd_na.flags |= NFSMNT_WSIZE; } if (osf_na.flags & OSF1_NFSMNT_RSIZE) { bsd_na.rsize = osf_na.rsize; bsd_na.flags |= NFSMNT_RSIZE; } if (osf_na.flags & OSF1_NFSMNT_TIMEO) { bsd_na.timeo = osf_na.timeo; bsd_na.flags |= 
NFSMNT_TIMEO; } if (osf_na.flags & OSF1_NFSMNT_RETRANS) { bsd_na.retrans = osf_na.retrans; bsd_na.flags |= NFSMNT_RETRANS; } if (osf_na.flags & OSF1_NFSMNT_HOSTNAME) bsd_na.hostname = osf_na.hostname; if (osf_na.flags & OSF1_NFSMNT_INT) bsd_na.flags |= NFSMNT_INT; if (osf_na.flags & OSF1_NFSMNT_NOCONN) bsd_na.flags |= NFSMNT_NOCONN; SCARG(bsd_argp, data) = stackgap_alloc(&sg, sizeof bsd_na); if ((error = copyout(&bsd_na, SCARG(bsd_argp, data), sizeof bsd_na))) return error; len = strlen(nfs_name) + 1; SCARG(bsd_argp, type) = stackgap_alloc(&sg, len); if ((error = copyout(nfs_name, (void *)SCARG(bsd_argp, type), len))) return error; return 0; } Index: head/sys/compat/linux/linux_file.c =================================================================== --- head/sys/compat/linux/linux_file.c (revision 89305) +++ head/sys/compat/linux/linux_file.c (revision 89306) @@ -1,1171 +1,1183 @@ /*- * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software withough specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef __alpha__ int linux_creat(struct thread *td, struct linux_creat_args *args) { struct open_args /* { char *path; int flags; int mode; } */ bsd_open_args; caddr_t sg; sg = stackgap_init(); CHECKALTCREAT(td, &sg, args->path); #ifdef DEBUG if (ldebug(creat)) printf(ARGS(creat, "%s, %d"), args->path, args->mode); #endif bsd_open_args.path = args->path; bsd_open_args.mode = args->mode; bsd_open_args.flags = O_WRONLY | O_CREAT | O_TRUNC; return open(td, &bsd_open_args); } #endif /*!__alpha__*/ int linux_open(struct thread *td, struct linux_open_args *args) { struct open_args /* { char *path; int flags; int mode; } */ bsd_open_args; struct proc *p = td->td_proc; int error; caddr_t sg; sg = stackgap_init(); if (args->flags & LINUX_O_CREAT) CHECKALTCREAT(td, &sg, args->path); else CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(open)) printf(ARGS(open, "%s, 0x%x, 0x%x"), args->path, args->flags, args->mode); #endif bsd_open_args.flags = 0; if (args->flags & LINUX_O_RDONLY) bsd_open_args.flags |= O_RDONLY; if (args->flags & LINUX_O_WRONLY) bsd_open_args.flags |= O_WRONLY; if (args->flags & LINUX_O_RDWR) bsd_open_args.flags |= O_RDWR; if (args->flags & LINUX_O_NDELAY) bsd_open_args.flags |= O_NONBLOCK; if (args->flags & LINUX_O_APPEND) 
bsd_open_args.flags |= O_APPEND; if (args->flags & LINUX_O_SYNC) bsd_open_args.flags |= O_FSYNC; if (args->flags & LINUX_O_NONBLOCK) bsd_open_args.flags |= O_NONBLOCK; if (args->flags & LINUX_FASYNC) bsd_open_args.flags |= O_ASYNC; if (args->flags & LINUX_O_CREAT) bsd_open_args.flags |= O_CREAT; if (args->flags & LINUX_O_TRUNC) bsd_open_args.flags |= O_TRUNC; if (args->flags & LINUX_O_EXCL) bsd_open_args.flags |= O_EXCL; if (args->flags & LINUX_O_NOCTTY) bsd_open_args.flags |= O_NOCTTY; bsd_open_args.path = args->path; bsd_open_args.mode = args->mode; error = open(td, &bsd_open_args); PROC_LOCK(p); if (!error && !(bsd_open_args.flags & O_NOCTTY) && SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { - struct filedesc *fdp = p->p_fd; - struct file *fp = fdp->fd_ofiles[td->td_retval[0]]; + struct file *fp; + fp = ffind_hold(td, td->td_retval[0]); PROC_UNLOCK(p); if (fp->f_type == DTYPE_VNODE) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td); + fdrop(fp, td); } else PROC_UNLOCK(p); #ifdef DEBUG if (ldebug(open)) printf(LMSG("open returns error %d"), error); #endif return error; } int linux_lseek(struct thread *td, struct linux_lseek_args *args) { struct lseek_args /* { int fd; int pad; off_t offset; int whence; } */ tmp_args; int error; #ifdef DEBUG if (ldebug(lseek)) printf(ARGS(lseek, "%d, %ld, %d"), args->fdes, (long)args->off, args->whence); #endif tmp_args.fd = args->fdes; tmp_args.offset = (off_t)args->off; tmp_args.whence = args->whence; error = lseek(td, &tmp_args); return error; } #ifndef __alpha__ int linux_llseek(struct thread *td, struct linux_llseek_args *args) { struct lseek_args bsd_args; int error; off_t off; #ifdef DEBUG if (ldebug(llseek)) printf(ARGS(llseek, "%d, %d:%d, %d"), args->fd, args->ohigh, args->olow, args->whence); #endif off = (args->olow) | (((off_t) args->ohigh) << 32); bsd_args.fd = args->fd; bsd_args.offset = off; bsd_args.whence = args->whence; if ((error = lseek(td, &bsd_args))) return error; if ((error = copyout(td->td_retval, 
(caddr_t)args->res, sizeof (off_t)))) return error; td->td_retval[0] = 0; return 0; } #endif /*!__alpha__*/ #ifndef __alpha__ int linux_readdir(struct thread *td, struct linux_readdir_args *args) { struct linux_getdents_args lda; lda.fd = args->fd; lda.dent = args->dent; lda.count = 1; return linux_getdents(td, &lda); } #endif /*!__alpha__*/ /* * Note that linux_getdents(2) and linux_getdents64(2) have the same * arguments. They only differ in the definition of struct dirent they * operate on. We use this to common the code, with the exception of * accessing struct dirent. Note that linux_readdir(2) is implemented * by means of linux_getdents(2). In this case we never operate on * struct dirent64 and thus don't need to handle it... */ struct l_dirent { l_long d_ino; l_off_t d_off; l_ushort d_reclen; char d_name[LINUX_NAME_MAX + 1]; }; struct l_dirent64 { uint64_t d_ino; int64_t d_off; l_ushort d_reclen; u_char d_type; char d_name[LINUX_NAME_MAX + 1]; }; #define LINUX_RECLEN(de,namlen) \ ALIGN((((char *)&(de)->d_name - (char *)de) + (namlen) + 1)) #define LINUX_DIRBLKSIZ 512 static int getdents_common(struct thread *td, struct linux_getdents64_args *args, int is64bit) { register struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* Linux-format */ int resid, linuxreclen=0; /* Linux-format */ struct file *fp; struct uio auio; struct iovec aiov; struct vattr va; off_t off; struct l_dirent linux_dirent; struct l_dirent64 linux_dirent64; int buflen, error, eofflag, nbytes, justone; u_long *cookies = NULL, *cookiep; int ncookies; if ((error = getvnode(td->td_proc->p_fd, args->fd, &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *) fp->f_data; - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } - if ((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) + if 
((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) { + fdrop(fp, td); return (error); + } nbytes = args->count; if (nbytes == 1) { /* readdir(2) case. Always struct dirent. */ - if (is64bit) + if (is64bit) { + fdrop(fp, td); return (EINVAL); + } nbytes = sizeof(linux_dirent); justone = 1; } else justone = 0; off = fp->f_offset; buflen = max(LINUX_DIRBLKSIZ, nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies))) goto out; inp = buf; outp = (caddr_t)args->dirent; resid = nbytes; if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { bdp = (struct dirent *) inp; len -= bdp->d_reclen; inp += bdp->d_reclen; cookiep++; ncookies--; } } while (len > 0) { if (cookiep && ncookies == 0) break; bdp = (struct dirent *) inp; reclen = bdp->d_reclen; if (reclen & 3) { error = EFAULT; goto out; } if (bdp->d_fileno == 0) { inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; len -= reclen; continue; } linuxreclen = (is64bit) ? LINUX_RECLEN(&linux_dirent64, bdp->d_namlen) : LINUX_RECLEN(&linux_dirent, bdp->d_namlen); if (reclen > len || resid < linuxreclen) { outp++; break; } if (justone) { /* readdir(2) case. 
*/ linux_dirent.d_ino = (l_long)bdp->d_fileno; linux_dirent.d_off = (l_off_t)linuxreclen; linux_dirent.d_reclen = (l_ushort)bdp->d_namlen; strcpy(linux_dirent.d_name, bdp->d_name); error = copyout(&linux_dirent, outp, linuxreclen); } else { if (is64bit) { linux_dirent64.d_ino = bdp->d_fileno; linux_dirent64.d_off = (cookiep) ? (l_off_t)*cookiep : (l_off_t)(off + reclen); linux_dirent64.d_reclen = (l_ushort)linuxreclen; linux_dirent64.d_type = bdp->d_type; strcpy(linux_dirent64.d_name, bdp->d_name); error = copyout(&linux_dirent64, outp, linuxreclen); } else { linux_dirent.d_ino = bdp->d_fileno; linux_dirent.d_off = (cookiep) ? (l_off_t)*cookiep : (l_off_t)(off + reclen); linux_dirent.d_reclen = (l_ushort)linuxreclen; strcpy(linux_dirent.d_name, bdp->d_name); error = copyout(&linux_dirent, outp, linuxreclen); } } if (error) goto out; inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; outp += linuxreclen; resid -= linuxreclen; len -= reclen; if (justone) break; } if (outp == (caddr_t)args->dirent) goto again; fp->f_offset = off; if (justone) nbytes = resid + linuxreclen; eof: td->td_retval[0] = nbytes - resid; out: if (cookies) free(cookies, M_TEMP); VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); free(buf, M_TEMP); return (error); } int linux_getdents(struct thread *td, struct linux_getdents_args *args) { #ifdef DEBUG if (ldebug(getdents)) printf(ARGS(getdents, "%d, *, %d"), args->fd, args->count); #endif return (getdents_common(td, (struct linux_getdents64_args*)args, 0)); } int linux_getdents64(struct thread *td, struct linux_getdents64_args *args) { #ifdef DEBUG if (ldebug(getdents64)) printf(ARGS(getdents64, "%d, *, %d"), args->fd, args->count); #endif return (getdents_common(td, args, 1)); } /* * These exist mainly for hooks for doing /compat/linux translation. 
*/ int linux_access(struct thread *td, struct linux_access_args *args) { struct access_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(access)) printf(ARGS(access, "%s, %d"), args->path, args->flags); #endif bsd.path = args->path; bsd.flags = args->flags; return access(td, &bsd); } int linux_unlink(struct thread *td, struct linux_unlink_args *args) { struct unlink_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(unlink)) printf(ARGS(unlink, "%s"), args->path); #endif bsd.path = args->path; return unlink(td, &bsd); } int linux_chdir(struct thread *td, struct linux_chdir_args *args) { struct chdir_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(chdir)) printf(ARGS(chdir, "%s"), args->path); #endif bsd.path = args->path; return chdir(td, &bsd); } int linux_chmod(struct thread *td, struct linux_chmod_args *args) { struct chmod_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(chmod)) printf(ARGS(chmod, "%s, %d"), args->path, args->mode); #endif bsd.path = args->path; bsd.mode = args->mode; return chmod(td, &bsd); } int linux_mkdir(struct thread *td, struct linux_mkdir_args *args) { struct mkdir_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTCREAT(td, &sg, args->path); #ifdef DEBUG if (ldebug(mkdir)) printf(ARGS(mkdir, "%s, %d"), args->path, args->mode); #endif bsd.path = args->path; bsd.mode = args->mode; return mkdir(td, &bsd); } int linux_rmdir(struct thread *td, struct linux_rmdir_args *args) { struct rmdir_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(rmdir)) printf(ARGS(rmdir, "%s"), args->path); #endif bsd.path = args->path; return rmdir(td, &bsd); } int linux_rename(struct thread *td, struct linux_rename_args *args) { struct rename_args bsd; caddr_t sg; sg = stackgap_init(); 
CHECKALTEXIST(td, &sg, args->from); CHECKALTCREAT(td, &sg, args->to); #ifdef DEBUG if (ldebug(rename)) printf(ARGS(rename, "%s, %s"), args->from, args->to); #endif bsd.from = args->from; bsd.to = args->to; return rename(td, &bsd); } int linux_symlink(struct thread *td, struct linux_symlink_args *args) { struct symlink_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); CHECKALTCREAT(td, &sg, args->to); #ifdef DEBUG if (ldebug(symlink)) printf(ARGS(symlink, "%s, %s"), args->path, args->to); #endif bsd.path = args->path; bsd.link = args->to; return symlink(td, &bsd); } int linux_readlink(struct thread *td, struct linux_readlink_args *args) { struct readlink_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->name); #ifdef DEBUG if (ldebug(readlink)) printf(ARGS(readlink, "%s, %p, %d"), args->name, (void *)args->buf, args->count); #endif bsd.path = args->name; bsd.buf = args->buf; bsd.count = args->count; return readlink(td, &bsd); } int linux_truncate(struct thread *td, struct linux_truncate_args *args) { struct truncate_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(truncate)) printf(ARGS(truncate, "%s, %ld"), args->path, (long)args->length); #endif bsd.path = args->path; bsd.length = args->length; return truncate(td, &bsd); } int linux_link(struct thread *td, struct linux_link_args *args) { struct link_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); CHECKALTCREAT(td, &sg, args->to); #ifdef DEBUG if (ldebug(link)) printf(ARGS(link, "%s, %s"), args->path, args->to); #endif bsd.path = args->path; bsd.link = args->to; return link(td, &bsd); } #ifndef __alpha__ int linux_fdatasync(td, uap) struct thread *td; struct linux_fdatasync_args *uap; { struct fsync_args bsd; bsd.fd = uap->fd; return fsync(td, &bsd); } #endif /*!__alpha__*/ int linux_pread(td, uap) struct thread *td; struct linux_pread_args *uap; { struct pread_args bsd; bsd.fd 
= uap->fd; bsd.buf = uap->buf; bsd.nbyte = uap->nbyte; bsd.offset = uap->offset; return pread(td, &bsd); } int linux_pwrite(td, uap) struct thread *td; struct linux_pwrite_args *uap; { struct pwrite_args bsd; bsd.fd = uap->fd; bsd.buf = uap->buf; bsd.nbyte = uap->nbyte; bsd.offset = uap->offset; return pwrite(td, &bsd); } int linux_mount(struct thread *td, struct linux_mount_args *args) { struct ufs_args ufs; char fstypename[MFSNAMELEN]; char mntonname[MNAMELEN], mntfromname[MNAMELEN]; int error; int fsflags; const char *fstype; void *fsdata; error = copyinstr(args->filesystemtype, fstypename, MFSNAMELEN - 1, NULL); if (error) return (error); error = copyinstr(args->specialfile, mntfromname, MFSNAMELEN - 1, NULL); if (error) return (error); error = copyinstr(args->dir, mntonname, MFSNAMELEN - 1, NULL); if (error) return (error); #ifdef DEBUG if (ldebug(mount)) printf(ARGS(mount, "%s, %s, %s"), fstypename, mntfromname, mntonname); #endif if (strcmp(fstypename, "ext2") == 0) { fstype = "ext2fs"; fsdata = &ufs; ufs.fspec = mntfromname; #define DEFAULT_ROOTID -2 ufs.export.ex_root = DEFAULT_ROOTID; ufs.export.ex_flags = args->rwflag & LINUX_MS_RDONLY ? MNT_EXRDONLY : 0; } else if (strcmp(fstypename, "proc") == 0) { fstype = "linprocfs"; fsdata = NULL; } else { return (ENODEV); } fsflags = 0; if ((args->rwflag & 0xffff0000) == 0xc0ed0000) { /* * Linux SYNC flag is not included; the closest equivalent * FreeBSD has is !ASYNC, which is our default. 
*/ if (args->rwflag & LINUX_MS_RDONLY) fsflags |= MNT_RDONLY; if (args->rwflag & LINUX_MS_NOSUID) fsflags |= MNT_NOSUID; if (args->rwflag & LINUX_MS_NODEV) fsflags |= MNT_NODEV; if (args->rwflag & LINUX_MS_NOEXEC) fsflags |= MNT_NOEXEC; if (args->rwflag & LINUX_MS_REMOUNT) fsflags |= MNT_UPDATE; } return (vfs_mount(td, fstype, mntonname, fsflags, fsdata)); } int linux_oldumount(struct thread *td, struct linux_oldumount_args *args) { struct linux_umount_args args2; args2.path = args->path; args2.flags = 0; return (linux_umount(td, &args2)); } int linux_umount(struct thread *td, struct linux_umount_args *args) { struct unmount_args bsd; bsd.path = args->path; bsd.flags = args->flags; /* XXX correct? */ return (unmount(td, &bsd)); } /* * fcntl family of syscalls */ struct l_flock { l_short l_type; l_short l_whence; l_off_t l_start; l_off_t l_len; l_pid_t l_pid; }; static void linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; } static void bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_off_t)bsd_flock->l_start; linux_flock->l_len = (l_off_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #if defined(__i386__) struct l_flock64 { l_short l_type; l_short l_whence; l_loff_t l_start; l_loff_t l_len; l_pid_t 
l_pid; }; static void linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock) { switch (linux_flock->l_type) { case LINUX_F_RDLCK: bsd_flock->l_type = F_RDLCK; break; case LINUX_F_WRLCK: bsd_flock->l_type = F_WRLCK; break; case LINUX_F_UNLCK: bsd_flock->l_type = F_UNLCK; break; default: bsd_flock->l_type = -1; break; } bsd_flock->l_whence = linux_flock->l_whence; bsd_flock->l_start = (off_t)linux_flock->l_start; bsd_flock->l_len = (off_t)linux_flock->l_len; bsd_flock->l_pid = (pid_t)linux_flock->l_pid; } static void bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock) { switch (bsd_flock->l_type) { case F_RDLCK: linux_flock->l_type = LINUX_F_RDLCK; break; case F_WRLCK: linux_flock->l_type = LINUX_F_WRLCK; break; case F_UNLCK: linux_flock->l_type = LINUX_F_UNLCK; break; } linux_flock->l_whence = bsd_flock->l_whence; linux_flock->l_start = (l_loff_t)bsd_flock->l_start; linux_flock->l_len = (l_loff_t)bsd_flock->l_len; linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid; } #endif /* __i386__ */ #if defined(__alpha__) #define linux_fcntl64_args linux_fcntl_args #endif static int fcntl_common(struct thread *td, struct linux_fcntl64_args *args) { struct fcntl_args fcntl_args; struct filedesc *fdp; struct file *fp; int error, result; fcntl_args.fd = args->fd; switch (args->cmd) { case LINUX_F_DUPFD: fcntl_args.cmd = F_DUPFD; fcntl_args.arg = args->arg; return (fcntl(td, &fcntl_args)); case LINUX_F_GETFD: fcntl_args.cmd = F_GETFD; return (fcntl(td, &fcntl_args)); case LINUX_F_SETFD: fcntl_args.cmd = F_SETFD; fcntl_args.arg = args->arg; return (fcntl(td, &fcntl_args)); case LINUX_F_GETFL: fcntl_args.cmd = F_GETFL; error = fcntl(td, &fcntl_args); result = td->td_retval[0]; td->td_retval[0] = 0; if (result & O_RDONLY) td->td_retval[0] |= LINUX_O_RDONLY; if (result & O_WRONLY) td->td_retval[0] |= LINUX_O_WRONLY; if (result & O_RDWR) td->td_retval[0] |= LINUX_O_RDWR; if (result & O_NDELAY) td->td_retval[0] |= LINUX_O_NONBLOCK; if (result & 
O_APPEND) td->td_retval[0] |= LINUX_O_APPEND; if (result & O_FSYNC) td->td_retval[0] |= LINUX_O_SYNC; if (result & O_ASYNC) td->td_retval[0] |= LINUX_FASYNC; return (error); case LINUX_F_SETFL: fcntl_args.arg = 0; if (args->arg & LINUX_O_NDELAY) fcntl_args.arg |= O_NONBLOCK; if (args->arg & LINUX_O_APPEND) fcntl_args.arg |= O_APPEND; if (args->arg & LINUX_O_SYNC) fcntl_args.arg |= O_FSYNC; if (args->arg & LINUX_FASYNC) fcntl_args.arg |= O_ASYNC; fcntl_args.cmd = F_SETFL; return (fcntl(td, &fcntl_args)); case LINUX_F_GETOWN: fcntl_args.cmd = F_GETOWN; return (fcntl(td, &fcntl_args)); case LINUX_F_SETOWN: /* * XXX some Linux applications depend on F_SETOWN having no * significant effect for pipes (SIGIO is not delivered for * pipes under Linux-2.2.35 at least). */ - fdp = td->td_proc->p_fd; - if ((u_int)args->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[args->fd]) == NULL) - return (EBADF); - if (fp->f_type == DTYPE_PIPE) + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return EBADF; + if (fp->f_type == DTYPE_PIPE) { + fdrop(fp, td); return (EINVAL); + } + fdrop(fp, td); fcntl_args.cmd = F_SETOWN; fcntl_args.arg = args->arg; return (fcntl(td, &fcntl_args)); } return (EINVAL); } int linux_fcntl(struct thread *td, struct linux_fcntl_args *args) { struct linux_fcntl64_args args64; struct fcntl_args fcntl_args; struct l_flock linux_flock; struct flock *bsd_flock; int error; caddr_t sg; sg = stackgap_init(); bsd_flock = (struct flock *)stackgap_alloc(&sg, sizeof(bsd_flock)); #ifdef DEBUG if (ldebug(fcntl)) printf(ARGS(fcntl, "%d, %08x, *"), args->fd, args->cmd); #endif switch (args->cmd) { case LINUX_F_GETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_GETLK; fcntl_args.arg = (long)bsd_flock; error = fcntl(td, &fcntl_args); if (error) return (error); bsd_to_linux_flock(bsd_flock, &linux_flock); return (copyout(&linux_flock, 
(caddr_t)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLK; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); case LINUX_F_SETLKW: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLKW; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); } args64.fd = args->fd; args64.cmd = args->cmd; args64.arg = args->arg; return (fcntl_common(td, &args64)); } #if defined(__i386__) int linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args) { struct fcntl_args fcntl_args; struct l_flock64 linux_flock; struct flock *bsd_flock; int error; caddr_t sg; sg = stackgap_init(); bsd_flock = (struct flock *)stackgap_alloc(&sg, sizeof(bsd_flock)); #ifdef DEBUG if (ldebug(fcntl64)) printf(ARGS(fcntl64, "%d, %08x, *"), args->fd, args->cmd); #endif switch (args->cmd) { case LINUX_F_GETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_GETLK; fcntl_args.arg = (long)bsd_flock; error = fcntl(td, &fcntl_args); if (error) return (error); bsd_to_linux_flock64(bsd_flock, &linux_flock); return (copyout(&linux_flock, (caddr_t)args->arg, sizeof(linux_flock))); case LINUX_F_SETLK: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLK; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); case LINUX_F_SETLKW: error = copyin((caddr_t)args->arg, &linux_flock, sizeof(linux_flock)); if (error) return (error); linux_to_bsd_flock64(&linux_flock, 
bsd_flock); fcntl_args.fd = args->fd; fcntl_args.cmd = F_SETLKW; fcntl_args.arg = (long)bsd_flock; return (fcntl(td, &fcntl_args)); } return (fcntl_common(td, args)); } #endif /* __i386__ */ int linux_chown(struct thread *td, struct linux_chown_args *args) { struct chown_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(chown)) printf(ARGS(chown, "%s, %d, %d"), args->path, args->uid, args->gid); #endif bsd.path = args->path; bsd.uid = args->uid; bsd.gid = args->gid; return (chown(td, &bsd)); } int linux_lchown(struct thread *td, struct linux_lchown_args *args) { struct lchown_args bsd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(lchown)) printf(ARGS(lchown, "%s, %d, %d"), args->path, args->uid, args->gid); #endif bsd.path = args->path; bsd.uid = args->uid; bsd.gid = args->gid; return (lchown(td, &bsd)); } Index: head/sys/compat/linux/linux_ioctl.c =================================================================== --- head/sys/compat/linux/linux_ioctl.c (revision 89305) +++ head/sys/compat/linux/linux_ioctl.c (revision 89306) @@ -1,2256 +1,2346 @@ /* * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The name of the author may not be used to endorse or promote products * derived from this software withough specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static linux_ioctl_function_t linux_ioctl_cdrom; static linux_ioctl_function_t linux_ioctl_console; static linux_ioctl_function_t linux_ioctl_disk; static linux_ioctl_function_t linux_ioctl_socket; static linux_ioctl_function_t linux_ioctl_sound; static linux_ioctl_function_t linux_ioctl_termio; static linux_ioctl_function_t linux_ioctl_private; static linux_ioctl_function_t linux_ioctl_special; static struct linux_ioctl_handler cdrom_handler = { linux_ioctl_cdrom, LINUX_IOCTL_CDROM_MIN, LINUX_IOCTL_CDROM_MAX }; static struct linux_ioctl_handler console_handler = { linux_ioctl_console, LINUX_IOCTL_CONSOLE_MIN, LINUX_IOCTL_CONSOLE_MAX }; static struct linux_ioctl_handler disk_handler = { linux_ioctl_disk, LINUX_IOCTL_DISK_MIN, LINUX_IOCTL_DISK_MAX }; static struct linux_ioctl_handler socket_handler = { linux_ioctl_socket, 
LINUX_IOCTL_SOCKET_MIN, LINUX_IOCTL_SOCKET_MAX }; static struct linux_ioctl_handler sound_handler = { linux_ioctl_sound, LINUX_IOCTL_SOUND_MIN, LINUX_IOCTL_SOUND_MAX }; static struct linux_ioctl_handler termio_handler = { linux_ioctl_termio, LINUX_IOCTL_TERMIO_MIN, LINUX_IOCTL_TERMIO_MAX }; static struct linux_ioctl_handler private_handler = { linux_ioctl_private, LINUX_IOCTL_PRIVATE_MIN, LINUX_IOCTL_PRIVATE_MAX }; DATA_SET(linux_ioctl_handler_set, cdrom_handler); DATA_SET(linux_ioctl_handler_set, console_handler); DATA_SET(linux_ioctl_handler_set, disk_handler); DATA_SET(linux_ioctl_handler_set, socket_handler); DATA_SET(linux_ioctl_handler_set, sound_handler); DATA_SET(linux_ioctl_handler_set, termio_handler); DATA_SET(linux_ioctl_handler_set, private_handler); struct handler_element { TAILQ_ENTRY(handler_element) list; int (*func)(struct thread *, struct linux_ioctl_args *); int low, high, span; }; static TAILQ_HEAD(, handler_element) handlers = TAILQ_HEAD_INITIALIZER(handlers); static int linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args) { - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; int error; struct disklabel dl; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_BLKGETSIZE: error = fo_ioctl(fp, DIOCGDINFO, (caddr_t)&dl, td); + fdrop(fp, td); if (error) return (error); return (copyout(&(dl.d_secperunit), (caddr_t)args->arg, sizeof(dl.d_secperunit))); - break; } + fdrop(fp, td); return (ENOIOCTL); } /* * termio related ioctls */ struct linux_termio { unsigned short c_iflag; unsigned short c_oflag; unsigned short c_cflag; unsigned short c_lflag; unsigned char c_line; unsigned char c_cc[LINUX_NCC]; }; struct linux_termios { unsigned int c_iflag; unsigned int c_oflag; unsigned int c_cflag; unsigned int c_lflag; #ifdef __alpha__ unsigned char c_cc[LINUX_NCCS]; unsigned char c_line; unsigned int c_ispeed; unsigned int c_ospeed; #else unsigned char c_line; 
unsigned char c_cc[LINUX_NCCS]; #endif }; struct linux_winsize { unsigned short ws_row, ws_col; unsigned short ws_xpixel, ws_ypixel; }; static struct speedtab sptab[] = { { B0, LINUX_B0 }, { B50, LINUX_B50 }, { B75, LINUX_B75 }, { B110, LINUX_B110 }, { B134, LINUX_B134 }, { B150, LINUX_B150 }, { B200, LINUX_B200 }, { B300, LINUX_B300 }, { B600, LINUX_B600 }, { B1200, LINUX_B1200 }, { B1800, LINUX_B1800 }, { B2400, LINUX_B2400 }, { B4800, LINUX_B4800 }, { B9600, LINUX_B9600 }, { B19200, LINUX_B19200 }, { B38400, LINUX_B38400 }, { B57600, LINUX_B57600 }, { B115200, LINUX_B115200 }, {-1, -1 } }; struct linux_serial_struct { int type; int line; int port; int irq; int flags; int xmit_fifo_size; int custom_divisor; int baud_base; unsigned short close_delay; char reserved_char[2]; int hub6; unsigned short closing_wait; unsigned short closing_wait2; int reserved[4]; }; static int linux_to_bsd_speed(int code, struct speedtab *table) { for ( ; table->sp_code != -1; table++) if (table->sp_code == code) return (table->sp_speed); return -1; } static int bsd_to_linux_speed(int speed, struct speedtab *table) { for ( ; table->sp_speed != -1; table++) if (table->sp_speed == speed) return (table->sp_code); return -1; } static void bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif lios->c_iflag = 0; if (bios->c_iflag & IGNBRK) lios->c_iflag |= LINUX_IGNBRK; if (bios->c_iflag & BRKINT) lios->c_iflag |= LINUX_BRKINT; if (bios->c_iflag & IGNPAR) lios->c_iflag |= LINUX_IGNPAR; if (bios->c_iflag & PARMRK) lios->c_iflag |= LINUX_PARMRK; if (bios->c_iflag & INPCK) lios->c_iflag |= LINUX_INPCK; if (bios->c_iflag & ISTRIP) lios->c_iflag |= LINUX_ISTRIP; if 
(bios->c_iflag & INLCR) lios->c_iflag |= LINUX_INLCR; if (bios->c_iflag & IGNCR) lios->c_iflag |= LINUX_IGNCR; if (bios->c_iflag & ICRNL) lios->c_iflag |= LINUX_ICRNL; if (bios->c_iflag & IXON) lios->c_iflag |= LINUX_IXON; if (bios->c_iflag & IXANY) lios->c_iflag |= LINUX_IXANY; if (bios->c_iflag & IXOFF) lios->c_iflag |= LINUX_IXOFF; if (bios->c_iflag & IMAXBEL) lios->c_iflag |= LINUX_IMAXBEL; lios->c_oflag = 0; if (bios->c_oflag & OPOST) lios->c_oflag |= LINUX_OPOST; if (bios->c_oflag & ONLCR) lios->c_oflag |= LINUX_ONLCR; if (bios->c_oflag & OXTABS) lios->c_oflag |= LINUX_XTABS; lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab); lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4; if (bios->c_cflag & CSTOPB) lios->c_cflag |= LINUX_CSTOPB; if (bios->c_cflag & CREAD) lios->c_cflag |= LINUX_CREAD; if (bios->c_cflag & PARENB) lios->c_cflag |= LINUX_PARENB; if (bios->c_cflag & PARODD) lios->c_cflag |= LINUX_PARODD; if (bios->c_cflag & HUPCL) lios->c_cflag |= LINUX_HUPCL; if (bios->c_cflag & CLOCAL) lios->c_cflag |= LINUX_CLOCAL; if (bios->c_cflag & CRTSCTS) lios->c_cflag |= LINUX_CRTSCTS; lios->c_lflag = 0; if (bios->c_lflag & ISIG) lios->c_lflag |= LINUX_ISIG; if (bios->c_lflag & ICANON) lios->c_lflag |= LINUX_ICANON; if (bios->c_lflag & ECHO) lios->c_lflag |= LINUX_ECHO; if (bios->c_lflag & ECHOE) lios->c_lflag |= LINUX_ECHOE; if (bios->c_lflag & ECHOK) lios->c_lflag |= LINUX_ECHOK; if (bios->c_lflag & ECHONL) lios->c_lflag |= LINUX_ECHONL; if (bios->c_lflag & NOFLSH) lios->c_lflag |= LINUX_NOFLSH; if (bios->c_lflag & TOSTOP) lios->c_lflag |= LINUX_TOSTOP; if (bios->c_lflag & ECHOCTL) lios->c_lflag |= LINUX_ECHOCTL; if (bios->c_lflag & ECHOPRT) lios->c_lflag |= LINUX_ECHOPRT; if (bios->c_lflag & ECHOKE) lios->c_lflag |= LINUX_ECHOKE; if (bios->c_lflag & FLUSHO) lios->c_lflag |= LINUX_FLUSHO; if (bios->c_lflag & PENDIN) lios->c_lflag |= LINUX_PENDIN; if (bios->c_lflag & IEXTEN) lios->c_lflag |= LINUX_IEXTEN; for (i=0; ic_cc[i] = LINUX_POSIX_VDISABLE; 
lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR]; lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT]; lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE]; lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL]; lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF]; lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL]; lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN]; lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME]; lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2]; lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP]; lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART]; lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP]; lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT]; lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD]; lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE]; lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT]; for (i=0; ic_cc[i] == _POSIX_VDISABLE) lios->c_cc[i] = LINUX_POSIX_VDISABLE; } lios->c_line = 0; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios) { int i; #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: LINUX termios structure (input):\n"); printf("i=%08x o=%08x c=%08x l=%08x line=%d\n", lios->c_iflag, lios->c_oflag, lios->c_cflag, lios->c_lflag, (int)lios->c_line); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif bios->c_iflag = 0; if (lios->c_iflag & LINUX_IGNBRK) bios->c_iflag |= IGNBRK; if (lios->c_iflag & LINUX_BRKINT) bios->c_iflag |= BRKINT; if (lios->c_iflag & LINUX_IGNPAR) bios->c_iflag |= IGNPAR; if (lios->c_iflag & LINUX_PARMRK) bios->c_iflag |= PARMRK; if (lios->c_iflag & LINUX_INPCK) bios->c_iflag |= INPCK; if (lios->c_iflag & LINUX_ISTRIP) bios->c_iflag |= ISTRIP; if (lios->c_iflag & LINUX_INLCR) bios->c_iflag |= INLCR; if (lios->c_iflag & LINUX_IGNCR) bios->c_iflag |= IGNCR; if (lios->c_iflag & LINUX_ICRNL) 
bios->c_iflag |= ICRNL; if (lios->c_iflag & LINUX_IXON) bios->c_iflag |= IXON; if (lios->c_iflag & LINUX_IXANY) bios->c_iflag |= IXANY; if (lios->c_iflag & LINUX_IXOFF) bios->c_iflag |= IXOFF; if (lios->c_iflag & LINUX_IMAXBEL) bios->c_iflag |= IMAXBEL; bios->c_oflag = 0; if (lios->c_oflag & LINUX_OPOST) bios->c_oflag |= OPOST; if (lios->c_oflag & LINUX_ONLCR) bios->c_oflag |= ONLCR; if (lios->c_oflag & LINUX_XTABS) bios->c_oflag |= OXTABS; bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4; if (lios->c_cflag & LINUX_CSTOPB) bios->c_cflag |= CSTOPB; if (lios->c_cflag & LINUX_CREAD) bios->c_cflag |= CREAD; if (lios->c_cflag & LINUX_PARENB) bios->c_cflag |= PARENB; if (lios->c_cflag & LINUX_PARODD) bios->c_cflag |= PARODD; if (lios->c_cflag & LINUX_HUPCL) bios->c_cflag |= HUPCL; if (lios->c_cflag & LINUX_CLOCAL) bios->c_cflag |= CLOCAL; if (lios->c_cflag & LINUX_CRTSCTS) bios->c_cflag |= CRTSCTS; bios->c_lflag = 0; if (lios->c_lflag & LINUX_ISIG) bios->c_lflag |= ISIG; if (lios->c_lflag & LINUX_ICANON) bios->c_lflag |= ICANON; if (lios->c_lflag & LINUX_ECHO) bios->c_lflag |= ECHO; if (lios->c_lflag & LINUX_ECHOE) bios->c_lflag |= ECHOE; if (lios->c_lflag & LINUX_ECHOK) bios->c_lflag |= ECHOK; if (lios->c_lflag & LINUX_ECHONL) bios->c_lflag |= ECHONL; if (lios->c_lflag & LINUX_NOFLSH) bios->c_lflag |= NOFLSH; if (lios->c_lflag & LINUX_TOSTOP) bios->c_lflag |= TOSTOP; if (lios->c_lflag & LINUX_ECHOCTL) bios->c_lflag |= ECHOCTL; if (lios->c_lflag & LINUX_ECHOPRT) bios->c_lflag |= ECHOPRT; if (lios->c_lflag & LINUX_ECHOKE) bios->c_lflag |= ECHOKE; if (lios->c_lflag & LINUX_FLUSHO) bios->c_lflag |= FLUSHO; if (lios->c_lflag & LINUX_PENDIN) bios->c_lflag |= PENDIN; if (lios->c_lflag & LINUX_IEXTEN) bios->c_lflag |= IEXTEN; for (i=0; ic_cc[i] = _POSIX_VDISABLE; bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR]; bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT]; bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE]; bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL]; bios->c_cc[VEOF] = 
lios->c_cc[LINUX_VEOF]; bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL]; bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN]; bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME]; bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2]; bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP]; bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART]; bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP]; bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT]; bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD]; bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE]; bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT]; for (i=0; ic_cc[i] == LINUX_POSIX_VDISABLE) bios->c_cc[i] = _POSIX_VDISABLE; } bios->c_ispeed = bios->c_ospeed = linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab); #ifdef DEBUG if (ldebug(ioctl)) { printf("LINUX: BSD termios structure (output):\n"); printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n", bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag, bios->c_ispeed, bios->c_ospeed); printf("c_cc "); for (i=0; ic_cc[i]); printf("\n"); } #endif } static void bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio) { struct linux_termios lios; bsd_to_linux_termios(bios, &lios); lio->c_iflag = lios.c_iflag; lio->c_oflag = lios.c_oflag; lio->c_cflag = lios.c_cflag; lio->c_lflag = lios.c_lflag; lio->c_line = lios.c_line; #ifdef __alpha__ lio->c_cc[LINUX__VINTR] = lios.c_cc[LINUX_VINTR]; lio->c_cc[LINUX__VQUIT] = lios.c_cc[LINUX_VQUIT]; lio->c_cc[LINUX__VERASE] = lios.c_cc[LINUX_VERASE]; lio->c_cc[LINUX__VKILL] = lios.c_cc[LINUX_VKILL]; lio->c_cc[LINUX__VEOF] = lios.c_cc[(lios.c_lflag & ICANON) ? LINUX_VEOF : LINUX_VMIN]; lio->c_cc[LINUX__VEOL] = lios.c_cc[(lios.c_lflag & ICANON) ? 
LINUX_VEOL : LINUX_VTIME]; lio->c_cc[LINUX__VEOL2] = lios.c_cc[LINUX_VEOL2]; lio->c_cc[LINUX__VSWTC] = lios.c_cc[LINUX_VSWTC]; #else memcpy(lio->c_cc, lios.c_cc, LINUX_NCC); #endif } static void linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios) { struct linux_termios lios; int i; lios.c_iflag = lio->c_iflag; lios.c_oflag = lio->c_oflag; lios.c_cflag = lio->c_cflag; lios.c_lflag = lio->c_lflag; #ifdef __alpha__ for (i=0; ic_cc[LINUX__VINTR]; lios.c_cc[LINUX_VQUIT] = lio->c_cc[LINUX__VQUIT]; lios.c_cc[LINUX_VERASE] = lio->c_cc[LINUX__VERASE]; lios.c_cc[LINUX_VKILL] = lio->c_cc[LINUX__VKILL]; lios.c_cc[LINUX_VEOL2] = lio->c_cc[LINUX__VEOL2]; lios.c_cc[LINUX_VSWTC] = lio->c_cc[LINUX__VSWTC]; lios.c_cc[(lio->c_lflag & ICANON) ? LINUX_VEOF : LINUX_VMIN] = lio->c_cc[LINUX__VEOF]; lios.c_cc[(lio->c_lflag & ICANON) ? LINUX_VEOL : LINUX_VTIME] = lio->c_cc[LINUX__VEOL]; #else for (i=LINUX_NCC; ic_cc, LINUX_NCC); #endif linux_to_bsd_termios(&lios, bios); } static int linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args) { struct termios bios; struct linux_termios lios; struct linux_termio lio; - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; int error; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_TCGETS: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td); if (error) - return (error); + break; bsd_to_linux_termios(&bios, &lios); - return copyout(&lios, (caddr_t)args->arg, sizeof(lios)); + error = copyout(&lios, (caddr_t)args->arg, sizeof(lios)); + break; case LINUX_TCSETS: error = copyin((caddr_t)args->arg, &lios, sizeof(lios)); if (error) - return (error); + break; linux_to_bsd_termios(&lios, &bios); - return (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + break; case LINUX_TCSETSW: error = copyin((caddr_t)args->arg, &lios, sizeof(lios)); if (error) - return (error); + break; 
linux_to_bsd_termios(&lios, &bios); - return (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + break; case LINUX_TCSETSF: error = copyin((caddr_t)args->arg, &lios, sizeof(lios)); if (error) - return (error); + break; linux_to_bsd_termios(&lios, &bios); - return (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + break; case LINUX_TCGETA: error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td); if (error) - return (error); + break; bsd_to_linux_termio(&bios, &lio); - return (copyout(&lio, (caddr_t)args->arg, sizeof(lio))); + error = (copyout(&lio, (caddr_t)args->arg, sizeof(lio))); + break; case LINUX_TCSETA: error = copyin((caddr_t)args->arg, &lio, sizeof(lio)); if (error) - return (error); + break; linux_to_bsd_termio(&lio, &bios); - return (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td)); + break; case LINUX_TCSETAW: error = copyin((caddr_t)args->arg, &lio, sizeof(lio)); if (error) - return (error); + break; linux_to_bsd_termio(&lio, &bios); - return (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td)); + break; case LINUX_TCSETAF: error = copyin((caddr_t)args->arg, &lio, sizeof(lio)); if (error) - return (error); + break; linux_to_bsd_termio(&lio, &bios); - return (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td)); + break; /* LINUX_TCSBRK */ case LINUX_TCXONC: { switch (args->arg) { case LINUX_TCOOFF: args->cmd = TIOCSTOP; break; case LINUX_TCOON: args->cmd = TIOCSTART; break; case LINUX_TCIOFF: case LINUX_TCION: { int c; struct write_args wr; error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td); if (error) - return (error); + break; + fdrop(fp, td); c = (args->arg == LINUX_TCIOFF) ? 
VSTOP : VSTART; c = bios.c_cc[c]; if (c != _POSIX_VDISABLE) { wr.fd = args->fd; wr.buf = &c; wr.nbyte = sizeof(c); return (write(td, &wr)); } else return (0); } default: + fdrop(fp, td); return (EINVAL); } args->arg = 0; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; } case LINUX_TCFLSH: { args->cmd = TIOCFLUSH; switch (args->arg) { case LINUX_TCIFLUSH: args->arg = FREAD; break; case LINUX_TCOFLUSH: args->arg = FWRITE; break; case LINUX_TCIOFLUSH: args->arg = FREAD | FWRITE; break; default: + fdrop(fp, td); return (EINVAL); } - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; } case LINUX_TIOCEXCL: args->cmd = TIOCEXCL; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCNXCL: args->cmd = TIOCNXCL; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCSCTTY */ case LINUX_TIOCGPGRP: args->cmd = TIOCGPGRP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCSPGRP: args->cmd = TIOCSPGRP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCOUTQ */ /* LINUX_TIOCSTI */ case LINUX_TIOCGWINSZ: args->cmd = TIOCGWINSZ; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCSWINSZ: args->cmd = TIOCSWINSZ; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCMGET: args->cmd = TIOCMGET; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCMBIS: args->cmd = TIOCMBIS; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case 
LINUX_TIOCMBIC: args->cmd = TIOCMBIC; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCMSET: args->cmd = TIOCMSET; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* TIOCGSOFTCAR */ /* TIOCSSOFTCAR */ case LINUX_FIONREAD: /* LINUX_TIOCINQ */ args->cmd = FIONREAD; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCLINUX */ case LINUX_TIOCCONS: args->cmd = TIOCCONS; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCGSERIAL: { struct linux_serial_struct lss; lss.type = LINUX_PORT_16550A; lss.flags = 0; lss.close_delay = 0; - return copyout(&lss, (caddr_t)args->arg, sizeof(lss)); + error = copyout(&lss, (caddr_t)args->arg, sizeof(lss)); + break; } case LINUX_TIOCSSERIAL: { struct linux_serial_struct lss; error = copyin((caddr_t)args->arg, &lss, sizeof(lss)); if (error) - return (error); + break; /* XXX - It really helps to have an implementation that * does nothing. NOT! 
*/ - return (0); + error = 0; + break; } /* LINUX_TIOCPKT */ case LINUX_FIONBIO: args->cmd = FIONBIO; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCNOTTY: args->cmd = TIOCNOTTY; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_TIOCSETD: { int line; switch (args->arg) { case LINUX_N_TTY: line = TTYDISC; break; case LINUX_N_SLIP: line = SLIPDISC; break; case LINUX_N_PPP: line = PPPDISC; break; default: + fdrop(fp, td); return (EINVAL); } - return (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td)); + error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td)); + break; } case LINUX_TIOCGETD: { int linux_line; int bsd_line = TTYDISC; error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line, td); if (error) return (error); switch (bsd_line) { case TTYDISC: linux_line = LINUX_N_TTY; break; case SLIPDISC: linux_line = LINUX_N_SLIP; break; case PPPDISC: linux_line = LINUX_N_PPP; break; default: + fdrop(fp, td); return (EINVAL); } - return (copyout(&linux_line, (caddr_t)args->arg, sizeof(int))); + error = (copyout(&linux_line, (caddr_t)args->arg, sizeof(int))); + break; } /* LINUX_TCSBRKP */ /* LINUX_TIOCTTYGSTRUCT */ case LINUX_FIONCLEX: args->cmd = FIONCLEX; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_FIOCLEX: args->cmd = FIOCLEX; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_FIOASYNC: args->cmd = FIOASYNC; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_TIOCSERCONFIG */ /* LINUX_TIOCSERGWILD */ /* LINUX_TIOCSERSWILD */ /* LINUX_TIOCGLCKTRMIOS */ /* LINUX_TIOCSLCKTRMIOS */ + default: + error = ENOIOCTL; + break; } - return (ENOIOCTL); + fdrop(fp, td); + return (error); } /* * CDROM related ioctls */ struct linux_cdrom_msf { u_char 
cdmsf_min0; u_char cdmsf_sec0; u_char cdmsf_frame0; u_char cdmsf_min1; u_char cdmsf_sec1; u_char cdmsf_frame1; }; struct linux_cdrom_tochdr { u_char cdth_trk0; u_char cdth_trk1; }; union linux_cdrom_addr { struct { u_char minute; u_char second; u_char frame; } msf; int lba; }; struct linux_cdrom_tocentry { u_char cdte_track; u_char cdte_adr:4; u_char cdte_ctrl:4; u_char cdte_format; union linux_cdrom_addr cdte_addr; u_char cdte_datamode; }; struct linux_cdrom_subchnl { u_char cdsc_format; u_char cdsc_audiostatus; u_char cdsc_adr:4; u_char cdsc_ctrl:4; u_char cdsc_trk; u_char cdsc_ind; union linux_cdrom_addr cdsc_absaddr; union linux_cdrom_addr cdsc_reladdr; }; struct l_dvd_layer { u_char book_version:4; u_char book_type:4; u_char min_rate:4; u_char disc_size:4; u_char layer_type:4; u_char track_path:1; u_char nlayers:2; u_char track_density:4; u_char linear_density:4; u_char bca:1; u_int32_t start_sector; u_int32_t end_sector; u_int32_t end_sector_l0; }; struct l_dvd_physical { u_char type; u_char layer_num; struct l_dvd_layer layer[4]; }; struct l_dvd_copyright { u_char type; u_char layer_num; u_char cpst; u_char rmi; }; struct l_dvd_disckey { u_char type; l_uint agid:2; u_char value[2048]; }; struct l_dvd_bca { u_char type; l_int len; u_char value[188]; }; struct l_dvd_manufact { u_char type; u_char layer_num; l_int len; u_char value[2048]; }; typedef union { u_char type; struct l_dvd_physical physical; struct l_dvd_copyright copyright; struct l_dvd_disckey disckey; struct l_dvd_bca bca; struct l_dvd_manufact manufact; } l_dvd_struct; typedef u_char l_dvd_key[5]; typedef u_char l_dvd_challenge[10]; struct l_dvd_lu_send_agid { u_char type; l_uint agid:2; }; struct l_dvd_host_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_send_key { u_char type; l_uint agid:2; l_dvd_key key; }; struct l_dvd_lu_send_challenge { u_char type; l_uint agid:2; l_dvd_challenge chal; }; struct l_dvd_lu_send_title_key { u_char type; l_uint agid:2; 
l_dvd_key title_key; l_int lba; l_uint cpm:1; l_uint cp_sec:1; l_uint cgms:2; }; struct l_dvd_lu_send_asf { u_char type; l_uint agid:2; l_uint asf:1; }; struct l_dvd_host_send_rpcstate { u_char type; u_char pdrc; }; struct l_dvd_lu_send_rpcstate { u_char type:2; u_char vra:3; u_char ucca:3; u_char region_mask; u_char rpc_scheme; }; typedef union { u_char type; struct l_dvd_lu_send_agid lsa; struct l_dvd_host_send_challenge hsc; struct l_dvd_send_key lsk; struct l_dvd_lu_send_challenge lsc; struct l_dvd_send_key hsk; struct l_dvd_lu_send_title_key lstk; struct l_dvd_lu_send_asf lsasf; struct l_dvd_host_send_rpcstate hrpcs; struct l_dvd_lu_send_rpcstate lrpcs; } l_dvd_authinfo; static void bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp) { if (af == CD_LBA_FORMAT) lp->lba = bp->lba; else { lp->msf.minute = bp->msf.minute; lp->msf.second = bp->msf.second; lp->msf.frame = bp->msf.frame; } } static void set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba) { if (format == LINUX_CDROM_MSF) { addr->msf.frame = lba % 75; lba /= 75; lba += 2; addr->msf.second = lba % 60; addr->msf.minute = lba / 60; } else addr->lba = lba; } static int linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp) { bp->format = lp->type; switch (bp->format) { case DVD_STRUCT_PHYSICAL: if (bp->layer_num >= 4) return (EINVAL); bp->layer_num = lp->physical.layer_num; break; case DVD_STRUCT_COPYRIGHT: bp->layer_num = lp->copyright.layer_num; break; case DVD_STRUCT_DISCKEY: bp->agid = lp->disckey.agid; break; case DVD_STRUCT_BCA: case DVD_STRUCT_MANUFACT: break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp) { switch (bp->format) { case DVD_STRUCT_PHYSICAL: { struct dvd_layer *blp = (struct dvd_layer *)bp->data; struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num]; memset(llp, 0, sizeof(*llp)); llp->book_version = blp->book_version; llp->book_type = blp->book_type; 
llp->min_rate = blp->max_rate; llp->disc_size = blp->disc_size; llp->layer_type = blp->layer_type; llp->track_path = blp->track_path; llp->nlayers = blp->nlayers; llp->track_density = blp->track_density; llp->linear_density = blp->linear_density; llp->bca = blp->bca; llp->start_sector = blp->start_sector; llp->end_sector = blp->end_sector; llp->end_sector_l0 = blp->end_sector_l0; break; } case DVD_STRUCT_COPYRIGHT: lp->copyright.cpst = bp->cpst; lp->copyright.rmi = bp->rmi; break; case DVD_STRUCT_DISCKEY: memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value)); break; case DVD_STRUCT_BCA: lp->bca.len = bp->length; memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value)); break; case DVD_STRUCT_MANUFACT: lp->manufact.len = bp->length; memcpy(lp->manufact.value, bp->data, sizeof(lp->manufact.value)); /* lp->manufact.layer_num is unused in linux (redhat 7.0) */ break; default: return (EINVAL); } return (0); } static int linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode, struct dvd_authinfo *bp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_CHALLENGE; bp->agid = lp->hsc.agid; memcpy(bp->keychal, lp->hsc.chal, 10); break; case LINUX_DVD_LU_SEND_KEY1: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_KEY1; bp->agid = lp->lsk.agid; break; case LINUX_DVD_LU_SEND_CHALLENGE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_CHALLENGE; bp->agid = lp->lsc.agid; break; case LINUX_DVD_HOST_SEND_KEY2: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_KEY2; bp->agid = lp->hsk.agid; memcpy(bp->keychal, lp->hsk.key, 5); break; case LINUX_DVD_LU_SEND_TITLE_KEY: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_TITLE_KEY; bp->agid = lp->lstk.agid; bp->lba = lp->lstk.lba; break; case LINUX_DVD_LU_SEND_ASF: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_ASF; bp->agid = lp->lsasf.agid; break; case 
LINUX_DVD_INVALIDATE_AGID: *bcode = DVDIOCREPORTKEY; bp->format = DVD_INVALIDATE_AGID; bp->agid = lp->lsa.agid; break; case LINUX_DVD_LU_SEND_RPC_STATE: *bcode = DVDIOCREPORTKEY; bp->format = DVD_REPORT_RPC; break; case LINUX_DVD_HOST_SEND_RPC_STATE: *bcode = DVDIOCSENDKEY; bp->format = DVD_SEND_RPC; bp->region = lp->hrpcs.pdrc; break; default: return (EINVAL); } return (0); } static int bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp) { switch (lp->type) { case LINUX_DVD_LU_SEND_AGID: lp->lsa.agid = bp->agid; break; case LINUX_DVD_HOST_SEND_CHALLENGE: lp->type = LINUX_DVD_LU_SEND_KEY1; break; case LINUX_DVD_LU_SEND_KEY1: memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key)); break; case LINUX_DVD_LU_SEND_CHALLENGE: memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal)); break; case LINUX_DVD_HOST_SEND_KEY2: lp->type = LINUX_DVD_AUTH_ESTABLISHED; break; case LINUX_DVD_LU_SEND_TITLE_KEY: memcpy(lp->lstk.title_key, bp->keychal, sizeof(lp->lstk.title_key)); lp->lstk.cpm = bp->cpm; lp->lstk.cp_sec = bp->cp_sec; lp->lstk.cgms = bp->cgms; break; case LINUX_DVD_LU_SEND_ASF: lp->lsasf.asf = bp->asf; break; case LINUX_DVD_INVALIDATE_AGID: break; case LINUX_DVD_LU_SEND_RPC_STATE: lp->lrpcs.type = bp->reg_type; lp->lrpcs.vra = bp->vend_rsts; lp->lrpcs.ucca = bp->user_rsts; lp->lrpcs.region_mask = bp->region; lp->lrpcs.rpc_scheme = bp->rpc_scheme; break; case LINUX_DVD_HOST_SEND_RPC_STATE: break; default: return (EINVAL); } return (0); } static int linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args) { - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; int error; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_CDROMPAUSE: args->cmd = CDIOCPAUSE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMRESUME: args->cmd = CDIOCRESUME; - return (ioctl(td, (struct ioctl_args *)args)); + 
error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMPLAYMSF: args->cmd = CDIOCPLAYMSF; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMPLAYTRKIND: args->cmd = CDIOCPLAYTRACKS; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMREADTOCHDR: { struct ioc_toc_header th; struct linux_cdrom_tochdr lth; error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th, td); if (!error) { lth.cdth_trk0 = th.starting_track; lth.cdth_trk1 = th.ending_track; copyout(&lth, (caddr_t)args->arg, sizeof(lth)); } - return (error); + break; } case LINUX_CDROMREADTOCENTRY: { struct linux_cdrom_tocentry lte, *ltep = (struct linux_cdrom_tocentry *)args->arg; struct ioc_read_toc_single_entry irtse; irtse.address_format = ltep->cdte_format; irtse.track = ltep->cdte_track; error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse, td); if (!error) { lte = *ltep; lte.cdte_ctrl = irtse.entry.control; lte.cdte_adr = irtse.entry.addr_type; bsd_to_linux_msf_lba(irtse.address_format, &irtse.entry.addr, &lte.cdte_addr); copyout(&lte, (caddr_t)args->arg, sizeof(lte)); } - return (error); + break; } case LINUX_CDROMSTOP: args->cmd = CDIOCSTOP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMSTART: args->cmd = CDIOCSTART; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_CDROMEJECT: args->cmd = CDIOCEJECT; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_CDROMVOLCTRL */ case LINUX_CDROMSUBCHNL: { struct linux_cdrom_subchnl sc; struct ioc_read_subchannel bsdsc; struct cd_sub_channel_info *bsdinfo; caddr_t sg = stackgap_init(); bsdinfo = (struct cd_sub_channel_info*)stackgap_alloc(&sg, sizeof(struct cd_sub_channel_info)); bsdsc.address_format =
CD_LBA_FORMAT; bsdsc.data_format = CD_CURRENT_POSITION; bsdsc.track = 0; bsdsc.data_len = sizeof(struct cd_sub_channel_info); bsdsc.data = bsdinfo; error = fo_ioctl(fp, CDIOCREADSUBCHANNEL, (caddr_t)&bsdsc, td); if (error) - return (error); + break; error = copyin((caddr_t)args->arg, &sc, sizeof(struct linux_cdrom_subchnl)); if (error) - return (error); + break; sc.cdsc_audiostatus = bsdinfo->header.audio_status; sc.cdsc_adr = bsdinfo->what.position.addr_type; sc.cdsc_ctrl = bsdinfo->what.position.control; sc.cdsc_trk = bsdinfo->what.position.track_number; sc.cdsc_ind = bsdinfo->what.position.index_number; set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format, bsdinfo->what.position.absaddr.lba); set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format, bsdinfo->what.position.reladdr.lba); error = copyout(&sc, (caddr_t)args->arg, sizeof(struct linux_cdrom_subchnl)); - return (error); + break; } /* LINUX_CDROMREADMODE2 */ /* LINUX_CDROMREADMODE1 */ /* LINUX_CDROMREADAUDIO */ /* LINUX_CDROMEJECT_SW */ /* LINUX_CDROMMULTISESSION */ /* LINUX_CDROM_GET_UPC */ case LINUX_CDROMRESET: args->cmd = CDIOCRESET; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; /* LINUX_CDROMVOLREAD */ /* LINUX_CDROMREADRAW */ /* LINUX_CDROMREADCOOKED */ /* LINUX_CDROMSEEK */ /* LINUX_CDROMPLAYBLK */ /* LINUX_CDROMREADALL */ /* LINUX_CDROMCLOSETRAY */ /* LINUX_CDROMLOADFROMSLOT */ /* LINUX_CDROMGETSPINDOWN */ /* LINUX_CDROMSETSPINDOWN */ /* LINUX_CDROM_SET_OPTIONS */ /* LINUX_CDROM_CLEAR_OPTIONS */ /* LINUX_CDROM_SELECT_SPEED */ /* LINUX_CDROM_SELECT_DISC */ /* LINUX_CDROM_MEDIA_CHANGED */ /* LINUX_CDROM_DRIVE_STATUS */ /* LINUX_CDROM_DISC_STATUS */ /* LINUX_CDROM_CHANGER_NSLOTS */ /* LINUX_CDROM_LOCKDOOR */ /* LINUX_CDROM_DEBUG */ /* LINUX_CDROM_GET_CAPABILITY */ /* LINUX_CDROMAUDIOBUFSIZ */ case LINUX_DVD_READ_STRUCT: { l_dvd_struct lds; struct dvd_struct bds; error = copyin((caddr_t)args->arg, &lds, sizeof(l_dvd_struct)); if (error) - 
return (error); + break; error = linux_to_bsd_dvd_struct(&lds, &bds); if (error) - return (error); + break; error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)&bds, td); if (error) - return (error); + break; error = bsd_to_linux_dvd_struct(&bds, &lds); if (error) - return (error); - return (copyout(&lds, (caddr_t)args->arg, - sizeof(l_dvd_struct))); + break; + error = copyout(&lds, (caddr_t)args->arg, + sizeof(l_dvd_struct)); + break; } /* LINUX_DVD_WRITE_STRUCT */ case LINUX_DVD_AUTH: { l_dvd_authinfo lda; struct dvd_authinfo bda; int bcode; error = copyin((caddr_t)args->arg, &lda, sizeof(l_dvd_authinfo)); if (error) - return (error); + break; error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda); if (error) - return (error); + break; error = fo_ioctl(fp, bcode, (caddr_t)&bda, td); if (error) { if (lda.type == LINUX_DVD_HOST_SEND_KEY2) { lda.type = LINUX_DVD_AUTH_FAILURE; copyout(&lda, (caddr_t)args->arg, sizeof(l_dvd_authinfo)); } - return (error); + break; } error = bsd_to_linux_dvd_authinfo(&bda, &lda); if (error) - return (error); - return (copyout(&lda, (caddr_t)args->arg, - sizeof(l_dvd_authinfo))); + break; + error = copyout(&lda, (caddr_t)args->arg, + sizeof(l_dvd_authinfo)); + break; } /* LINUX_CDROM_SEND_PACKET */ /* LINUX_CDROM_NEXT_WRITABLE */ /* LINUX_CDROM_LAST_WRITTEN */ + default: + error = ENOIOCTL; + break; } - return (ENOIOCTL); + fdrop(fp, td); + return (error); } /* * Sound related ioctls */ static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT }; #define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30]) static int linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args) { switch (args->cmd & 0xffff) { case LINUX_SOUND_MIXER_WRITE_VOLUME: args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_BASS: args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_TREBLE: args->cmd = 
SETDIR(SOUND_MIXER_WRITE_TREBLE); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SYNTH: args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_PCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_SPEAKER: args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_MIC: args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_CD: args->cmd = SETDIR(SOUND_MIXER_WRITE_CD); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IMIX: args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_ALTPCM: args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECLEV: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_IGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_OGAIN: args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE1: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE2: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_LINE3: args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_OSS_GETVERSION: { int version = linux_get_oss_version(td->td_proc); return (copyout(&version, (caddr_t)args->arg, 
sizeof(int))); } case LINUX_SOUND_MIXER_READ_DEVMASK: args->cmd = SOUND_MIXER_READ_DEVMASK; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_MIXER_WRITE_RECSRC: args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC); return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_RESET: args->cmd = SNDCTL_DSP_RESET; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SYNC: args->cmd = SNDCTL_DSP_SYNC; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SPEED: args->cmd = SNDCTL_DSP_SPEED; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_STEREO: args->cmd = SNDCTL_DSP_STEREO; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */ args->cmd = SNDCTL_DSP_GETBLKSIZE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFMT: args->cmd = SNDCTL_DSP_SETFMT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_CHANNELS: args->cmd = SOUND_PCM_WRITE_CHANNELS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SOUND_PCM_WRITE_FILTER: args->cmd = SOUND_PCM_WRITE_FILTER; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_POST: args->cmd = SNDCTL_DSP_POST; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SUBDIVIDE: args->cmd = SNDCTL_DSP_SUBDIVIDE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETFRAGMENT: args->cmd = SNDCTL_DSP_SETFRAGMENT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETFMTS: args->cmd = SNDCTL_DSP_GETFMTS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOSPACE: args->cmd = SNDCTL_DSP_GETOSPACE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETISPACE: args->cmd = SNDCTL_DSP_GETISPACE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_NONBLOCK: args->cmd = SNDCTL_DSP_NONBLOCK; return (ioctl(td, (struct ioctl_args *)args)); case 
LINUX_SNDCTL_DSP_GETCAPS: args->cmd = SNDCTL_DSP_GETCAPS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */ args->cmd = SNDCTL_DSP_SETTRIGGER; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETIPTR: args->cmd = SNDCTL_DSP_GETIPTR; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETOPTR: args->cmd = SNDCTL_DSP_GETOPTR; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_DSP_GETODELAY: args->cmd = SNDCTL_DSP_GETODELAY; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESET: args->cmd = SNDCTL_SEQ_RESET; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_SYNC: args->cmd = SNDCTL_SEQ_SYNC; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_INFO: args->cmd = SNDCTL_SYNTH_INFO; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_CTRLRATE: args->cmd = SNDCTL_SEQ_CTRLRATE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETOUTCOUNT: args->cmd = SNDCTL_SEQ_GETOUTCOUNT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_GETINCOUNT: args->cmd = SNDCTL_SEQ_GETINCOUNT; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_PERCMODE: args->cmd = SNDCTL_SEQ_PERCMODE; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_FM_LOAD_INSTR: args->cmd = SNDCTL_FM_LOAD_INSTR; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TESTMIDI: args->cmd = SNDCTL_SEQ_TESTMIDI; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_RESETSAMPLES: args->cmd = SNDCTL_SEQ_RESETSAMPLES; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRSYNTHS: args->cmd = SNDCTL_SEQ_NRSYNTHS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_NRMIDIS: args->cmd = SNDCTL_SEQ_NRMIDIS; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_MIDI_INFO: args->cmd = SNDCTL_MIDI_INFO; return 
(ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SEQ_TRESHOLD: args->cmd = SNDCTL_SEQ_TRESHOLD; return (ioctl(td, (struct ioctl_args *)args)); case LINUX_SNDCTL_SYNTH_MEMAVL: args->cmd = SNDCTL_SYNTH_MEMAVL; return (ioctl(td, (struct ioctl_args *)args)); } return (ENOIOCTL); } /* * Console related ioctls */ #define ISSIGVALID(sig) ((sig) > 0 && (sig) < NSIG) static int linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args) { - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; + int error; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); switch (args->cmd & 0xffff) { case LINUX_KIOCSOUND: args->cmd = KIOCSOUND; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDMKTONE: args->cmd = KDMKTONE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDGETLED: args->cmd = KDGETLED; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDSETLED: args->cmd = KDSETLED; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDSETMODE: args->cmd = KDSETMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDGETMODE: args->cmd = KDGETMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDGKBMODE: args->cmd = KDGKBMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_KDSKBMODE: { int kbdmode; switch (args->arg) { case LINUX_KBD_RAW: kbdmode = K_RAW; break; case LINUX_KBD_XLATE: kbdmode = K_XLATE; break; case LINUX_KBD_MEDIUMRAW: kbdmode = K_RAW; break; default: + fdrop(fp, td); return (EINVAL); } - return (fo_ioctl(fp, KDSKBMODE, 
(caddr_t)&kbdmode, td)); + error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode, td)); + break; } case LINUX_VT_OPENQRY: args->cmd = VT_OPENQRY; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_GETMODE: args->cmd = VT_GETMODE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_SETMODE: { struct vt_mode *mode; args->cmd = VT_SETMODE; mode = (struct vt_mode *)args->arg; if (!ISSIGVALID(mode->frsig) && ISSIGVALID(mode->acqsig)) mode->frsig = mode->acqsig; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; } case LINUX_VT_GETSTATE: args->cmd = VT_GETACTIVE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_RELDISP: args->cmd = VT_RELDISP; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_ACTIVATE: args->cmd = VT_ACTIVATE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; case LINUX_VT_WAITACTIVE: args->cmd = VT_WAITACTIVE; - return (ioctl(td, (struct ioctl_args *)args)); + error = (ioctl(td, (struct ioctl_args *)args)); + break; + default: + error = ENOIOCTL; + break; } - return (ENOIOCTL); + fdrop(fp, td); + return (error); } /* * Criteria for interface name translation */ #define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER) /* * Translate a Linux interface name to a FreeBSD interface name, * and return the associated ifnet structure * bsdname and lxname need to be least IFNAMSIZ bytes long, but * can point to the same buffer. 
*/ static struct ifnet * ifname_linux_to_bsd(const char *lxname, char *bsdname) { struct ifnet *ifp; int len, unit; char *ep; int is_eth, index; for (len = 0; len < LINUX_IFNAMSIZ; ++len) if (!isalpha(lxname[len])) break; if (len == 0 || len == LINUX_IFNAMSIZ) return (NULL); unit = (int)strtoul(lxname + len, &ep, 10); if (ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ) return (NULL); index = 0; is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0; TAILQ_FOREACH(ifp, &ifnet, if_link) { /* * Allow Linux programs to use FreeBSD names. Don't presume * we never have an interface named "eth", so don't make * the test optional based on is_eth. */ if (ifp->if_unit == unit && ifp->if_name[len] == '\0' && strncmp(ifp->if_name, lxname, len) == 0) break; if (is_eth && IFP_IS_ETH(ifp) && unit == index++) break; } if (ifp != NULL) snprintf(bsdname, IFNAMSIZ, "%s%d", ifp->if_name, ifp->if_unit); return (ifp); } /* * Implement the SIOCGIFCONF ioctl */ static int linux_ifconf(struct thread *td, struct ifconf *uifc) { struct ifconf ifc; struct l_ifreq ifr; struct ifnet *ifp; struct iovec iov; struct uio uio; int error, ethno; error = copyin(uifc, &ifc, sizeof ifc); if (error != 0) return (error); /* much easier to use uiomove than keep track ourselves */ iov.iov_base = ifc.ifc_buf; iov.iov_len = ifc.ifc_len; uio.uio_iov = &iov; uio.uio_iovcnt = 1; uio.uio_offset = 0; uio.uio_resid = ifc.ifc_len; uio.uio_segflg = UIO_USERSPACE; uio.uio_rw = UIO_READ; uio.uio_td = td; /* Keep track of eth interfaces */ ethno = 0; /* return interface names but no addresses. 
*/ TAILQ_FOREACH(ifp, &ifnet, if_link) { if (uio.uio_resid <= 0) break; bzero(&ifr, sizeof ifr); if (IFP_IS_ETH(ifp)) snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d", ethno++); else snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "%s%d", ifp->if_name, ifp->if_unit); error = uiomove((caddr_t)&ifr, sizeof ifr, &uio); if (error != 0) return (error); } ifc.ifc_len -= uio.uio_resid; error = copyout(&ifc, uifc, sizeof ifc); return (error); } static int linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr) { l_short flags; flags = ifp->if_flags; /* these flags have no Linux equivalent */ flags &= ~(IFF_SMART|IFF_OACTIVE|IFF_SIMPLEX| IFF_LINK0|IFF_LINK1|IFF_LINK2); /* Linux' multicast flag is in a different bit */ if (flags & IFF_MULTICAST) { flags &= ~IFF_MULTICAST; flags |= 0x1000; } return (copyout(&flags, &ifr->ifr_flags, sizeof flags)); } #define ARPHRD_ETHER 1 #define ARPHRD_LOOPBACK 772 static int linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr) { struct ifaddr *ifa; struct sockaddr_dl *sdl; struct l_sockaddr lsa; if (ifp->if_type == IFT_LOOP) { bzero(&lsa, sizeof lsa); lsa.sa_family = ARPHRD_LOOPBACK; return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof lsa)); } if (ifp->if_type != IFT_ETHER) return (ENOENT); TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { sdl = (struct sockaddr_dl*)ifa->ifa_addr; if (sdl != NULL && (sdl->sdl_family == AF_LINK) && (sdl->sdl_type == IFT_ETHER)) { bzero(&lsa, sizeof lsa); lsa.sa_family = ARPHRD_ETHER; bcopy(LLADDR(sdl), lsa.sa_data, LINUX_IFHWADDRLEN); return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof lsa)); } } return (ENOENT); } /* * Socket related ioctls */ static int linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args) { char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ]; struct ifnet *ifp; struct file *fp; int error, type; KASSERT(LINUX_IFNAMSIZ == IFNAMSIZ, ("%s(): LINUX_IFNAMSIZ != IFNAMSIZ", __func__)); ifp = NULL; error = 0; mtx_lock(&Giant); if ((error = fget(td, args->fd, &fp)) != 0) { 
mtx_unlock(&Giant); return (error); } type = fp->f_type; fdrop(fp, td); mtx_unlock(&Giant); if (type != DTYPE_SOCKET) { /* not a socket - probably a tap / vmnet device */ switch (args->cmd) { case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFFLAGS: return (linux_ioctl_special(td, args)); default: return (ENOIOCTL); } } switch (args->cmd & 0xffff) { case LINUX_FIOGETOWN: case LINUX_FIOSETOWN: case LINUX_SIOCADDMULTI: case LINUX_SIOCATMARK: case LINUX_SIOCDELMULTI: case LINUX_SIOCGIFCONF: case LINUX_SIOCGPGRP: case LINUX_SIOCSPGRP: /* these ioctls don't take an interface name */ #ifdef DEBUG printf("%s(): ioctl %d\n", __func__, args->cmd & 0xffff); #endif break; case LINUX_SIOCGIFFLAGS: case LINUX_SIOCGIFADDR: case LINUX_SIOCSIFADDR: case LINUX_SIOCGIFDSTADDR: case LINUX_SIOCGIFBRDADDR: case LINUX_SIOCGIFNETMASK: case LINUX_SIOCSIFNETMASK: case LINUX_SIOCGIFMTU: case LINUX_SIOCSIFMTU: case LINUX_SIOCSIFNAME: case LINUX_SIOCGIFHWADDR: case LINUX_SIOCSIFHWADDR: case LINUX_SIOCDEVPRIVATE: case LINUX_SIOCDEVPRIVATE+1: /* copy in the interface name and translate it. */ error = copyin((char *)args->arg, lifname, LINUX_IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): ioctl %d on %.*s\n", __func__, args->cmd & 0xffff, LINUX_IFNAMSIZ, lifname); #endif ifp = ifname_linux_to_bsd(lifname, ifname); if (ifp == NULL) return (EINVAL); /* * We need to copy it back out in case we pass the * request on to our native ioctl(), which will expect * the ifreq to be in user space and have the correct * interface name. 
*/ error = copyout(ifname, (char *)args->arg, IFNAMSIZ); if (error != 0) return (error); #ifdef DEBUG printf("%s(): %s translated to %s\n", __func__, lifname, ifname); #endif break; default: return (ENOIOCTL); } switch (args->cmd & 0xffff) { case LINUX_FIOSETOWN: args->cmd = FIOSETOWN; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSPGRP: args->cmd = SIOCSPGRP; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_FIOGETOWN: args->cmd = FIOGETOWN; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGPGRP: args->cmd = SIOCGPGRP; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCATMARK: args->cmd = SIOCATMARK; error = ioctl(td, (struct ioctl_args *)args); break; /* LINUX_SIOCGSTAMP */ case LINUX_SIOCGIFCONF: error = linux_ifconf(td, (struct ifconf *)args->arg); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCGIFADDR: args->cmd = OSIOCGIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFADDR: /* XXX probably doesn't work, included for completeness */ args->cmd = SIOCSIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFDSTADDR: args->cmd = OSIOCGIFDSTADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFBRDADDR: args->cmd = OSIOCGIFBRDADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFNETMASK: args->cmd = OSIOCGIFNETMASK; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFNETMASK: error = ENOIOCTL; break; case LINUX_SIOCGIFMTU: args->cmd = SIOCGIFMTU; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFMTU: args->cmd = SIOCSIFMTU; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFNAME: error = ENOIOCTL; break; case LINUX_SIOCGIFHWADDR: error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg); break; case LINUX_SIOCSIFHWADDR: error = ENOIOCTL; break; 
case LINUX_SIOCADDMULTI: args->cmd = SIOCADDMULTI; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDELMULTI: args->cmd = SIOCDELMULTI; error = ioctl(td, (struct ioctl_args *)args); break; /* * XXX This is slightly bogus, but these ioctls are currently * XXX only used by the aironet (if_an) network driver. */ case LINUX_SIOCDEVPRIVATE: args->cmd = SIOCGPRIVATE_0; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCDEVPRIVATE+1: args->cmd = SIOCGPRIVATE_1; error = ioctl(td, (struct ioctl_args *)args); break; } if (ifp != NULL) /* restore the original interface name */ copyout(lifname, (char *)args->arg, LINUX_IFNAMSIZ); #ifdef DEBUG printf("%s(): returning %d\n", __func__, error); #endif return (error); } /* * Device private ioctl handler */ static int linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; int error, type; mtx_lock(&Giant); if ((error = fget(td, args->fd, &fp)) != 0) { mtx_unlock(&Giant); return (error); } type = fp->f_type; fdrop(fp, td); mtx_unlock(&Giant); if (type == DTYPE_SOCKET) return (linux_ioctl_socket(td, args)); return (ENOIOCTL); } /* * Special ioctl handler */ static int linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args) { int error; switch (args->cmd) { case LINUX_SIOCGIFADDR: args->cmd = SIOCGIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCSIFADDR: args->cmd = SIOCSIFADDR; error = ioctl(td, (struct ioctl_args *)args); break; case LINUX_SIOCGIFFLAGS: args->cmd = SIOCGIFFLAGS; error = ioctl(td, (struct ioctl_args *)args); break; default: error = ENOIOCTL; } return (error); } /* * main ioctl syscall function */ int linux_ioctl(struct thread *td, struct linux_ioctl_args *args) { - struct filedesc *fdp; struct file *fp; struct handler_element *he; int error, cmd; #ifdef DEBUG if (ldebug(ioctl)) printf(ARGS(ioctl, "%d, %04lx, *"), args->fd, (unsigned long)args->cmd); #endif - fdp = td->td_proc->p_fd; - if ((unsigned)args->fd 
>= fdp->fd_nfiles) + fp = ffind_hold(td, args->fd); + if (fp == NULL) return (EBADF); - fp = fdp->fd_ofiles[args->fd]; - if (fp == NULL || (fp->f_flag & (FREAD|FWRITE)) == 0) + if ((fp->f_flag & (FREAD|FWRITE)) == 0) { + fdrop(fp, td); return (EBADF); + } /* Iterate over the ioctl handlers */ cmd = args->cmd & 0xffff; TAILQ_FOREACH(he, &handlers, list) { if (cmd >= he->low && cmd <= he->high) { error = (*he->func)(td, args); if (error != ENOIOCTL) + fdrop(fp, td); return (error); } } + fdrop(fp, td); printf("linux: 'ioctl' fd=%d, cmd=0x%x ('%c',%d) not implemented\n", args->fd, (int)(args->cmd & 0xffff), (int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff)); return (EINVAL); } int linux_ioctl_register_handler(struct linux_ioctl_handler *h) { struct handler_element *he, *cur; if (h == NULL || h->func == NULL) return (EINVAL); /* * Reuse the element if the handler is already on the list, otherwise * create a new element. */ TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) break; } if (he == NULL) { MALLOC(he, struct handler_element *, sizeof(*he), M_LINUX, M_WAITOK); he->func = h->func; } else TAILQ_REMOVE(&handlers, he, list); /* Initialize range information. */ he->low = h->low; he->high = h->high; he->span = h->high - h->low + 1; /* Add the element to the list, sorted on span. 
*/ TAILQ_FOREACH(cur, &handlers, list) { if (cur->span > he->span) { TAILQ_INSERT_BEFORE(cur, he, list); return (0); } } TAILQ_INSERT_TAIL(&handlers, he, list); return (0); } int linux_ioctl_unregister_handler(struct linux_ioctl_handler *h) { struct handler_element *he; if (h == NULL || h->func == NULL) return (EINVAL); TAILQ_FOREACH(he, &handlers, list) { if (he->func == h->func) { TAILQ_REMOVE(&handlers, he, list); FREE(he, M_LINUX); return (0); } } return (EINVAL); } Index: head/sys/compat/linux/linux_stats.c =================================================================== --- head/sys/compat/linux/linux_stats.c (revision 89305) +++ head/sys/compat/linux/linux_stats.c (revision 89306) @@ -1,477 +1,480 @@ /*- * Copyright (c) 1994-1995 Søren Schmidt * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer * in this position and unchanged. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software withough specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int newstat_copyout(struct stat *buf, void *ubuf) { struct l_newstat tbuf; struct cdevsw *cdevsw; dev_t dev; tbuf.st_dev = uminor(buf->st_dev) | (umajor(buf->st_dev) << 8); tbuf.st_ino = buf->st_ino; tbuf.st_mode = buf->st_mode; tbuf.st_nlink = buf->st_nlink; tbuf.st_uid = buf->st_uid; tbuf.st_gid = buf->st_gid; tbuf.st_rdev = buf->st_rdev; tbuf.st_size = buf->st_size; tbuf.st_atime = buf->st_atime; tbuf.st_mtime = buf->st_mtime; tbuf.st_ctime = buf->st_ctime; tbuf.st_blksize = buf->st_blksize; tbuf.st_blocks = buf->st_blocks; /* Lie about disk drives which are character devices * in FreeBSD but block devices under Linux. 
*/ if (S_ISCHR(tbuf.st_mode) && (dev = udev2dev(buf->st_rdev, 0)) != NODEV) { cdevsw = devsw(dev); if (cdevsw != NULL && (cdevsw->d_flags & D_DISK)) { tbuf.st_mode &= ~S_IFMT; tbuf.st_mode |= S_IFBLK; /* XXX this may not be quite right */ /* Map major number to 0 */ tbuf.st_dev = uminor(buf->st_dev) & 0xf; tbuf.st_rdev = buf->st_rdev & 0xff; } } return (copyout(&tbuf, ubuf, sizeof(tbuf))); } int linux_newstat(struct thread *td, struct linux_newstat_args *args) { struct stat buf; struct nameidata nd; int error; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(newstat)) printf(ARGS(newstat, "%s, *"), args->path); #endif NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &buf, td); vput(nd.ni_vp); if (error) return (error); return (newstat_copyout(&buf, args->buf)); } int linux_newlstat(struct thread *td, struct linux_newlstat_args *args) { int error; struct stat sb; struct nameidata nd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(newlstat)) printf(ARGS(newlstat, "%s, *"), args->path); #endif NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->path, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); return (newstat_copyout(&sb, args->buf)); } int linux_newfstat(struct thread *td, struct linux_newfstat_args *args) { - struct filedesc *fdp; struct file *fp; struct stat buf; int error; #ifdef DEBUG if (ldebug(newfstat)) printf(ARGS(newfstat, "%d, *"), args->fd); #endif - fdp = td->td_proc->p_fd; - if ((unsigned)args->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[args->fd]) == NULL) + fp = ffind_hold(td, args->fd); + if (fp == NULL) return (EBADF); error = fo_stat(fp, &buf, td); + fdrop(fp, td); if (!error) error = 
newstat_copyout(&buf, args->buf); return (error); } /* XXX - All fields of type l_int are defined as l_long on i386 */ struct l_statfs { l_int f_type; l_int f_bsize; l_int f_blocks; l_int f_bfree; l_int f_bavail; l_int f_files; l_int f_ffree; l_fsid_t f_fsid; l_int f_namelen; l_int f_spare[6]; }; #define LINUX_CODA_SUPER_MAGIC 0x73757245L #define LINUX_EXT2_SUPER_MAGIC 0xEF53L #define LINUX_HPFS_SUPER_MAGIC 0xf995e849L #define LINUX_ISOFS_SUPER_MAGIC 0x9660L #define LINUX_MSDOS_SUPER_MAGIC 0x4d44L #define LINUX_NCP_SUPER_MAGIC 0x564cL #define LINUX_NFS_SUPER_MAGIC 0x6969L #define LINUX_NTFS_SUPER_MAGIC 0x5346544EL #define LINUX_PROC_SUPER_MAGIC 0x9fa0L #define LINUX_UFS_SUPER_MAGIC 0x00011954L /* XXX - UFS_MAGIC in Linux */ static long bsd_to_linux_ftype(const char *fstypename) { int i; static struct {const char *bsd_name; long linux_type;} b2l_tbl[] = { {"ufs", LINUX_UFS_SUPER_MAGIC}, {"cd9660", LINUX_ISOFS_SUPER_MAGIC}, {"nfs", LINUX_NFS_SUPER_MAGIC}, {"ext2fs", LINUX_EXT2_SUPER_MAGIC}, {"procfs", LINUX_PROC_SUPER_MAGIC}, {"msdosfs", LINUX_MSDOS_SUPER_MAGIC}, {"ntfs", LINUX_NTFS_SUPER_MAGIC}, {"nwfs", LINUX_NCP_SUPER_MAGIC}, {"hpfs", LINUX_HPFS_SUPER_MAGIC}, {"coda", LINUX_CODA_SUPER_MAGIC}, {NULL, 0L}}; for (i = 0; b2l_tbl[i].bsd_name != NULL; i++) if (strcmp(b2l_tbl[i].bsd_name, fstypename) == 0) return (b2l_tbl[i].linux_type); return (0L); } int linux_statfs(struct thread *td, struct linux_statfs_args *args) { struct mount *mp; struct nameidata *ndp; struct statfs *bsd_statfs; struct nameidata nd; struct l_statfs linux_statfs; int error; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->path); #ifdef DEBUG if (ldebug(statfs)) printf(ARGS(statfs, "%s, *"), args->path); #endif ndp = &nd; NDINIT(ndp, LOOKUP, FOLLOW, UIO_USERSPACE, args->path, curthread); error = namei(ndp); if (error) return error; NDFREE(ndp, NDF_ONLY_PNBUF); mp = ndp->ni_vp->v_mount; bsd_statfs = &mp->mnt_stat; vrele(ndp->ni_vp); error = VFS_STATFS(mp, bsd_statfs, td); if (error) 
return error; bsd_statfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; linux_statfs.f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs.f_bsize = bsd_statfs->f_bsize; linux_statfs.f_blocks = bsd_statfs->f_blocks; linux_statfs.f_bfree = bsd_statfs->f_bfree; linux_statfs.f_bavail = bsd_statfs->f_bavail; linux_statfs.f_ffree = bsd_statfs->f_ffree; linux_statfs.f_files = bsd_statfs->f_files; linux_statfs.f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs.f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs.f_namelen = MAXNAMLEN; return copyout((caddr_t)&linux_statfs, (caddr_t)args->buf, sizeof(linux_statfs)); } int linux_fstatfs(struct thread *td, struct linux_fstatfs_args *args) { struct file *fp; struct mount *mp; struct statfs *bsd_statfs; struct l_statfs linux_statfs; int error; #ifdef DEBUG if (ldebug(fstatfs)) printf(ARGS(fstatfs, "%d, *"), args->fd); #endif error = getvnode(td->td_proc->p_fd, args->fd, &fp); if (error) return error; mp = ((struct vnode *)fp->f_data)->v_mount; bsd_statfs = &mp->mnt_stat; error = VFS_STATFS(mp, bsd_statfs, td); - if (error) + if (error) { + fdrop(fp, td); return error; + } bsd_statfs->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; linux_statfs.f_type = bsd_to_linux_ftype(bsd_statfs->f_fstypename); linux_statfs.f_bsize = bsd_statfs->f_bsize; linux_statfs.f_blocks = bsd_statfs->f_blocks; linux_statfs.f_bfree = bsd_statfs->f_bfree; linux_statfs.f_bavail = bsd_statfs->f_bavail; linux_statfs.f_ffree = bsd_statfs->f_ffree; linux_statfs.f_files = bsd_statfs->f_files; linux_statfs.f_fsid.val[0] = bsd_statfs->f_fsid.val[0]; linux_statfs.f_fsid.val[1] = bsd_statfs->f_fsid.val[1]; linux_statfs.f_namelen = MAXNAMLEN; - return copyout((caddr_t)&linux_statfs, (caddr_t)args->buf, + error = copyout((caddr_t)&linux_statfs, (caddr_t)args->buf, sizeof(linux_statfs)); + fdrop(fp, td); + return error; } struct l_ustat { l_daddr_t f_tfree; l_ino_t f_tinode; char f_fname[6]; char f_fpack[6]; }; int linux_ustat(struct thread *td, struct 
linux_ustat_args *args) { struct l_ustat lu; dev_t dev; struct vnode *vp; struct statfs *stat; int error; #ifdef DEBUG if (ldebug(ustat)) printf(ARGS(ustat, "%d, *"), args->dev); #endif /* * lu.f_fname and lu.f_fpack are not used. They are always zeroed. * lu.f_tinode and lu.f_tfree are set from the device's super block. */ bzero(&lu, sizeof(lu)); /* * XXX - Don't return an error if we can't find a vnode for the * device. Our dev_t is 32-bits whereas Linux only has a 16-bits * dev_t. The dev_t that is used now may as well be a truncated * dev_t returned from previous syscalls. Just return a bzeroed * ustat in that case. */ dev = makedev(args->dev >> 8, args->dev & 0xFF); if (vfinddev(dev, VCHR, &vp)) { if (vp->v_mount == NULL) return (EINVAL); stat = &(vp->v_mount->mnt_stat); error = VFS_STATFS(vp->v_mount, stat, td); if (error) return (error); lu.f_tfree = stat->f_bfree; lu.f_tinode = stat->f_ffree; } return (copyout(&lu, args->ubuf, sizeof(lu))); } #if defined(__i386__) static int stat64_copyout(struct stat *buf, void *ubuf) { struct l_stat64 lbuf; bzero(&lbuf, sizeof(lbuf)); lbuf.st_dev = uminor(buf->st_dev) | (umajor(buf->st_dev) << 8); lbuf.st_ino = buf->st_ino; lbuf.st_mode = buf->st_mode; lbuf.st_nlink = buf->st_nlink; lbuf.st_uid = buf->st_uid; lbuf.st_gid = buf->st_gid; lbuf.st_rdev = buf->st_rdev; lbuf.st_size = buf->st_size; lbuf.st_atime = buf->st_atime; lbuf.st_mtime = buf->st_mtime; lbuf.st_ctime = buf->st_ctime; lbuf.st_blksize = buf->st_blksize; lbuf.st_blocks = buf->st_blocks; /* * The __st_ino field makes all the difference. In the Linux kernel * it is conditionally compiled based on STAT64_HAS_BROKEN_ST_INO, * but without the assignment to __st_ino the runtime linker refuses * to mmap(2) any shared libraries. 
I guess it's broken alright :-) */ lbuf.__st_ino = buf->st_ino; return (copyout(&lbuf, ubuf, sizeof(lbuf))); } int linux_stat64(struct thread *td, struct linux_stat64_args *args) { struct stat buf; struct nameidata nd; int error; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->filename); #ifdef DEBUG if (ldebug(stat64)) printf(ARGS(stat64, "%s, *"), args->filename); #endif NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->filename, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &buf, td); vput(nd.ni_vp); if (error) return (error); return (stat64_copyout(&buf, args->statbuf)); } int linux_lstat64(struct thread *td, struct linux_lstat64_args *args) { int error; struct stat sb; struct nameidata nd; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, args->filename); #ifdef DEBUG if (ldebug(lstat64)) printf(ARGS(lstat64, "%s, *"), args->filename); #endif NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, args->filename, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); return (stat64_copyout(&sb, args->statbuf)); } int linux_fstat64(struct thread *td, struct linux_fstat64_args *args) { struct filedesc *fdp; struct file *fp; struct stat buf; int error; #ifdef DEBUG if (ldebug(fstat64)) printf(ARGS(fstat64, "%d, *"), args->fd); #endif fdp = td->td_proc->p_fd; if ((unsigned)args->fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[args->fd]) == NULL) return (EBADF); error = fo_stat(fp, &buf, td); if (!error) error = stat64_copyout(&buf, args->statbuf); return (error); } #endif /* __i386__ */ Index: head/sys/compat/svr4/svr4_fcntl.c =================================================================== --- head/sys/compat/svr4/svr4_fcntl.c (revision 89305) +++ head/sys/compat/svr4/svr4_fcntl.c (revision 89306) @@ -1,726 +1,745 @@ /* * Copyright (c) 1998 Mark Newton 
* Copyright (c) 1994, 1997 Christos Zoulas. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include /*#include */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int svr4_to_bsd_flags __P((int)); static u_long svr4_to_bsd_cmd __P((u_long)); static int fd_revoke __P((struct thread *, int)); static int fd_truncate __P((struct thread *, int, struct flock *)); static int bsd_to_svr4_flags __P((int)); static void bsd_to_svr4_flock __P((struct flock *, struct svr4_flock *)); static void svr4_to_bsd_flock __P((struct svr4_flock *, struct flock *)); static void bsd_to_svr4_flock64 __P((struct flock *, struct svr4_flock64 *)); static void svr4_to_bsd_flock64 __P((struct svr4_flock64 *, struct flock *)); static u_long svr4_to_bsd_cmd(cmd) u_long cmd; { switch (cmd) { case SVR4_F_DUPFD: return F_DUPFD; case SVR4_F_GETFD: return F_GETFD; case SVR4_F_SETFD: return F_SETFD; case SVR4_F_GETFL: return F_GETFL; case SVR4_F_SETFL: return F_SETFL; case SVR4_F_GETLK: return F_GETLK; case SVR4_F_SETLK: return F_SETLK; case SVR4_F_SETLKW: return F_SETLKW; default: return -1; } } static int svr4_to_bsd_flags(l) int l; { int r = 0; r |= (l & SVR4_O_RDONLY) ? O_RDONLY : 0; r |= (l & SVR4_O_WRONLY) ? O_WRONLY : 0; r |= (l & SVR4_O_RDWR) ? O_RDWR : 0; r |= (l & SVR4_O_NDELAY) ? O_NONBLOCK : 0; r |= (l & SVR4_O_APPEND) ? O_APPEND : 0; r |= (l & SVR4_O_SYNC) ? O_FSYNC : 0; r |= (l & SVR4_O_NONBLOCK) ? O_NONBLOCK : 0; r |= (l & SVR4_O_PRIV) ? O_EXLOCK : 0; r |= (l & SVR4_O_CREAT) ? O_CREAT : 0; r |= (l & SVR4_O_TRUNC) ? O_TRUNC : 0; r |= (l & SVR4_O_EXCL) ? O_EXCL : 0; r |= (l & SVR4_O_NOCTTY) ? O_NOCTTY : 0; return r; } static int bsd_to_svr4_flags(l) int l; { int r = 0; r |= (l & O_RDONLY) ? SVR4_O_RDONLY : 0; r |= (l & O_WRONLY) ? SVR4_O_WRONLY : 0; r |= (l & O_RDWR) ? SVR4_O_RDWR : 0; r |= (l & O_NDELAY) ? SVR4_O_NONBLOCK : 0; r |= (l & O_APPEND) ? SVR4_O_APPEND : 0; r |= (l & O_FSYNC) ? SVR4_O_SYNC : 0; r |= (l & O_NONBLOCK) ? 
SVR4_O_NONBLOCK : 0; r |= (l & O_EXLOCK) ? SVR4_O_PRIV : 0; r |= (l & O_CREAT) ? SVR4_O_CREAT : 0; r |= (l & O_TRUNC) ? SVR4_O_TRUNC : 0; r |= (l & O_EXCL) ? SVR4_O_EXCL : 0; r |= (l & O_NOCTTY) ? SVR4_O_NOCTTY : 0; return r; } static void bsd_to_svr4_flock(iflp, oflp) struct flock *iflp; struct svr4_flock *oflp; { switch (iflp->l_type) { case F_RDLCK: oflp->l_type = SVR4_F_RDLCK; break; case F_WRLCK: oflp->l_type = SVR4_F_WRLCK; break; case F_UNLCK: oflp->l_type = SVR4_F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off_t) iflp->l_start; oflp->l_len = (svr4_off_t) iflp->l_len; oflp->l_sysid = 0; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } static void svr4_to_bsd_flock(iflp, oflp) struct svr4_flock *iflp; struct flock *oflp; { switch (iflp->l_type) { case SVR4_F_RDLCK: oflp->l_type = F_RDLCK; break; case SVR4_F_WRLCK: oflp->l_type = F_WRLCK; break; case SVR4_F_UNLCK: oflp->l_type = F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = iflp->l_whence; oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; } static void bsd_to_svr4_flock64(iflp, oflp) struct flock *iflp; struct svr4_flock64 *oflp; { switch (iflp->l_type) { case F_RDLCK: oflp->l_type = SVR4_F_RDLCK; break; case F_WRLCK: oflp->l_type = SVR4_F_WRLCK; break; case F_UNLCK: oflp->l_type = SVR4_F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = (short) iflp->l_whence; oflp->l_start = (svr4_off64_t) iflp->l_start; oflp->l_len = (svr4_off64_t) iflp->l_len; oflp->l_sysid = 0; oflp->l_pid = (svr4_pid_t) iflp->l_pid; } static void svr4_to_bsd_flock64(iflp, oflp) struct svr4_flock64 *iflp; struct flock *oflp; { switch (iflp->l_type) { case SVR4_F_RDLCK: oflp->l_type = F_RDLCK; break; case SVR4_F_WRLCK: oflp->l_type = F_WRLCK; break; case SVR4_F_UNLCK: oflp->l_type = F_UNLCK; break; default: oflp->l_type = -1; break; } oflp->l_whence = iflp->l_whence; 
oflp->l_start = (off_t) iflp->l_start; oflp->l_len = (off_t) iflp->l_len; oflp->l_pid = (pid_t) iflp->l_pid; } static int fd_revoke(td, fd) struct thread *td; int fd; { - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; struct vnode *vp; struct mount *mp; struct vattr vattr; int error, *retval; retval = td->td_retval; - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + fp = ffind_hold(td, fd); + if (fp == NULL) return EBADF; - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); return EINVAL; + } vp = (struct vnode *) fp->f_data; if (vp->v_type != VCHR && vp->v_type != VBLK) { error = EINVAL; goto out; } if ((error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td)) != 0) goto out; if (td->td_proc->p_ucred->cr_uid != vattr.va_uid && (error = suser_td(td)) != 0) goto out; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); + fdrop(fp, td); return error; } static int fd_truncate(td, fd, flp) struct thread *td; int fd; struct flock *flp; { - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; off_t start, length; struct vnode *vp; struct vattr vattr; int error, *retval; struct ftruncate_args ft; retval = td->td_retval; /* * We only support truncating the file. 
*/ - if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + fp = ffind_hold(td, fd); + if (fp == NULL) return EBADF; vp = (struct vnode *)fp->f_data; - if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) + if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) { + fdrop(fp, td); return ESPIPE; + } if ((error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td)) != 0) + fdrop(fp, td); return error; + } length = vattr.va_size; switch (flp->l_whence) { case SEEK_CUR: start = fp->f_offset + flp->l_start; break; case SEEK_END: start = flp->l_start + length; break; case SEEK_SET: start = flp->l_start; break; default: + fdrop(fp, td); return EINVAL; } if (start + flp->l_len < length) { /* We don't support free'ing in the middle of the file */ return EINVAL; } SCARG(&ft, fd) = fd; SCARG(&ft, length) = start; - return ftruncate(td, &ft); + error = ftruncate(td, &ft); + + fdrop(fp, td); + return (error); } int svr4_sys_open(td, uap) register struct thread *td; struct svr4_sys_open_args *uap; { struct proc *p = td->td_proc; int error, retval; struct open_args cup; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); (&cup)->path = uap->path; (&cup)->flags = svr4_to_bsd_flags(uap->flags); (&cup)->mode = uap->mode; error = open(td, &cup); if (error) { /* uprintf("svr4_open(%s, 0x%0x, 0%o): %d\n", uap->path, uap->flags, uap->mode, error);*/ return error; } retval = td->td_retval[0]; PROC_LOCK(p); if (!(SCARG(&cup, flags) & O_NOCTTY) && SESS_LEADER(p) && !(td->td_proc->p_flag & P_CONTROLT)) { #if defined(NOTYET) - struct filedesc *fdp = td->td_proc->p_fd; - struct file *fp = fdp->fd_ofiles[retval]; + struct file *fp; + fp = ffind_hold(td, retval); PROC_UNLOCK(p); + /* + * we may have lost a race the above open() and + * another thread issuing a close() + */ + if (fp == NULL) + return (EBADF); /* XXX: correct errno? 
*/ /* ignore any error, just give it a try */ if (fp->f_type == DTYPE_VNODE) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td); - } else + fdrop(fp, td); + } else { PROC_UNLOCK(p); + } #else } PROC_UNLOCK(p); #endif return error; } int svr4_sys_open64(td, uap) register struct thread *td; struct svr4_sys_open64_args *uap; { return svr4_sys_open(td, (struct svr4_sys_open_args *)uap); } int svr4_sys_creat(td, uap) register struct thread *td; struct svr4_sys_creat_args *uap; { struct open_args cup; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, mode) = SCARG(uap, mode); SCARG(&cup, flags) = O_WRONLY | O_CREAT | O_TRUNC; return open(td, &cup); } int svr4_sys_creat64(td, uap) register struct thread *td; struct svr4_sys_creat64_args *uap; { return svr4_sys_creat(td, (struct svr4_sys_creat_args *)uap); } int svr4_sys_llseek(td, uap) register struct thread *td; struct svr4_sys_llseek_args *uap; { struct lseek_args ap; SCARG(&ap, fd) = SCARG(uap, fd); #if BYTE_ORDER == BIG_ENDIAN SCARG(&ap, offset) = (((u_int64_t) SCARG(uap, offset1)) << 32) | SCARG(uap, offset2); #else SCARG(&ap, offset) = (((u_int64_t) SCARG(uap, offset2)) << 32) | SCARG(uap, offset1); #endif SCARG(&ap, whence) = SCARG(uap, whence); return lseek(td, &ap); } int svr4_sys_access(td, uap) register struct thread *td; struct svr4_sys_access_args *uap; { struct access_args cup; int *retval; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); retval = td->td_retval; SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, flags) = SCARG(uap, flags); return access(td, &cup); } #if defined(NOTYET) int svr4_sys_pread(td, uap) register struct thread *td; struct svr4_sys_pread_args *uap; { struct pread_args pra; /* * Just translate the args structure and call the NetBSD * pread(2) system call (offset type is 64-bit in NetBSD). 
*/ SCARG(&pra, fd) = SCARG(uap, fd); SCARG(&pra, buf) = SCARG(uap, buf); SCARG(&pra, nbyte) = SCARG(uap, nbyte); SCARG(&pra, offset) = SCARG(uap, off); return pread(td, &pra); } #endif #if defined(NOTYET) int svr4_sys_pread64(td, v, retval) register struct thread *td; void *v; register_t *retval; { struct svr4_sys_pread64_args *uap = v; struct sys_pread_args pra; /* * Just translate the args structure and call the NetBSD * pread(2) system call (offset type is 64-bit in NetBSD). */ SCARG(&pra, fd) = SCARG(uap, fd); SCARG(&pra, buf) = SCARG(uap, buf); SCARG(&pra, nbyte) = SCARG(uap, nbyte); SCARG(&pra, offset) = SCARG(uap, off); return (sys_pread(td, &pra, retval)); } #endif /* NOTYET */ #if defined(NOTYET) int svr4_sys_pwrite(td, uap) register struct thread *td; struct svr4_sys_pwrite_args *uap; { struct pwrite_args pwa; /* * Just translate the args structure and call the NetBSD * pwrite(2) system call (offset type is 64-bit in NetBSD). */ SCARG(&pwa, fd) = SCARG(uap, fd); SCARG(&pwa, buf) = SCARG(uap, buf); SCARG(&pwa, nbyte) = SCARG(uap, nbyte); SCARG(&pwa, offset) = SCARG(uap, off); return pwrite(td, &pwa); } #endif #if defined(NOTYET) int svr4_sys_pwrite64(td, v, retval) register struct thread *td; void *v; register_t *retval; { struct svr4_sys_pwrite64_args *uap = v; struct sys_pwrite_args pwa; /* * Just translate the args structure and call the NetBSD * pwrite(2) system call (offset type is 64-bit in NetBSD). 
*/ SCARG(&pwa, fd) = SCARG(uap, fd); SCARG(&pwa, buf) = SCARG(uap, buf); SCARG(&pwa, nbyte) = SCARG(uap, nbyte); SCARG(&pwa, offset) = SCARG(uap, off); return (sys_pwrite(td, &pwa, retval)); } #endif /* NOTYET */ int svr4_sys_fcntl(td, uap) register struct thread *td; struct svr4_sys_fcntl_args *uap; { int error; struct fcntl_args fa; int *retval; retval = td->td_retval; SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = svr4_to_bsd_cmd(SCARG(uap, cmd)); switch (SCARG(&fa, cmd)) { case F_DUPFD: case F_GETFD: case F_SETFD: SCARG(&fa, arg) = (long) SCARG(uap, arg); return fcntl(td, &fa); case F_GETFL: SCARG(&fa, arg) = (long) SCARG(uap, arg); error = fcntl(td, &fa); if (error) return error; *retval = bsd_to_svr4_flags(*retval); return error; case F_SETFL: { /* * we must save the O_ASYNC flag, as that is * handled by ioctl(_, I_SETSIG, _) emulation. */ long cmd; int flags; DPRINTF(("Setting flags %p\n", SCARG(uap, arg))); cmd = SCARG(&fa, cmd); /* save it for a while */ SCARG(&fa, cmd) = F_GETFL; if ((error = fcntl(td, &fa)) != 0) return error; flags = *retval; flags &= O_ASYNC; flags |= svr4_to_bsd_flags((u_long) SCARG(uap, arg)); SCARG(&fa, cmd) = cmd; SCARG(&fa, arg) = (long) flags; return fcntl(td, &fa); } case F_GETLK: case F_SETLK: case F_SETLKW: { struct svr4_flock ifl; struct flock *flp, fl; caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(struct flock)); SCARG(&fa, arg) = (long) flp; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock(&ifl, &fl); error = copyout(&fl, flp, sizeof fl); if (error) return error; error = fcntl(td, &fa); if (error || SCARG(&fa, cmd) != F_GETLK) return error; error = copyin(flp, &fl, sizeof fl); if (error) return error; bsd_to_svr4_flock(&fl, &ifl); return copyout(&ifl, SCARG(uap, arg), sizeof ifl); } case -1: switch (SCARG(uap, cmd)) { case SVR4_F_DUP2FD: { struct dup2_args du; SCARG(&du, from) = SCARG(uap, fd); SCARG(&du, to) = (int)SCARG(uap, arg); error = dup2(td, &du); if 
(error) return error; *retval = SCARG(&du, to); return 0; } case SVR4_F_FREESP: { struct svr4_flock ifl; struct flock fl; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock(&ifl, &fl); return fd_truncate(td, SCARG(uap, fd), &fl); } case SVR4_F_GETLK64: case SVR4_F_SETLK64: case SVR4_F_SETLKW64: { struct svr4_flock64 ifl; struct flock *flp, fl; caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(struct flock)); SCARG(&fa, arg) = (long) flp; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock64(&ifl, &fl); error = copyout(&fl, flp, sizeof fl); if (error) return error; error = fcntl(td, &fa); if (error || SCARG(&fa, cmd) != F_GETLK) return error; error = copyin(flp, &fl, sizeof fl); if (error) return error; bsd_to_svr4_flock64(&fl, &ifl); return copyout(&ifl, SCARG(uap, arg), sizeof ifl); } case SVR4_F_FREESP64: { struct svr4_flock64 ifl; struct flock fl; error = copyin(SCARG(uap, arg), &ifl, sizeof ifl); if (error) return error; svr4_to_bsd_flock64(&ifl, &fl); return fd_truncate(td, SCARG(uap, fd), &fl); } case SVR4_F_REVOKE: return fd_revoke(td, SCARG(uap, fd)); default: return ENOSYS; } default: return ENOSYS; } } Index: head/sys/compat/svr4/svr4_filio.c =================================================================== --- head/sys/compat/svr4/svr4_filio.c (revision 89305) +++ head/sys/compat/svr4/svr4_filio.c (revision 89306) @@ -1,228 +1,233 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*#define GROTTY_READ_HACK*/ int svr4_sys_poll(td, uap) struct thread *td; struct svr4_sys_poll_args *uap; { int error; struct poll_args pa; struct pollfd *pfd; int idx = 0, cerr; u_long siz; SCARG(&pa, fds) = SCARG(uap, fds); SCARG(&pa, nfds) = SCARG(uap, nfds); SCARG(&pa, timeout) = SCARG(uap, timeout); siz = SCARG(uap, nfds) * sizeof(struct pollfd); pfd = (struct pollfd *)malloc(siz, M_TEMP, M_WAITOK); error = poll(td, (struct poll_args *)uap); if ((cerr = copyin(SCARG(uap, fds), pfd, siz)) != 0) { error = cerr; goto done; } for (idx = 0; idx < SCARG(uap, nfds); idx++) { /* POLLWRNORM already equals POLLOUT, so we don't worry about that */ if (pfd[idx].revents & (POLLOUT | POLLWRNORM | POLLWRBAND)) pfd[idx].revents |= (POLLOUT | POLLWRNORM | POLLWRBAND); } if ((cerr = copyout(pfd, SCARG(uap, fds), siz)) != 0) { error = cerr; goto done; /* yeah, I know it's the next line, but this way I won't forget to update it if I add more code */ } done: free(pfd, M_TEMP); return error; } #if defined(READ_TEST) int svr4_sys_read(td, uap) struct thread *td; struct svr4_sys_read_args *uap; { struct read_args ra; - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp; struct socket *so = NULL; int so_state; sigset_t sigmask; int rv; SCARG(&ra, fd) = SCARG(uap, fd); SCARG(&ra, buf) = SCARG(uap, buf); SCARG(&ra, nbyte) = SCARG(uap, nbyte); - if ((fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) { + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { DPRINTF(("Something fishy with the user-supplied file descriptor...\n")); return EBADF; } if (fp->f_type == DTYPE_SOCKET) { so = (struct socket *)fp->f_data; DPRINTF(("fd %d is a socket\n", SCARG(uap, fd))); if (so->so_state & SS_ASYNC) { DPRINTF(("fd %d is an ASYNC socket!\n", SCARG(uap, fd))); } DPRINTF(("Here are its flags: 0x%x\n", so->so_state)); #if 
defined(GROTTY_READ_HACK) so_state = so->so_state; so->so_state &= ~SS_NBIO; #endif } rv = read(td, &ra); DPRINTF(("svr4_read(%d, 0x%0x, %d) = %d\n", SCARG(uap, fd), SCARG(uap, buf), SCARG(uap, nbyte), rv)); if (rv == EAGAIN) { DPRINTF(("sigmask = 0x%x\n", td->td_proc->p_sigmask)); DPRINTF(("sigignore = 0x%x\n", td->td_proc->p_sigignore)); DPRINTF(("sigcaught = 0x%x\n", td->td_proc->p_sigcatch)); DPRINTF(("siglist = 0x%x\n", td->td_proc->p_siglist)); } #if defined(GROTTY_READ_HACK) if (so) { /* We've already checked to see if this is a socket */ so->so_state = so_state; } #endif + fdrop(fp, td); return(rv); } #endif /* READ_TEST */ #if defined(BOGUS) int svr4_sys_write(td, uap) struct thread *td; struct svr4_sys_write_args *uap; { struct write_args wa; - struct filedesc *fdp; struct file *fp; int rv; SCARG(&wa, fd) = SCARG(uap, fd); SCARG(&wa, buf) = SCARG(uap, buf); SCARG(&wa, nbyte) = SCARG(uap, nbyte); rv = write(td, &wa); DPRINTF(("svr4_write(%d, 0x%0x, %d) = %d\n", SCARG(uap, fd), SCARG(uap, buf), SCARG(uap, nbyte), rv)); return(rv); } #endif /* BOGUS */ int svr4_fil_ioctl(fp, td, retval, fd, cmd, data) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t data; { int error; int num; struct filedesc *fdp = td->td_proc->p_fd; *retval = 0; + FILEDESC_LOCK(fdp); switch (cmd) { case SVR4_FIOCLEX: fdp->fd_ofileflags[fd] |= UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); return 0; case SVR4_FIONCLEX: fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE; + FILEDESC_UNLOCK(fdp); return 0; case SVR4_FIOGETOWN: case SVR4_FIOSETOWN: case SVR4_FIOASYNC: case SVR4_FIONBIO: case SVR4_FIONREAD: + FILEDESC_UNLOCK(fdp); if ((error = copyin(data, &num, sizeof(num))) != 0) return error; switch (cmd) { case SVR4_FIOGETOWN: cmd = FIOGETOWN; break; case SVR4_FIOSETOWN: cmd = FIOSETOWN; break; case SVR4_FIOASYNC: cmd = FIOASYNC; break; case SVR4_FIONBIO: cmd = FIONBIO; break; case SVR4_FIONREAD: cmd = FIONREAD; break; } #ifdef SVR4_DEBUG if (cmd == FIOASYNC) 
DPRINTF(("FIOASYNC\n")); #endif error = fo_ioctl(fp, cmd, (caddr_t) &num, td); if (error) return error; return copyout(&num, data, sizeof(num)); default: + FILEDESC_UNLOCK(fdp); DPRINTF(("Unknown svr4 filio %lx\n", cmd)); return 0; /* ENOSYS really */ } } Index: head/sys/compat/svr4/svr4_ioctl.c =================================================================== --- head/sys/compat/svr4/svr4_ioctl.c (revision 89305) +++ head/sys/compat/svr4/svr4_ioctl.c (revision 89306) @@ -1,161 +1,168 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG_SVR4 static void svr4_decode_cmd __P((u_long, char *, char *, int *, int *)); /* * Decode an ioctl command symbolically */ static void svr4_decode_cmd(cmd, dir, c, num, argsiz) u_long cmd; char *dir, *c; int *num, *argsiz; { if (cmd & SVR4_IOC_VOID) *dir++ = 'V'; if (cmd & SVR4_IOC_IN) *dir++ = 'R'; if (cmd & SVR4_IOC_OUT) *dir++ = 'W'; *dir = '\0'; if (cmd & SVR4_IOC_INOUT) *argsiz = (cmd >> 16) & 0xff; else *argsiz = -1; *c = (cmd >> 8) & 0xff; *num = cmd & 0xff; } #endif int svr4_sys_ioctl(td, uap) register struct thread *td; struct svr4_sys_ioctl_args *uap; { int *retval; struct file *fp; - struct filedesc *fdp; u_long cmd; int (*fun) __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); + int error; #ifdef DEBUG_SVR4 char dir[4]; char c; int num; int argsiz; svr4_decode_cmd(SCARG(uap, com), dir, &c, &num, &argsiz); DPRINTF(("svr4_ioctl[%lx](%d, _IO%s(%c, %d, %d), %p);\n", SCARG(uap, com), SCARG(uap, fd), dir, c, num, argsiz, SCARG(uap, data))); #endif retval = td->td_retval; - fdp = td->td_proc->p_fd; cmd = SCARG(uap, com); - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return EBADF; - if ((fp->f_flag & (FREAD | FWRITE)) == 0) + if ((fp->f_flag & (FREAD | FWRITE)) == 0) { + fdrop(fp, td); return EBADF; + } #if defined(DEBUG_SVR4) if (fp->f_type == DTYPE_SOCKET) { struct socket *so = (struct socket *)fp->f_data; DPRINTF(("<<< IN: so_state = 0x%x\n", so->so_state)); } #endif switch (cmd & 0xff00) { case SVR4_tIOC: DPRINTF(("ttold\n")); fun = svr4_ttold_ioctl; break; case SVR4_TIOC: DPRINTF(("term\n")); fun = svr4_term_ioctl; break; case SVR4_STR: DPRINTF(("stream\n")); fun = svr4_stream_ioctl; break; case SVR4_FIOC: DPRINTF(("file\n")); fun = 
svr4_fil_ioctl; break; case SVR4_SIOC: DPRINTF(("socket\n")); fun = svr4_sock_ioctl; break; case SVR4_XIOC: /* We do not support those */ + fdrop(fp, td); return EINVAL; default: + fdrop(fp, td); DPRINTF(("Unimplemented ioctl %lx\n", cmd)); return 0; /* XXX: really ENOSYS */ } #if defined(DEBUG_SVR4) if (fp->f_type == DTYPE_SOCKET) { - struct socket *so = (struct socket *)fp->f_data; + struct socket *so; + + so = (struct socket *)fp->f_data; DPRINTF((">>> OUT: so_state = 0x%x\n", so->so_state)); } #endif - return (*fun)(fp, td, retval, SCARG(uap, fd), cmd, SCARG(uap, data)); + error = (*fun)(fp, td, retval, SCARG(uap, fd), cmd, SCARG(uap, data)); + fdrop(fp, td); + return (error); } Index: head/sys/compat/svr4/svr4_misc.c =================================================================== --- head/sys/compat/svr4/svr4_misc.c (revision 89305) +++ head/sys/compat/svr4/svr4_misc.c (revision 89306) @@ -1,1715 +1,1734 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * SVR4 compatibility module. * * SVR4 system calls that are implemented differently in BSD are * handled here. */ #include #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__FreeBSD__) #include #endif #if defined(NetBSD) # if defined(UVM) # include # endif #endif #define BSD_DIRENT(cp) ((struct dirent *)(cp)) static int svr4_mknod __P((struct thread *, register_t *, char *, svr4_mode_t, svr4_dev_t)); static __inline clock_t timeval_to_clock_t __P((struct timeval *)); static int svr4_setinfo __P((struct proc *, int, svr4_siginfo_t *)); struct svr4_hrtcntl_args; static int svr4_hrtcntl __P((struct thread *, struct svr4_hrtcntl_args *, register_t *)); static void bsd_statfs_to_svr4_statvfs __P((const struct statfs *, struct svr4_statvfs *)); static void bsd_statfs_to_svr4_statvfs64 __P((const struct statfs *, struct svr4_statvfs64 *)); static struct proc *svr4_pfind __P((pid_t pid)); /* BOGUS noop */ #if defined(BOGUS) int svr4_sys_setitimer(td, uap) register struct thread *td; struct svr4_sys_setitimer_args *uap; { td->td_retval[0] = 0; return 0; } #endif int 
svr4_sys_wait(td, uap) struct thread *td; struct svr4_sys_wait_args *uap; { struct wait_args w4; int error, *retval = td->td_retval, st, sig; size_t sz = sizeof(*SCARG(&w4, status)); SCARG(&w4, rusage) = NULL; SCARG(&w4, options) = 0; if (SCARG(uap, status) == NULL) { caddr_t sg = stackgap_init(); SCARG(&w4, status) = stackgap_alloc(&sg, sz); } else SCARG(&w4, status) = SCARG(uap, status); SCARG(&w4, pid) = WAIT_ANY; if ((error = wait4(td, &w4)) != 0) return error; if ((error = copyin(SCARG(&w4, status), &st, sizeof(st))) != 0) return error; if (WIFSIGNALED(st)) { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0177) | SVR4_BSD2SVR4_SIG(sig); } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) st = (st & ~0xff00) | (SVR4_BSD2SVR4_SIG(sig) << 8); } /* * It looks like wait(2) on svr4/solaris/2.4 returns * the status in retval[1], and the pid on retval[0]. */ retval[1] = st; if (SCARG(uap, status)) if ((error = copyout(&st, SCARG(uap, status), sizeof(st))) != 0) return error; return 0; } int svr4_sys_execv(td, uap) struct thread *td; struct svr4_sys_execv_args *uap; { struct execve_args ap; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&ap, fname) = SCARG(uap, path); SCARG(&ap, argv) = SCARG(uap, argp); SCARG(&ap, envv) = NULL; return execve(td, &ap); } int svr4_sys_execve(td, uap) struct thread *td; struct svr4_sys_execve_args *uap; { struct execve_args ap; caddr_t sg; sg = stackgap_init(); CHECKALTEXIST(td, &sg, uap->path); SCARG(&ap, fname) = SCARG(uap, path); SCARG(&ap, argv) = SCARG(uap, argp); SCARG(&ap, envv) = SCARG(uap, envp); return execve(td, &ap); } int svr4_sys_time(td, v) struct thread *td; struct svr4_sys_time_args *v; { struct svr4_sys_time_args *uap = v; int error = 0; struct timeval tv; microtime(&tv); if (SCARG(uap, t)) error = copyout(&tv.tv_sec, SCARG(uap, t), sizeof(*(SCARG(uap, t)))); td->td_retval[0] = (int) tv.tv_sec; return error; } /* * Read SVR4-style directory 
entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. * * This code is ported from the Linux emulator: Changes to the VFS interface * between FreeBSD and NetBSD have made it simpler to port it from there than * to adapt the NetBSD version. */ int svr4_sys_getdents64(td, uap) struct thread *td; struct svr4_sys_getdents64_args *uap; { register struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4reclen=0; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; struct vattr va; off_t off; struct svr4_dirent64 svr4_dirent; int buflen, error, eofflag, nbytes, justone; u_long *cookies = NULL, *cookiep; int ncookies; DPRINTF(("svr4_sys_getdents64(%d, *, %d)\n", td->td_proc->p_pid, SCARG(uap, fd), SCARG(uap, nbytes))); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) { return (error); } - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *) fp->f_data; - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } if ((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) { + fdrop(fp, td); return error; } nbytes = SCARG(uap, nbytes); if (nbytes == 1) { nbytes = sizeof (struct svr4_dirent64); justone = 1; } else justone = 0; off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, nbytes); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, 
&cookies); if (error) { goto out; } inp = buf; outp = (caddr_t) SCARG(uap, dp); resid = nbytes; if ((len = buflen - auio.uio_resid) <= 0) { goto eof; } cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { bdp = (struct dirent *) inp; len -= bdp->d_reclen; inp += bdp->d_reclen; cookiep++; ncookies--; } } while (len > 0) { if (cookiep && ncookies == 0) break; bdp = (struct dirent *) inp; reclen = bdp->d_reclen; if (reclen & 3) { DPRINTF(("svr4_readdir: reclen=%d\n", reclen)); error = EFAULT; goto out; } if (bdp->d_fileno == 0) { inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; len -= reclen; continue; } svr4reclen = SVR4_RECLEN(&svr4_dirent, bdp->d_namlen); if (reclen > len || resid < svr4reclen) { outp++; break; } svr4_dirent.d_ino = (long) bdp->d_fileno; if (justone) { /* * old svr4-style readdir usage. 
*/ svr4_dirent.d_off = (svr4_off_t) svr4reclen; svr4_dirent.d_reclen = (u_short) bdp->d_namlen; } else { svr4_dirent.d_off = (svr4_off_t)(off + reclen); svr4_dirent.d_reclen = (u_short) svr4reclen; } strcpy(svr4_dirent.d_name, bdp->d_name); if ((error = copyout((caddr_t)&svr4_dirent, outp, svr4reclen))) goto out; inp += reclen; if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; outp += svr4reclen; resid -= svr4reclen; len -= reclen; if (justone) break; } if (outp == (caddr_t) SCARG(uap, dp)) goto again; fp->f_offset = off; if (justone) nbytes = resid + svr4reclen; eof: td->td_retval[0] = nbytes - resid; out: + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookies) free(cookies, M_TEMP); - VOP_UNLOCK(vp, 0, td); free(buf, M_TEMP); return error; } int svr4_sys_getdents(td, uap) struct thread *td; struct svr4_sys_getdents_args *uap; { struct dirent *bdp; struct vnode *vp; caddr_t inp, buf; /* BSD-format */ int len, reclen; /* BSD-format */ caddr_t outp; /* SVR4-format */ int resid, svr4_reclen; /* SVR4-format */ struct file *fp; struct uio auio; struct iovec aiov; struct svr4_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag; u_long *cookiebuf = NULL, *cookie; int ncookies = 0, *retval = td->td_retval; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } buflen = min(MAXBSIZE, SCARG(uap, nbytes)); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); off = fp->f_offset; again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; /* * First we read into the malloc'ed buffer, then * we massage it 
into user space, one record at a time. */ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookiebuf); - if (error) + if (error) { goto out; + } inp = buf; outp = SCARG(uap, buf); resid = SCARG(uap, nbytes); if ((len = buflen - auio.uio_resid) == 0) goto eof; for (cookie = cookiebuf; len > 0; len -= reclen) { bdp = (struct dirent *)inp; reclen = bdp->d_reclen; if (reclen & 3) panic("svr4_sys_getdents64: bad reclen"); off = *cookie++; /* each entry points to the next */ if ((off >> 32) != 0) { uprintf("svr4_sys_getdents64: dir offset too large for emulated program"); error = EINVAL; goto out; } if (bdp->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ continue; } svr4_reclen = SVR4_RECLEN(&idb, bdp->d_namlen); if (reclen > len || resid < svr4_reclen) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a SVR4-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). */ idb.d_ino = (svr4_ino_t)bdp->d_fileno; idb.d_off = (svr4_off_t)off; idb.d_reclen = (u_short)svr4_reclen; strcpy(idb.d_name, bdp->d_name); if ((error = copyout((caddr_t)&idb, outp, svr4_reclen))) goto out; /* advance past this real entry */ inp += reclen; /* advance output past SVR4-shaped entry */ outp += svr4_reclen; resid -= svr4_reclen; } /* if we squished out the whole block, try again */ if (outp == SCARG(uap, buf)) goto again; fp->f_offset = off; /* update the vnode offset */ eof: *retval = SCARG(uap, nbytes) - resid; out: VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookiebuf) free(cookiebuf, M_TEMP); free(buf, M_TEMP); return error; } int svr4_sys_mmap(td, uap) struct thread *td; struct svr4_sys_mmap_args *uap; { struct mmap_args mm; int *retval; retval = td->td_retval; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (SCARG(uap, prot) & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? 
*/ if (SCARG(uap, len) == 0) return EINVAL; SCARG(&mm, prot) = SCARG(uap, prot); SCARG(&mm, len) = SCARG(uap, len); SCARG(&mm, flags) = SCARG(uap, flags) & ~_MAP_NEW; SCARG(&mm, fd) = SCARG(uap, fd); SCARG(&mm, addr) = SCARG(uap, addr); SCARG(&mm, pos) = SCARG(uap, pos); return mmap(td, &mm); } int svr4_sys_mmap64(td, uap) struct thread *td; struct svr4_sys_mmap64_args *uap; { struct mmap_args mm; void *rp; #define _MAP_NEW 0x80000000 /* * Verify the arguments. */ if (SCARG(uap, prot) & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) return EINVAL; /* XXX still needed? */ if (SCARG(uap, len) == 0) return EINVAL; SCARG(&mm, prot) = SCARG(uap, prot); SCARG(&mm, len) = SCARG(uap, len); SCARG(&mm, flags) = SCARG(uap, flags) & ~_MAP_NEW; SCARG(&mm, fd) = SCARG(uap, fd); SCARG(&mm, addr) = SCARG(uap, addr); SCARG(&mm, pos) = SCARG(uap, pos); rp = (void *) round_page((vm_offset_t)(td->td_proc->p_vmspace->vm_daddr + maxdsiz)); if ((SCARG(&mm, flags) & MAP_FIXED) == 0 && SCARG(&mm, addr) != 0 && (void *)SCARG(&mm, addr) < rp) SCARG(&mm, addr) = rp; return mmap(td, &mm); } int svr4_sys_fchroot(td, uap) struct thread *td; struct svr4_sys_fchroot_args *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct vnode *vp; struct file *fp; int error; if ((error = suser_td(td)) != 0) return error; if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) return error; vp = (struct vnode *) fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error) + fdrop(fp, td); return error; + } VREF(vp); - if (fdp->fd_rdir != NULL) - vrele(fdp->fd_rdir); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_rdir; fdp->fd_rdir = vp; + FILEDESC_UNLOCK(fdp); + if (vpold != NULL) + vrele(vpold); + fdrop(fp, td); return 0; } static int svr4_mknod(td, retval, path, mode, dev) struct thread *td; register_t *retval; char *path; svr4_mode_t mode; svr4_dev_t dev; { caddr_t sg = stackgap_init(); 
CHECKALTEXIST(td, &sg, path); if (S_ISFIFO(mode)) { struct mkfifo_args ap; SCARG(&ap, path) = path; SCARG(&ap, mode) = mode; return mkfifo(td, &ap); } else { struct mknod_args ap; SCARG(&ap, path) = path; SCARG(&ap, mode) = mode; SCARG(&ap, dev) = dev; return mknod(td, &ap); } } int svr4_sys_mknod(td, uap) register struct thread *td; struct svr4_sys_mknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, SCARG(uap, path), SCARG(uap, mode), (svr4_dev_t)svr4_to_bsd_odev_t(SCARG(uap, dev))); } int svr4_sys_xmknod(td, uap) struct thread *td; struct svr4_sys_xmknod_args *uap; { int *retval = td->td_retval; return svr4_mknod(td, retval, SCARG(uap, path), SCARG(uap, mode), (svr4_dev_t)svr4_to_bsd_dev_t(SCARG(uap, dev))); } int svr4_sys_vhangup(td, uap) struct thread *td; struct svr4_sys_vhangup_args *uap; { return 0; } int svr4_sys_sysconfig(td, uap) struct thread *td; struct svr4_sys_sysconfig_args *uap; { int *retval; retval = &(td->td_retval[0]); switch (SCARG(uap, name)) { case SVR4_CONFIG_UNUSED: *retval = 0; break; case SVR4_CONFIG_NGROUPS: *retval = NGROUPS_MAX; break; case SVR4_CONFIG_CHILD_MAX: *retval = maxproc; break; case SVR4_CONFIG_OPEN_FILES: *retval = maxfiles; break; case SVR4_CONFIG_POSIX_VER: *retval = 198808; break; case SVR4_CONFIG_PAGESIZE: *retval = PAGE_SIZE; break; case SVR4_CONFIG_CLK_TCK: *retval = 60; /* should this be `hz', ie. 100? */ break; case SVR4_CONFIG_XOPEN_VER: *retval = 2; /* XXX: What should that be? */ break; case SVR4_CONFIG_PROF_TCK: *retval = 60; /* XXX: What should that be? 
*/ break; case SVR4_CONFIG_NPROC_CONF: *retval = 1; /* Only one processor for now */ break; case SVR4_CONFIG_NPROC_ONLN: *retval = 1; /* And it better be online */ break; case SVR4_CONFIG_AIO_LISTIO_MAX: case SVR4_CONFIG_AIO_MAX: case SVR4_CONFIG_AIO_PRIO_DELTA_MAX: *retval = 0; /* No aio support */ break; case SVR4_CONFIG_DELAYTIMER_MAX: *retval = 0; /* No delaytimer support */ break; case SVR4_CONFIG_MQ_OPEN_MAX: *retval = msginfo.msgmni; break; case SVR4_CONFIG_MQ_PRIO_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_RTSIG_MAX: *retval = 0; break; case SVR4_CONFIG_SEM_NSEMS_MAX: *retval = seminfo.semmni; break; case SVR4_CONFIG_SEM_VALUE_MAX: *retval = seminfo.semvmx; break; case SVR4_CONFIG_SIGQUEUE_MAX: *retval = 0; /* XXX: Don't know */ break; case SVR4_CONFIG_SIGRT_MIN: case SVR4_CONFIG_SIGRT_MAX: *retval = 0; /* No real time signals */ break; case SVR4_CONFIG_TIMER_MAX: *retval = 3; /* XXX: real, virtual, profiling */ break; #if defined(NOTYET) case SVR4_CONFIG_PHYS_PAGES: #if defined(UVM) *retval = uvmexp.free; /* XXX: free instead of total */ #else *retval = cnt.v_free_count; /* XXX: free instead of total */ #endif break; case SVR4_CONFIG_AVPHYS_PAGES: #if defined(UVM) *retval = uvmexp.active; /* XXX: active instead of avg */ #else *retval = cnt.v_active_count; /* XXX: active instead of avg */ #endif break; #endif /* NOTYET */ default: return EINVAL; } return 0; } extern int swap_pager_full; /* ARGSUSED */ int svr4_sys_break(td, uap) struct thread *td; struct svr4_sys_break_args *uap; { struct vmspace *vm = td->td_proc->p_vmspace; vm_offset_t new, old, base, ns; int rv; base = round_page((vm_offset_t) vm->vm_daddr); ns = (vm_offset_t)SCARG(uap, nsize); new = round_page(ns); /* For p_rlimit. 
*/ mtx_assert(&Giant, MA_OWNED); if (new > base) { if ((new - base) > (unsigned) td->td_proc->p_rlimit[RLIMIT_DATA].rlim_cur) { return ENOMEM; } if (new >= VM_MAXUSER_ADDRESS) { return (ENOMEM); } } else if (new < base) { /* * This is simply an invalid value. If someone wants to * do fancy address space manipulations, mmap and munmap * can do most of what the user would want. */ return EINVAL; } old = base + ctob(vm->vm_dsize); if (new > old) { vm_size_t diff; if (swap_pager_full) { return (ENOMEM); } diff = new - old; rv = vm_map_find(&vm->vm_map, NULL, 0, &old, diff, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize += btoc(diff); } else if (new < old) { rv = vm_map_remove(&vm->vm_map, new, old); if (rv != KERN_SUCCESS) { return (ENOMEM); } vm->vm_dsize -= btoc(old - new); } return (0); } static __inline clock_t timeval_to_clock_t(tv) struct timeval *tv; { return tv->tv_sec * hz + tv->tv_usec / (1000000 / hz); } int svr4_sys_times(td, uap) struct thread *td; struct svr4_sys_times_args *uap; { int error, *retval = td->td_retval; struct tms tms; struct timeval t; struct rusage *ru; struct rusage r; struct getrusage_args ga; caddr_t sg = stackgap_init(); ru = stackgap_alloc(&sg, sizeof(struct rusage)); SCARG(&ga, who) = RUSAGE_SELF; SCARG(&ga, rusage) = ru; error = getrusage(td, &ga); if (error) return error; if ((error = copyin(ru, &r, sizeof r)) != 0) return error; tms.tms_utime = timeval_to_clock_t(&r.ru_utime); tms.tms_stime = timeval_to_clock_t(&r.ru_stime); SCARG(&ga, who) = RUSAGE_CHILDREN; error = getrusage(td, &ga); if (error) return error; if ((error = copyin(ru, &r, sizeof r)) != 0) return error; tms.tms_cutime = timeval_to_clock_t(&r.ru_utime); tms.tms_cstime = timeval_to_clock_t(&r.ru_stime); microtime(&t); *retval = timeval_to_clock_t(&t); return copyout(&tms, SCARG(uap, tp), sizeof(tms)); } int svr4_sys_ulimit(td, uap) struct thread *td; struct svr4_sys_ulimit_args *uap; { int *retval = td->td_retval; 
switch (SCARG(uap, cmd)) { case SVR4_GFILLIM: /* For p_rlimit below. */ mtx_assert(&Giant, MA_OWNED); *retval = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur / 512; if (*retval == -1) *retval = 0x7fffffff; return 0; case SVR4_SFILLIM: { int error; struct __setrlimit_args srl; struct rlimit krl; caddr_t sg = stackgap_init(); struct rlimit *url = (struct rlimit *) stackgap_alloc(&sg, sizeof *url); krl.rlim_cur = SCARG(uap, newlimit) * 512; mtx_assert(&Giant, MA_OWNED); krl.rlim_max = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_max; error = copyout(&krl, url, sizeof(*url)); if (error) return error; SCARG(&srl, which) = RLIMIT_FSIZE; SCARG(&srl, rlp) = url; error = setrlimit(td, &srl); if (error) return error; mtx_assert(&Giant, MA_OWNED); *retval = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur; if (*retval == -1) *retval = 0x7fffffff; return 0; } case SVR4_GMEMLIM: { struct vmspace *vm = td->td_proc->p_vmspace; register_t r; mtx_assert(&Giant, MA_OWNED); r = td->td_proc->p_rlimit[RLIMIT_DATA].rlim_cur; if (r == -1) r = 0x7fffffff; r += (long) vm->vm_daddr; if (r < 0) r = 0x7fffffff; *retval = r; return 0; } case SVR4_GDESLIM: mtx_assert(&Giant, MA_OWNED); *retval = td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur; if (*retval == -1) *retval = 0x7fffffff; return 0; default: return EINVAL; } } static struct proc * svr4_pfind(pid) pid_t pid; { struct proc *p; /* look in the live processes */ if ((p = pfind(pid)) == NULL) /* look in the zombies */ p = zpfind(pid); return p; } int svr4_sys_pgrpsys(td, uap) struct thread *td; struct svr4_sys_pgrpsys_args *uap; { int *retval = td->td_retval; struct proc *p = td->td_proc; switch (SCARG(uap, cmd)) { case 1: /* setpgrp() */ /* * SVR4 setpgrp() (which takes no arguments) has the * semantics that the session ID is also created anew, so * in almost every sense, setpgrp() is identical to * setsid() for SVR4. (Under BSD, the difference is that * a setpgid(0,0) will not create a new session.) 
*/ setsid(td, NULL); /*FALLTHROUGH*/ case 0: /* getpgrp() */ *retval = p->p_pgrp->pg_id; return 0; case 2: /* getsid(pid) */ if (SCARG(uap, pid) != 0 && (p = svr4_pfind(SCARG(uap, pid))) == NULL) return ESRCH; /* * This has already been initialized to the pid of * the session leader. */ *retval = (register_t) p->p_session->s_leader->p_pid; PROC_UNLOCK(p); return 0; case 3: /* setsid() */ return setsid(td, NULL); case 4: /* getpgid(pid) */ if (SCARG(uap, pid) != 0 && (p = svr4_pfind(SCARG(uap, pid))) == NULL) return ESRCH; *retval = (int) p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 5: /* setpgid(pid, pgid); */ { struct setpgid_args sa; SCARG(&sa, pid) = SCARG(uap, pid); SCARG(&sa, pgid) = SCARG(uap, pgid); return setpgid(td, &sa); } default: return EINVAL; } } #define syscallarg(x) union { x datum; register_t pad; } struct svr4_hrtcntl_args { int cmd; int fun; int clk; svr4_hrt_interval_t * iv; svr4_hrt_time_t * ti; }; static int svr4_hrtcntl(td, uap, retval) struct thread *td; struct svr4_hrtcntl_args *uap; register_t *retval; { switch (SCARG(uap, fun)) { case SVR4_HRT_CNTL_RES: DPRINTF(("htrcntl(RES)\n")); *retval = SVR4_HRT_USEC; return 0; case SVR4_HRT_CNTL_TOFD: DPRINTF(("htrcntl(TOFD)\n")); { struct timeval tv; svr4_hrt_time_t t; if (SCARG(uap, clk) != SVR4_HRT_CLK_STD) { DPRINTF(("clk == %d\n", SCARG(uap, clk))); return EINVAL; } if (SCARG(uap, ti) == NULL) { DPRINTF(("ti NULL\n")); return EINVAL; } microtime(&tv); t.h_sec = tv.tv_sec; t.h_rem = tv.tv_usec; t.h_res = SVR4_HRT_USEC; return copyout(&t, SCARG(uap, ti), sizeof(t)); } case SVR4_HRT_CNTL_START: DPRINTF(("htrcntl(START)\n")); return ENOSYS; case SVR4_HRT_CNTL_GET: DPRINTF(("htrcntl(GET)\n")); return ENOSYS; default: DPRINTF(("Bad htrcntl command %d\n", SCARG(uap, fun))); return ENOSYS; } } int svr4_sys_hrtsys(td, uap) struct thread *td; struct svr4_sys_hrtsys_args *uap; { int *retval = td->td_retval; switch (SCARG(uap, cmd)) { case SVR4_HRT_CNTL: return svr4_hrtcntl(td, (struct 
svr4_hrtcntl_args *) uap, retval); case SVR4_HRT_ALRM: DPRINTF(("hrtalarm\n")); return ENOSYS; case SVR4_HRT_SLP: DPRINTF(("hrtsleep\n")); return ENOSYS; case SVR4_HRT_CAN: DPRINTF(("hrtcancel\n")); return ENOSYS; default: DPRINTF(("Bad hrtsys command %d\n", SCARG(uap, cmd))); return EINVAL; } } static int svr4_setinfo(p, st, s) struct proc *p; int st; svr4_siginfo_t *s; { svr4_siginfo_t i; int sig; memset(&i, 0, sizeof(i)); i.si_signo = SVR4_SIGCHLD; i.si_errno = 0; /* XXX? */ if (p) { i.si_pid = p->p_pid; mtx_lock_spin(&sched_lock); if (p->p_stat == SZOMB) { i.si_stime = p->p_ru->ru_stime.tv_sec; i.si_utime = p->p_ru->ru_utime.tv_sec; } else { i.si_stime = p->p_stats->p_ru.ru_stime.tv_sec; i.si_utime = p->p_stats->p_ru.ru_utime.tv_sec; } mtx_unlock_spin(&sched_lock); } if (WIFEXITED(st)) { i.si_status = WEXITSTATUS(st); i.si_code = SVR4_CLD_EXITED; } else if (WIFSTOPPED(st)) { sig = WSTOPSIG(st); if (sig >= 0 && sig < NSIG) i.si_status = SVR4_BSD2SVR4_SIG(sig); if (i.si_status == SVR4_SIGCONT) i.si_code = SVR4_CLD_CONTINUED; else i.si_code = SVR4_CLD_STOPPED; } else { sig = WTERMSIG(st); if (sig >= 0 && sig < NSIG) i.si_status = SVR4_BSD2SVR4_SIG(sig); if (WCOREDUMP(st)) i.si_code = SVR4_CLD_DUMPED; else i.si_code = SVR4_CLD_KILLED; } DPRINTF(("siginfo [pid %ld signo %d code %d errno %d status %d]\n", i.si_pid, i.si_signo, i.si_code, i.si_errno, i.si_status)); return copyout(&i, s, sizeof(i)); } int svr4_sys_waitsys(td, uap) struct thread *td; struct svr4_sys_waitsys_args *uap; { int nfound; int error, *retval = td->td_retval; struct proc *q, *t; switch (SCARG(uap, grp)) { case SVR4_P_PID: break; case SVR4_P_PGID: SCARG(uap, id) = -td->td_proc->p_pgid; break; case SVR4_P_ALL: SCARG(uap, id) = WAIT_ANY; break; default: return EINVAL; } DPRINTF(("waitsys(%d, %d, %p, %x)\n", SCARG(uap, grp), SCARG(uap, id), SCARG(uap, info), SCARG(uap, options))); loop: nfound = 0; sx_slock(&proctree_lock); LIST_FOREACH(q, &td->td_proc->p_children, p_sibling) { + PROC_LOCK(q); if 
(SCARG(uap, id) != WAIT_ANY && q->p_pid != SCARG(uap, id) && q->p_pgid != -SCARG(uap, id)) { + PROC_UNLOCK(q); DPRINTF(("pid %d pgid %d != %d\n", q->p_pid, q->p_pgid, SCARG(uap, id))); continue; } nfound++; - PROC_LOCK(q); mtx_lock_spin(&sched_lock); if (q->p_stat == SZOMB && ((SCARG(uap, options) & (SVR4_WEXITED|SVR4_WTRAPPED)))) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(q); sx_sunlock(&proctree_lock); *retval = 0; DPRINTF(("found %d\n", q->p_pid)); error = svr4_setinfo(q, q->p_xstat, SCARG(uap, info)); if (error != 0) return error; if ((SCARG(uap, options) & SVR4_WNOWAIT)) { DPRINTF(("Don't wait\n")); return 0; } /* * If we got the child via ptrace(2) or procfs, and * the parent is different (meaning the process was * attached, rather than run as a child), then we need * to give it back to the old parent, and send the * parent a SIGCHLD. The rest of the cleanup will be * done when the old parent waits on the child. */ sx_xlock(&proctree_lock); PROC_LOCK(q); if (q->p_flag & P_TRACED) { if (q->p_oppid != q->p_pptr->p_pid) { PROC_UNLOCK(q); t = pfind(q->p_oppid); PROC_LOCK(q); proc_reparent(q, t ? t : initproc); q->p_oppid = 0; q->p_flag &= ~(P_TRACED | P_WAITED); PROC_UNLOCK(q); psignal(t, SIGCHLD); wakeup(t); PROC_UNLOCK(t); sx_xunlock(&proctree_lock); return 0; } } PROC_UNLOCK(q); sx_xunlock(&proctree_lock); q->p_xstat = 0; ruadd(&td->td_proc->p_stats->p_cru, q->p_ru); FREE(q->p_ru, M_ZOMBIE); q->p_ru = 0; /* * Decrement the count of procs running with this uid. */ (void)chgproccnt(q->p_ucred->cr_ruidinfo, -1, 0); /* * Release reference to text vnode. */ if (q->p_textvp) vrele(q->p_textvp); /* * Free up credentials. */ crfree(q->p_ucred); q->p_ucred = NULL; /* * Remove unused arguments */ if (q->p_args && --q->p_args->ar_ref == 0) FREE(q->p_args, M_PARGS); PROC_UNLOCK(q); /* * Finally finished with old proc entry. * Unlink it from its process group and free it. 
*/ leavepgrp(q); sx_xlock(&allproc_lock); LIST_REMOVE(q, p_list); /* off zombproc */ sx_xunlock(&allproc_lock); sx_xlock(&proctree_lock); LIST_REMOVE(q, p_sibling); sx_xunlock(&proctree_lock); PROC_LOCK(q); if (--q->p_procsig->ps_refcnt == 0) { if (q->p_sigacts != &q->p_uarea->u_sigacts) FREE(q->p_sigacts, M_SUBPROC); FREE(q->p_procsig, M_SUBPROC); q->p_procsig = NULL; } PROC_UNLOCK(q); /* * Give machine-dependent layer a chance * to free anything that cpu_exit couldn't * release while still running in process context. */ cpu_wait(q); #if defined(__NetBSD__) pool_put(&proc_pool, q); #endif #ifdef __FreeBSD__ mtx_destroy(&q->p_mtx); zfree(proc_zone, q); #endif nprocs--; return 0; } if (q->p_stat == SSTOP && (q->p_flag & P_WAITED) == 0 && (q->p_flag & P_TRACED || (SCARG(uap, options) & (SVR4_WSTOPPED|SVR4_WCONTINUED)))) { mtx_unlock_spin(&sched_lock); DPRINTF(("jobcontrol %d\n", q->p_pid)); if (((SCARG(uap, options) & SVR4_WNOWAIT)) == 0) q->p_flag |= P_WAITED; PROC_UNLOCK(q); *retval = 0; return svr4_setinfo(q, W_STOPCODE(q->p_xstat), SCARG(uap, info)); } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(q); } if (nfound == 0) return ECHILD; if (SCARG(uap, options) & SVR4_WNOHANG) { *retval = 0; if ((error = svr4_setinfo(NULL, 0, SCARG(uap, info))) != 0) return error; return 0; } if ((error = tsleep((caddr_t)td->td_proc, PWAIT | PCATCH, "svr4_wait", 0)) != 0) return error; goto loop; } static void bsd_statfs_to_svr4_statvfs(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; 
sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } static void bsd_statfs_to_svr4_statvfs64(bfs, sfs) const struct statfs *bfs; struct svr4_statvfs64 *sfs; { sfs->f_bsize = bfs->f_iosize; /* XXX */ sfs->f_frsize = bfs->f_bsize; sfs->f_blocks = bfs->f_blocks; sfs->f_bfree = bfs->f_bfree; sfs->f_bavail = bfs->f_bavail; sfs->f_files = bfs->f_files; sfs->f_ffree = bfs->f_ffree; sfs->f_favail = bfs->f_ffree; sfs->f_fsid = bfs->f_fsid.val[0]; memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype)); sfs->f_flag = 0; if (bfs->f_flags & MNT_RDONLY) sfs->f_flag |= SVR4_ST_RDONLY; if (bfs->f_flags & MNT_NOSUID) sfs->f_flag |= SVR4_ST_NOSUID; sfs->f_namemax = MAXNAMLEN; memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */ memset(sfs->f_filler, 0, sizeof(sfs->f_filler)); } int svr4_sys_statvfs(td, uap) struct thread *td; struct svr4_sys_statvfs_args *uap; { struct statfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs sfs; int error; CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&fs_args, path) = SCARG(uap, path); SCARG(&fs_args, buf) = fs; if ((error = statfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs(&bfs, &sfs); return copyout(&sfs, SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_fstatvfs(td, uap) struct thread *td; struct svr4_sys_fstatvfs_args *uap; { struct fstatfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs sfs; int error; SCARG(&fs_args, fd) = SCARG(uap, fd); SCARG(&fs_args, buf) = fs; if ((error = fstatfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs(&bfs, &sfs); return copyout(&sfs, 
SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_statvfs64(td, uap) struct thread *td; struct svr4_sys_statvfs64_args *uap; { struct statfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs64 sfs; int error; CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&fs_args, path) = SCARG(uap, path); SCARG(&fs_args, buf) = fs; if ((error = statfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs64(&bfs, &sfs); return copyout(&sfs, SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_fstatvfs64(td, uap) struct thread *td; struct svr4_sys_fstatvfs64_args *uap; { struct fstatfs_args fs_args; caddr_t sg = stackgap_init(); struct statfs *fs = stackgap_alloc(&sg, sizeof(struct statfs)); struct statfs bfs; struct svr4_statvfs64 sfs; int error; SCARG(&fs_args, fd) = SCARG(uap, fd); SCARG(&fs_args, buf) = fs; if ((error = fstatfs(td, &fs_args)) != 0) return error; if ((error = copyin(fs, &bfs, sizeof(bfs))) != 0) return error; bsd_statfs_to_svr4_statvfs64(&bfs, &sfs); return copyout(&sfs, SCARG(uap, fs), sizeof(sfs)); } int svr4_sys_alarm(td, uap) struct thread *td; struct svr4_sys_alarm_args *uap; { int error; struct itimerval *itp, *oitp; struct setitimer_args sa; caddr_t sg = stackgap_init(); itp = stackgap_alloc(&sg, sizeof(*itp)); oitp = stackgap_alloc(&sg, sizeof(*oitp)); timevalclear(&itp->it_interval); itp->it_value.tv_sec = SCARG(uap, sec); itp->it_value.tv_usec = 0; SCARG(&sa, which) = ITIMER_REAL; SCARG(&sa, itv) = itp; SCARG(&sa, oitv) = oitp; error = setitimer(td, &sa); if (error) return error; if (oitp->it_value.tv_usec) oitp->it_value.tv_sec++; td->td_retval[0] = oitp->it_value.tv_sec; return 0; } int svr4_sys_gettimeofday(td, uap) struct thread *td; struct svr4_sys_gettimeofday_args *uap; { if (SCARG(uap, tp)) { struct timeval atv; microtime(&atv); return copyout(&atv, SCARG(uap, tp), sizeof (atv)); } return 
0; } int svr4_sys_facl(td, uap) struct thread *td; struct svr4_sys_facl_args *uap; { int *retval; retval = td->td_retval; *retval = 0; switch (SCARG(uap, cmd)) { case SVR4_SYS_SETACL: /* We don't support acls on any filesystem */ return ENOSYS; case SVR4_SYS_GETACL: return copyout(retval, &SCARG(uap, num), sizeof(SCARG(uap, num))); case SVR4_SYS_GETACLCNT: return 0; default: return EINVAL; } } int svr4_sys_acl(td, uap) struct thread *td; struct svr4_sys_acl_args *uap; { /* XXX: for now the same */ return svr4_sys_facl(td, (struct svr4_sys_facl_args *)uap); } int svr4_sys_auditsys(td, uap) struct thread *td; struct svr4_sys_auditsys_args *uap; { /* * XXX: Big brother is *not* watching. */ return 0; } int svr4_sys_memcntl(td, uap) struct thread *td; struct svr4_sys_memcntl_args *uap; { switch (SCARG(uap, cmd)) { case SVR4_MC_SYNC: { struct msync_args msa; SCARG(&msa, addr) = SCARG(uap, addr); SCARG(&msa, len) = SCARG(uap, len); SCARG(&msa, flags) = (int)SCARG(uap, arg); return msync(td, &msa); } case SVR4_MC_ADVISE: { struct madvise_args maa; SCARG(&maa, addr) = SCARG(uap, addr); SCARG(&maa, len) = SCARG(uap, len); SCARG(&maa, behav) = (int)SCARG(uap, arg); return madvise(td, &maa); } case SVR4_MC_LOCK: case SVR4_MC_UNLOCK: case SVR4_MC_LOCKAS: case SVR4_MC_UNLOCKAS: return EOPNOTSUPP; default: return ENOSYS; } } int svr4_sys_nice(td, uap) struct thread *td; struct svr4_sys_nice_args *uap; { struct setpriority_args ap; int error; SCARG(&ap, which) = PRIO_PROCESS; SCARG(&ap, who) = 0; SCARG(&ap, prio) = SCARG(uap, prio); if ((error = setpriority(td, &ap)) != 0) return error; /* the cast is stupid, but the structures are the same */ if ((error = getpriority(td, (struct getpriority_args *)&ap)) != 0) return error; return 0; } int svr4_sys_resolvepath(td, uap) struct thread *td; struct svr4_sys_resolvepath_args *uap; { struct nameidata nd; int error, *retval = td->td_retval; NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME, UIO_USERSPACE, SCARG(uap, path), td); if ((error = 
namei(&nd)) != 0) return error; if ((error = copyout(nd.ni_cnd.cn_pnbuf, SCARG(uap, buf), SCARG(uap, bufsiz))) != 0) goto bad; *retval = strlen(nd.ni_cnd.cn_pnbuf) < SCARG(uap, bufsiz) ? strlen(nd.ni_cnd.cn_pnbuf) + 1 : SCARG(uap, bufsiz); bad: NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); return error; } Index: head/sys/compat/svr4/svr4_stream.c =================================================================== --- head/sys/compat/svr4/svr4_stream.c (revision 89305) +++ head/sys/compat/svr4/svr4_stream.c (revision 89306) @@ -1,2273 +1,2302 @@ /* * Copyright (c) 1998 Mark Newton. All rights reserved. * Copyright (c) 1994, 1996 Christos Zoulas. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christos Zoulas. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* * Pretend that we have streams... * Yes, this is gross. * * ToDo: The state machine for getmsg needs re-thinking */ #define COMPAT_43 1 #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after sys/uio.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Utils */ static int clean_pipe __P((struct thread *, const char *)); static void getparm __P((struct file *, struct svr4_si_sockparms *)); +static int svr4_do_putmsg __P((struct proc *, struct svr4_sys_putmsg_args *, + struct file *)); +static int svr4_do_getmsg __P((struct proc *, struct svr4_sys_getmsg_args *, + struct file *)); /* Address Conversions */ static void sockaddr_to_netaddr_in __P((struct svr4_strmcmd *, const struct sockaddr_in *)); static void sockaddr_to_netaddr_un __P((struct svr4_strmcmd *, const struct sockaddr_un *)); static void netaddr_to_sockaddr_in __P((struct sockaddr_in *, const struct svr4_strmcmd *)); static void netaddr_to_sockaddr_un __P((struct sockaddr_un *, const struct svr4_strmcmd *)); /* stream ioctls */ static int i_nread __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int i_fdinsert __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int i_str __P((struct file *, 
struct thread *, register_t *, int, u_long, caddr_t)); static int i_setsig __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int i_getsig __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int _i_bind_rsvd __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); static int _i_rele_rsvd __P((struct file *, struct thread *, register_t *, int, u_long, caddr_t)); /* i_str sockmod calls */ static int sockmod __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_listen __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_ogetudata __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_sockparams __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_shutdown __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int si_getudata __P((struct file *, int, struct svr4_strioctl *, struct thread *)); /* i_str timod calls */ static int timod __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int ti_getinfo __P((struct file *, int, struct svr4_strioctl *, struct thread *)); static int ti_bind __P((struct file *, int, struct svr4_strioctl *, struct thread *)); /* infrastructure */ static int svr4_sendit __P((struct thread *td, int s, struct msghdr *mp, int flags)); static int svr4_recvit __P((struct thread *td, int s, struct msghdr *mp, caddr_t namelenp)); /* Ok, so we shouldn't use sendit() in uipc_syscalls.c because * it isn't part of a "public" interface; We're supposed to use * pru_sosend instead. Same goes for recvit()/pru_soreceive() for * that matter. Solution: Suck sendit()/recvit() into here where we * can do what we like. * * I hate code duplication. * * I will take out all the #ifdef COMPAT_OLDSOCK gumph, though. 
*/ static int svr4_sendit(td, s, mp, flags) register struct thread *td; int s; register struct msghdr *mp; int flags; { struct uio auio; register struct iovec *iov; register int i; struct mbuf *control; struct sockaddr *to; int len, error; struct socket *so; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto done1; } } if (mp->msg_name) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) goto done1; } else { to = 0; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr)) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; } else { control = 0; } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = td->td_retval[0]; ktrgenio(td->td_proc->p_tracep, s, UIO_WRITE, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif bad: if (to) FREE(to, M_SONAME); done1: + fdrop(fp, td); fputsock(so); return (error); } 
static int svr4_recvit(td, s, mp, namelenp) register struct thread *td; int s; register struct msghdr *mp; caddr_t namelenp; { struct uio auio; register struct iovec *iov; register int i; int len, error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto done1; } } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = len - auio.uio_resid; ktrgenio(td->td_proc->p_tracep, s, UIO_READ, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif if (error) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { #ifndef MIN #define MIN(a,b) ((a)>(b)?(b):(a)) #endif /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); error = copyout(fromsa, (caddr_t)mp->msg_name, (unsigned)len); if (error) goto out; } mp->msg_namelen = len; if (namelenp && (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { goto out; } } if (mp->msg_control) { len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = (caddr_t) mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: if (fromsa) FREE(fromsa, M_SONAME); if (control) m_freem(control); done1: + fdrop(fp, td); fputsock(so); return (error); } #ifdef DEBUG_SVR4 static void bufprint __P((u_char *, size_t)); static int show_ioc __P((const char *, struct svr4_strioctl *)); static int show_strbuf __P((struct svr4_strbuf *)); static void show_msg __P((const char *, int, struct svr4_strbuf *, struct svr4_strbuf *, int)); static void bufprint(buf, len) u_char *buf; size_t len; { size_t i; uprintf("\n\t"); for (i = 0; i < len; i++) { uprintf("%x ", buf[i]); if (i && (i % 16) == 0) uprintf("\n\t"); } } static int show_ioc(str, ioc) const char *str; struct svr4_strioctl *ioc; { u_char *ptr = 
(u_char *) malloc(ioc->len, M_TEMP, M_WAITOK); int error; uprintf("%s cmd = %ld, timeout = %d, len = %d, buf = %p { ", str, ioc->cmd, ioc->timeout, ioc->len, ioc->buf); if ((error = copyin(ioc->buf, ptr, ioc->len)) != 0) { free((char *) ptr, M_TEMP); return error; } bufprint(ptr, ioc->len); uprintf("}\n"); free((char *) ptr, M_TEMP); return 0; } static int show_strbuf(str) struct svr4_strbuf *str; { int error; u_char *ptr = NULL; int maxlen = str->maxlen; int len = str->len; if (maxlen < 0) maxlen = 0; if (len >= maxlen) len = maxlen; if (len > 0) { ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK); if ((error = copyin(str->buf, ptr, len)) != 0) { free((char *) ptr, M_TEMP); return error; } } uprintf(", { %d, %d, %p=[ ", str->maxlen, str->len, str->buf); if (ptr) bufprint(ptr, len); uprintf("]}"); if (ptr) free((char *) ptr, M_TEMP); return 0; } static void show_msg(str, fd, ctl, dat, flags) const char *str; int fd; struct svr4_strbuf *ctl; struct svr4_strbuf *dat; int flags; { struct svr4_strbuf buf; int error; uprintf("%s(%d", str, fd); if (ctl != NULL) { if ((error = copyin(ctl, &buf, sizeof(buf))) != 0) return; show_strbuf(&buf); } else uprintf(", NULL"); if (dat != NULL) { if ((error = copyin(dat, &buf, sizeof(buf))) != 0) return; show_strbuf(&buf); } else uprintf(", NULL"); uprintf(", %x);\n", flags); } #endif /* DEBUG_SVR4 */ /* * We are faced with an interesting situation. On svr4 unix sockets * are really pipes. But we really have sockets, and we might as * well use them. At the point where svr4 calls TI_BIND, it has * already created a named pipe for the socket using mknod(2). * We need to create a socket with the same name when we bind, * so we need to remove the pipe before, otherwise we'll get address * already in use. So we *carefully* remove the pipe, to avoid * using this as a random file removal tool. We use system calls * to avoid code duplication. 
*/ static int clean_pipe(td, path) struct thread *td; const char *path; { struct lstat_args la; struct unlink_args ua; struct stat st; int error; caddr_t sg = stackgap_init(); size_t l = strlen(path) + 1; void *tpath; tpath = stackgap_alloc(&sg, l); SCARG(&la, ub) = stackgap_alloc(&sg, sizeof(struct stat)); if ((error = copyout(path, tpath, l)) != 0) return error; SCARG(&la, path) = tpath; if ((error = lstat(td, &la)) != 0) return 0; if ((error = copyin(SCARG(&la, ub), &st, sizeof(st))) != 0) return 0; /* * Make sure we are dealing with a mode 0 named pipe. */ if ((st.st_mode & S_IFMT) != S_IFIFO) return 0; if ((st.st_mode & ALLPERMS) != 0) return 0; SCARG(&ua, path) = SCARG(&la, path); if ((error = unlink(td, &ua)) != 0) { DPRINTF(("clean_pipe: unlink failed %d\n", error)); return error; } return 0; } static void sockaddr_to_netaddr_in(sc, sain) struct svr4_strmcmd *sc; const struct sockaddr_in *sain; { struct svr4_netaddr_in *na; na = SVR4_ADDROF(sc); na->family = sain->sin_family; na->port = sain->sin_port; na->addr = sain->sin_addr.s_addr; DPRINTF(("sockaddr_in -> netaddr %d %d %lx\n", na->family, na->port, na->addr)); } static void sockaddr_to_netaddr_un(sc, saun) struct svr4_strmcmd *sc; const struct sockaddr_un *saun; { struct svr4_netaddr_un *na; char *dst, *edst = ((char *) sc) + sc->offs + sizeof(na->family) + 1 - sizeof(*sc); const char *src; na = SVR4_ADDROF(sc); na->family = saun->sun_family; for (src = saun->sun_path, dst = na->path; (*dst++ = *src++) != '\0'; ) if (dst == edst) break; DPRINTF(("sockaddr_un -> netaddr %d %s\n", na->family, na->path)); } static void netaddr_to_sockaddr_in(sain, sc) struct sockaddr_in *sain; const struct svr4_strmcmd *sc; { const struct svr4_netaddr_in *na; na = SVR4_C_ADDROF(sc); memset(sain, 0, sizeof(*sain)); sain->sin_len = sizeof(*sain); sain->sin_family = na->family; sain->sin_port = na->port; sain->sin_addr.s_addr = na->addr; DPRINTF(("netaddr -> sockaddr_in %d %d %x\n", sain->sin_family, sain->sin_port, 
sain->sin_addr.s_addr)); } static void netaddr_to_sockaddr_un(saun, sc) struct sockaddr_un *saun; const struct svr4_strmcmd *sc; { const struct svr4_netaddr_un *na; char *dst, *edst = &saun->sun_path[sizeof(saun->sun_path) - 1]; const char *src; na = SVR4_C_ADDROF(sc); memset(saun, 0, sizeof(*saun)); saun->sun_family = na->family; for (src = na->path, dst = saun->sun_path; (*dst++ = *src++) != '\0'; ) if (dst == edst) break; saun->sun_len = dst - saun->sun_path; DPRINTF(("netaddr -> sockaddr_un %d %s\n", saun->sun_family, saun->sun_path)); } static void getparm(fp, pa) struct file *fp; struct svr4_si_sockparms *pa; { - struct svr4_strm *st = svr4_stream_get(fp); - struct socket *so = (struct socket *) fp->f_data; + struct svr4_strm *st; + struct socket *so; + st = svr4_stream_get(fp); if (st == NULL) return; + so = (struct socket *) fp->f_data; + pa->family = st->s_family; switch (so->so_type) { case SOCK_DGRAM: pa->type = SVR4_T_CLTS; pa->protocol = IPPROTO_UDP; DPRINTF(("getparm(dgram)\n")); return; case SOCK_STREAM: pa->type = SVR4_T_COTS; /* What about T_COTS_ORD? 
XXX */ pa->protocol = IPPROTO_IP; DPRINTF(("getparm(stream)\n")); return; case SOCK_RAW: pa->type = SVR4_T_CLTS; pa->protocol = IPPROTO_RAW; DPRINTF(("getparm(raw)\n")); return; default: pa->type = 0; pa->protocol = 0; DPRINTF(("getparm(type %d?)\n", so->so_type)); return; } } static int si_ogetudata(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_si_oudata ud; struct svr4_si_sockparms pa; if (ioc->len != sizeof(ud) && ioc->len != sizeof(ud) - sizeof(int)) { DPRINTF(("SI_OGETUDATA: Wrong size %d != %d\n", sizeof(ud), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0) return error; getparm(fp, &pa); switch (pa.family) { case AF_INET: ud.tidusize = 16384; ud.addrsize = sizeof(struct svr4_sockaddr_in); if (pa.type == SVR4_SOCK_STREAM) ud.etsdusize = 1; else ud.etsdusize = 0; break; case AF_LOCAL: ud.tidusize = 65536; ud.addrsize = 128; ud.etsdusize = 128; break; default: DPRINTF(("SI_OGETUDATA: Unsupported address family %d\n", pa.family)); return ENOSYS; } /* I have no idea what these should be! */ ud.optsize = 128; ud.tsdusize = 128; ud.servtype = pa.type; /* XXX: Fixme */ ud.so_state = 0; ud.so_options = 0; return copyout(&ud, ioc->buf, ioc->len); } static int si_sockparams(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { struct svr4_si_sockparms pa; getparm(fp, &pa); return copyout(&pa, ioc->buf, sizeof(pa)); } static int si_listen(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_strm *st = svr4_stream_get(fp); struct svr4_strmcmd lst; struct listen_args la; if (st == NULL) return EINVAL; if ((error = copyin(ioc->buf, &lst, ioc->len)) != 0) return error; if (lst.cmd != SVR4_TI_OLD_BIND_REQUEST) { DPRINTF(("si_listen: bad request %ld\n", lst.cmd)); return EINVAL; } /* * We are making assumptions again... 
*/ SCARG(&la, s) = fd; DPRINTF(("SI_LISTEN: fileno %d backlog = %d\n", fd, 5)); SCARG(&la, backlog) = 5; if ((error = listen(td, &la)) != 0) { DPRINTF(("SI_LISTEN: listen failed %d\n", error)); return error; } st->s_cmd = SVR4_TI__ACCEPT_WAIT; lst.cmd = SVR4_TI_BIND_REPLY; switch (st->s_family) { case AF_INET: /* XXX: Fill the length here */ break; case AF_LOCAL: lst.len = 140; lst.pad[28] = 0x00000000; /* magic again */ lst.pad[29] = 0x00000800; /* magic again */ lst.pad[30] = 0x80001400; /* magic again */ break; default: DPRINTF(("SI_LISTEN: Unsupported address family %d\n", st->s_family)); return ENOSYS; } if ((error = copyout(&lst, ioc->buf, ioc->len)) != 0) return error; return 0; } static int si_getudata(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_si_udata ud; if (sizeof(ud) != ioc->len) { DPRINTF(("SI_GETUDATA: Wrong size %d != %d\n", sizeof(ud), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0) return error; getparm(fp, &ud.sockparms); switch (ud.sockparms.family) { case AF_INET: DPRINTF(("getudata_inet\n")); ud.tidusize = 16384; ud.tsdusize = 16384; ud.addrsize = sizeof(struct svr4_sockaddr_in); if (ud.sockparms.type == SVR4_SOCK_STREAM) ud.etsdusize = 1; else ud.etsdusize = 0; ud.optsize = 0; break; case AF_LOCAL: DPRINTF(("getudata_local\n")); ud.tidusize = 65536; ud.tsdusize = 128; ud.addrsize = 128; ud.etsdusize = 128; ud.optsize = 128; break; default: DPRINTF(("SI_GETUDATA: Unsupported address family %d\n", ud.sockparms.family)); return ENOSYS; } ud.servtype = ud.sockparms.type; DPRINTF(("ud.servtype = %d\n", ud.servtype)); /* XXX: Fixme */ ud.so_state = 0; ud.so_options = 0; return copyout(&ud, ioc->buf, sizeof(ud)); } static int si_shutdown(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct shutdown_args ap; if (ioc->len != sizeof(SCARG(&ap, how))) { DPRINTF(("SI_SHUTDOWN: Wrong size %d != 
%d\n", sizeof(SCARG(&ap, how)), ioc->len)); return EINVAL; } if ((error = copyin(ioc->buf, &SCARG(&ap, how), ioc->len)) != 0) return error; SCARG(&ap, s) = fd; return shutdown(td, &ap); } static int sockmod(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { switch (ioc->cmd) { case SVR4_SI_OGETUDATA: DPRINTF(("SI_OGETUDATA\n")); return si_ogetudata(fp, fd, ioc, td); case SVR4_SI_SHUTDOWN: DPRINTF(("SI_SHUTDOWN\n")); return si_shutdown(fp, fd, ioc, td); case SVR4_SI_LISTEN: DPRINTF(("SI_LISTEN\n")); return si_listen(fp, fd, ioc, td); case SVR4_SI_SETMYNAME: DPRINTF(("SI_SETMYNAME\n")); return 0; case SVR4_SI_SETPEERNAME: DPRINTF(("SI_SETPEERNAME\n")); return 0; case SVR4_SI_GETINTRANSIT: DPRINTF(("SI_GETINTRANSIT\n")); return 0; case SVR4_SI_TCL_LINK: DPRINTF(("SI_TCL_LINK\n")); return 0; case SVR4_SI_TCL_UNLINK: DPRINTF(("SI_TCL_UNLINK\n")); return 0; case SVR4_SI_SOCKPARAMS: DPRINTF(("SI_SOCKPARAMS\n")); return si_sockparams(fp, fd, ioc, td); case SVR4_SI_GETUDATA: DPRINTF(("SI_GETUDATA\n")); return si_getudata(fp, fd, ioc, td); default: DPRINTF(("Unknown sockmod ioctl %lx\n", ioc->cmd)); return 0; } } static int ti_getinfo(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_infocmd info; memset(&info, 0, sizeof(info)); if ((error = copyin(ioc->buf, &info, ioc->len)) != 0) return error; if (info.cmd != SVR4_TI_INFO_REQUEST) return EINVAL; info.cmd = SVR4_TI_INFO_REPLY; info.tsdu = 0; info.etsdu = 1; info.cdata = -2; info.ddata = -2; info.addr = 16; info.opt = -1; info.tidu = 16384; info.serv = 2; info.current = 0; info.provider = 2; ioc->len = sizeof(info); if ((error = copyout(&info, ioc->buf, ioc->len)) != 0) return error; return 0; } static int ti_bind(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { int error; struct svr4_strm *st = svr4_stream_get(fp); struct sockaddr_in sain; struct sockaddr_un saun; caddr_t sg; void 
*skp, *sup = NULL; int sasize; struct svr4_strmcmd bnd; struct bind_args ba; if (st == NULL) { DPRINTF(("ti_bind: bad file descriptor\n")); return EINVAL; } if ((error = copyin(ioc->buf, &bnd, ioc->len)) != 0) return error; if (bnd.cmd != SVR4_TI_OLD_BIND_REQUEST) { DPRINTF(("ti_bind: bad request %ld\n", bnd.cmd)); return EINVAL; } switch (st->s_family) { case AF_INET: skp = &sain; sasize = sizeof(sain); if (bnd.offs == 0) goto reply; netaddr_to_sockaddr_in(&sain, &bnd); DPRINTF(("TI_BIND: fam %d, port %d, addr %x\n", sain.sin_family, sain.sin_port, sain.sin_addr.s_addr)); break; case AF_LOCAL: skp = &saun; sasize = sizeof(saun); if (bnd.offs == 0) goto reply; netaddr_to_sockaddr_un(&saun, &bnd); if (saun.sun_path[0] == '\0') goto reply; DPRINTF(("TI_BIND: fam %d, path %s\n", saun.sun_family, saun.sun_path)); if ((error = clean_pipe(td, saun.sun_path)) != 0) return error; bnd.pad[28] = 0x00001000; /* magic again */ break; default: DPRINTF(("TI_BIND: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); if ((error = copyout(skp, sup, sasize)) != 0) return error; SCARG(&ba, s) = fd; DPRINTF(("TI_BIND: fileno %d\n", fd)); SCARG(&ba, name) = (void *) sup; SCARG(&ba, namelen) = sasize; if ((error = bind(td, &ba)) != 0) { DPRINTF(("TI_BIND: bind failed %d\n", error)); return error; } reply: if (sup == NULL) { memset(&bnd, 0, sizeof(bnd)); bnd.len = sasize + 4; bnd.offs = 0x10; /* XXX */ } bnd.cmd = SVR4_TI_BIND_REPLY; if ((error = copyout(&bnd, ioc->buf, ioc->len)) != 0) return error; return 0; } static int timod(fp, fd, ioc, td) struct file *fp; int fd; struct svr4_strioctl *ioc; struct thread *td; { switch (ioc->cmd) { case SVR4_TI_GETINFO: DPRINTF(("TI_GETINFO\n")); return ti_getinfo(fp, fd, ioc, td); case SVR4_TI_OPTMGMT: DPRINTF(("TI_OPTMGMT\n")); return 0; case SVR4_TI_BIND: DPRINTF(("TI_BIND\n")); return ti_bind(fp, fd, ioc, td); case SVR4_TI_UNBIND: DPRINTF(("TI_UNBIND\n")); return 0; default: 
DPRINTF(("Unknown timod ioctl %lx\n", ioc->cmd)); return 0; } } int svr4_stream_ti_ioctl(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { struct svr4_strbuf skb, *sub = (struct svr4_strbuf *) dat; struct svr4_strm *st = svr4_stream_get(fp); int error; void *skp, *sup; struct sockaddr_in sain; struct sockaddr_un saun; struct svr4_strmcmd sc; int sasize; caddr_t sg; int *lenp; DPRINTF(("svr4_stream_ti_ioctl\n")); if (st == NULL) return EINVAL; sc.offs = 0x10; if ((error = copyin(sub, &skb, sizeof(skb))) != 0) { DPRINTF(("ti_ioctl: error copying in strbuf\n")); return error; } switch (st->s_family) { case AF_INET: skp = &sain; sasize = sizeof(sain); break; case AF_LOCAL: skp = &saun; sasize = sizeof(saun); break; default: DPRINTF(("ti_ioctl: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); lenp = stackgap_alloc(&sg, sizeof(*lenp)); if ((error = copyout(&sasize, lenp, sizeof(*lenp))) != 0) { DPRINTF(("ti_ioctl: error copying out lenp\n")); return error; } switch (cmd) { case SVR4_TI_GETMYNAME: DPRINTF(("TI_GETMYNAME\n")); { struct getsockname_args ap; SCARG(&ap, fdes) = fd; SCARG(&ap, asa) = sup; SCARG(&ap, alen) = lenp; if ((error = getsockname(td, &ap)) != 0) { DPRINTF(("ti_ioctl: getsockname error\n")); return error; } } break; case SVR4_TI_GETPEERNAME: DPRINTF(("TI_GETPEERNAME\n")); { struct getpeername_args ap; SCARG(&ap, fdes) = fd; SCARG(&ap, asa) = sup; SCARG(&ap, alen) = lenp; if ((error = getpeername(td, &ap)) != 0) { DPRINTF(("ti_ioctl: getpeername error\n")); return error; } } break; case SVR4_TI_SETMYNAME: DPRINTF(("TI_SETMYNAME\n")); return 0; case SVR4_TI_SETPEERNAME: DPRINTF(("TI_SETPEERNAME\n")); return 0; default: DPRINTF(("ti_ioctl: Unknown ioctl %lx\n", cmd)); return ENOSYS; } if ((error = copyin(sup, skp, sasize)) != 0) { DPRINTF(("ti_ioctl: error copying in socket data\n")); return error; } if ((error = 
copyin(lenp, &sasize, sizeof(*lenp))) != 0) { DPRINTF(("ti_ioctl: error copying in socket size\n")); return error; } switch (st->s_family) { case AF_INET: sockaddr_to_netaddr_in(&sc, &sain); skb.len = sasize; break; case AF_LOCAL: sockaddr_to_netaddr_un(&sc, &saun); skb.len = sasize + 4; break; default: return ENOSYS; } if ((error = copyout(SVR4_ADDROF(&sc), skb.buf, sasize)) != 0) { DPRINTF(("ti_ioctl: error copying out socket data\n")); return error; } if ((error = copyout(&skb, sub, sizeof(skb))) != 0) { DPRINTF(("ti_ioctl: error copying out strbuf\n")); return error; } return error; } static int i_nread(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { int error; int nread = 0; /* * We are supposed to return the message length in nread, and the * number of messages in retval. We don't have the notion of number * of stream messages, so we just find out if we have any bytes waiting * for us, and if we do, then we assume that we have at least one * message waiting for us. */ if ((error = fo_ioctl(fp, FIONREAD, (caddr_t) &nread, td)) != 0) return error; if (nread != 0) *retval = 1; else *retval = 0; return copyout(&nread, dat, sizeof(nread)); } static int i_fdinsert(fp, td, retval, fd, cmd, dat) struct file *fp; struct thread *td; register_t *retval; int fd; u_long cmd; caddr_t dat; { /* * Major hack again here. We assume that we are using this to * implement accept(2). If that is the case, we have already * called accept, and we have stored the file descriptor in * afd. We find the file descriptor that the code wants to use * in fd insert, and then we dup2() our accepted file descriptor * to it. 
*/
	int error;
	struct svr4_strm *st = svr4_stream_get(fp);
	struct svr4_strfdinsert fdi;
	struct dup2_args d2p;
	struct close_args clp;

	if (st == NULL) {
		DPRINTF(("fdinsert: bad file type\n"));
		return EINVAL;
	}

	/* No accepted descriptor was stashed by a prior getmsg/accept. */
	if (st->s_afd == -1) {
		DPRINTF(("fdinsert: accept fd not found\n"));
		return ENOENT;
	}

	if ((error = copyin(dat, &fdi, sizeof(fdi))) != 0) {
		DPRINTF(("fdinsert: copyin failed %d\n", error));
		return error;
	}

	/* Move the stashed accept fd onto the descriptor the caller chose. */
	SCARG(&d2p, from) = st->s_afd;
	SCARG(&d2p, to) = fdi.fd;

	if ((error = dup2(td, &d2p)) != 0) {
		DPRINTF(("fdinsert: dup2(%d, %d) failed %d\n",
		    st->s_afd, fdi.fd, error));
		return error;
	}

	/* Close our copy; the caller now owns the descriptor. */
	SCARG(&clp, fd) = st->s_afd;

	if ((error = close(td, &clp)) != 0) {
		DPRINTF(("fdinsert: close(%d) failed %d\n",
		    st->s_afd, error));
		return error;
	}

	st->s_afd = -1;

	*retval = 0;
	return 0;
}

/*
 * Emulate the kernel/library-only _I_BIND_RSVD ioctl by creating a
 * FIFO at the pathname passed in `dat'.
 */
static int
_i_bind_rsvd(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	struct mkfifo_args ap;

	/*
	 * This is a supposed to be a kernel and library only ioctl.
	 * It gets called before ti_bind, when we have a unix
	 * socket, to physically create the socket transport and
	 * ``reserve'' it. I don't know how this get reserved inside
	 * the kernel, but we are going to create it nevertheless.
	 */
	SCARG(&ap, path) = dat;
	SCARG(&ap, mode) = S_IFIFO;

	return mkfifo(td, &ap);
}

/*
 * Counterpart of _i_bind_rsvd: remove the reserved transport node.
 */
static int
_i_rele_rsvd(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	struct unlink_args ap;

	/*
	 * This is a supposed to be a kernel and library only ioctl.
	 * I guess it is supposed to release the socket.
*/
	SCARG(&ap, path) = dat;

	return unlink(td, &ap);
}

/*
 * Emulate the STREAMS I_STR ioctl: copy in the svr4_strioctl request,
 * dispatch it to the sockmod or timod module emulation based on the
 * module id in the high byte of the command, and copy the (possibly
 * updated) request back out.
 */
static int
i_str(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	int error;
	struct svr4_strioctl ioc;

	if ((error = copyin(dat, &ioc, sizeof(ioc))) != 0)
		return error;

#ifdef DEBUG_SVR4
	if ((error = show_ioc(">", &ioc)) != 0)
		return error;
#endif /* DEBUG_SVR4 */

	/* High byte of the embedded command selects the module. */
	switch (ioc.cmd & 0xff00) {
	case SVR4_SIMOD:
		if ((error = sockmod(fp, fd, &ioc, td)) != 0)
			return error;
		break;

	case SVR4_TIMOD:
		if ((error = timod(fp, fd, &ioc, td)) != 0)
			return error;
		break;

	default:
		DPRINTF(("Unimplemented module %c %ld\n",
		    (char) (cmd >> 8), cmd & 0xff));
		return 0;
	}

#ifdef DEBUG_SVR4
	if ((error = show_ioc("<", &ioc)) != 0)
		return error;
#endif /* DEBUG_SVR4 */

	return copyout(&ioc, dat, sizeof(ioc));
}

static int
i_setsig(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	/*
	 * This is the best we can do for now; we cannot generate
	 * signals only for specific events so the signal mask gets
	 * ignored; we save it just to pass it to a possible I_GETSIG...
	 *
	 * We also have to fix the O_ASYNC fcntl bit, so the
	 * process will get SIGPOLLs.
*/
	struct fcntl_args fa;
	int error;
	register_t oflags, flags;
	struct svr4_strm *st = svr4_stream_get(fp);

	if (st == NULL) {
		DPRINTF(("i_setsig: bad file descriptor\n"));
		return EINVAL;
	}

	/* get old status flags */
	SCARG(&fa, fd) = fd;
	SCARG(&fa, cmd) = F_GETFL;

	if ((error = fcntl(td, &fa)) != 0)
		return error;

	oflags = td->td_retval[0];

	/* update the flags */
	if (dat != NULL) {
		int mask;

		/* Caller wants events: turn async I/O on. */
		flags = oflags | O_ASYNC;

		if ((error = copyin(dat, &mask, sizeof(mask))) != 0) {
			DPRINTF(("i_setsig: bad eventmask pointer\n"));
			return error;
		}

		/* Reject bits outside the supported event set. */
		if (mask & SVR4_S_ALLMASK) {
			DPRINTF(("i_setsig: bad eventmask data %x\n", mask));
			return EINVAL;
		}

		/* Remember the mask only so I_GETSIG can return it. */
		st->s_eventmask = mask;
	} else {
		/* NULL mask disables signal delivery. */
		flags = oflags & ~O_ASYNC;
		st->s_eventmask = 0;
	}

	/* set the new flags, if changed */
	if (flags != oflags) {
		SCARG(&fa, cmd) = F_SETFL;
		SCARG(&fa, arg) = (long) flags;

		if ((error = fcntl(td, &fa)) != 0)
			return error;

		flags = td->td_retval[0];
	}

	/* set up SIGIO receiver if needed */
	if (dat != NULL) {
		SCARG(&fa, cmd) = F_SETOWN;
		SCARG(&fa, arg) = (long) td->td_proc->p_pid;

		return fcntl(td, &fa);
	}
	return 0;
}

/*
 * Emulate the STREAMS I_GETSIG ioctl: hand back the event mask that a
 * previous I_SETSIG stashed on the stream (or do nothing for NULL).
 */
static int
i_getsig(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	int error;

	if (dat != NULL) {
		struct svr4_strm *st = svr4_stream_get(fp);

		if (st == NULL) {
			DPRINTF(("i_getsig: bad file descriptor\n"));
			return EINVAL;
		}

		if ((error = copyout(&st->s_eventmask, dat,
		    sizeof(st->s_eventmask))) != 0) {
			DPRINTF(("i_getsig: bad eventmask pointer\n"));
			return error;
		}
	}
	return 0;
}

/*
 * Top-level STREAMS ioctl dispatcher for emulated streams; most of
 * the commands are accepted and quietly ignored.
 */
int
svr4_stream_ioctl(fp, td, retval, fd, cmd, dat)
	struct file *fp;
	struct thread *td;
	register_t *retval;
	int fd;
	u_long cmd;
	caddr_t dat;
{
	*retval = 0;

	/*
	 * All the following stuff assumes "sockmod" is pushed...
*/
	switch (cmd) {
	case SVR4_I_NREAD:
		DPRINTF(("I_NREAD\n"));
		return i_nread(fp, td, retval, fd, cmd, dat);

	case SVR4_I_PUSH:
		DPRINTF(("I_PUSH %p\n", dat));
#if defined(DEBUG_SVR4)
		show_strbuf((struct svr4_strbuf *)dat);
#endif
		return 0;

	case SVR4_I_POP:
		DPRINTF(("I_POP\n"));
		return 0;

	case SVR4_I_LOOK:
		DPRINTF(("I_LOOK\n"));
		return 0;

	case SVR4_I_FLUSH:
		DPRINTF(("I_FLUSH\n"));
		return 0;

	case SVR4_I_SRDOPT:
		DPRINTF(("I_SRDOPT\n"));
		return 0;

	case SVR4_I_GRDOPT:
		DPRINTF(("I_GRDOPT\n"));
		return 0;

	case SVR4_I_STR:
		DPRINTF(("I_STR\n"));
		return i_str(fp, td, retval, fd, cmd, dat);

	case SVR4_I_SETSIG:
		DPRINTF(("I_SETSIG\n"));
		return i_setsig(fp, td, retval, fd, cmd, dat);

	case SVR4_I_GETSIG:
	        DPRINTF(("I_GETSIG\n"));
	        return i_getsig(fp, td, retval, fd, cmd, dat);

	case SVR4_I_FIND:
		DPRINTF(("I_FIND\n"));
		/*
		 * Here we are not pushing modules really, we just
		 * pretend all are present
		 */
		*retval = 0;
		return 0;

	case SVR4_I_LINK:
		DPRINTF(("I_LINK\n"));
		return 0;

	case SVR4_I_UNLINK:
		DPRINTF(("I_UNLINK\n"));
		return 0;

	case SVR4_I_ERECVFD:
		DPRINTF(("I_ERECVFD\n"));
		return 0;

	case SVR4_I_PEEK:
		DPRINTF(("I_PEEK\n"));
		return 0;

	case SVR4_I_FDINSERT:
		DPRINTF(("I_FDINSERT\n"));
		return i_fdinsert(fp, td, retval, fd, cmd, dat);

	case SVR4_I_SENDFD:
		DPRINTF(("I_SENDFD\n"));
		return 0;

	case SVR4_I_RECVFD:
		DPRINTF(("I_RECVFD\n"));
		return 0;

	case SVR4_I_SWROPT:
		DPRINTF(("I_SWROPT\n"));
		return 0;

	case SVR4_I_GWROPT:
		DPRINTF(("I_GWROPT\n"));
		return 0;

	case SVR4_I_LIST:
		DPRINTF(("I_LIST\n"));
		return 0;

	case SVR4_I_PLINK:
		DPRINTF(("I_PLINK\n"));
		return 0;

	case SVR4_I_PUNLINK:
		DPRINTF(("I_PUNLINK\n"));
		return 0;

	case SVR4_I_SETEV:
		DPRINTF(("I_SETEV\n"));
		return 0;

	case SVR4_I_GETEV:
		DPRINTF(("I_GETEV\n"));
		return 0;

	case SVR4_I_STREV:
		DPRINTF(("I_STREV\n"));
		return 0;

	case SVR4_I_UNSTREV:
		DPRINTF(("I_UNSTREV\n"));
		return 0;

	case SVR4_I_FLUSHBAND:
		DPRINTF(("I_FLUSHBAND\n"));
		return 0;

	case SVR4_I_CKBAND:
		DPRINTF(("I_CKBAND\n"));
		return 0;

	case SVR4_I_GETBAND:
		/* XXX debug string misspells I_GETBAND as "I_GETBANK" */
		DPRINTF(("I_GETBANK\n"));
		return 0;

	case
SVR4_I_ATMARK: DPRINTF(("I_ATMARK\n")); return 0; case SVR4_I_SETCLTIME: DPRINTF(("I_SETCLTIME\n")); return 0; case SVR4_I_GETCLTIME: DPRINTF(("I_GETCLTIME\n")); return 0; case SVR4_I_CANPUT: DPRINTF(("I_CANPUT\n")); return 0; case SVR4__I_BIND_RSVD: DPRINTF(("_I_BIND_RSVD\n")); return _i_bind_rsvd(fp, td, retval, fd, cmd, dat); case SVR4__I_RELE_RSVD: DPRINTF(("_I_RELE_RSVD\n")); return _i_rele_rsvd(fp, td, retval, fd, cmd, dat); default: DPRINTF(("unimpl cmd = %lx\n", cmd)); break; } return 0; } int svr4_sys_putmsg(td, uap) register struct thread *td; struct svr4_sys_putmsg_args *uap; { - struct filedesc *fdp = td->td_proc->p_fd; + struct file *fp; + int error; + + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { +#ifdef DEBUG_SVR4 + uprintf("putmsg: bad fp\n"); +#endif + return EBADF; + } + error = svr4_do_putmsg(td, uap, fp); + fdrop(fp, td); + return (error); +} + +static int +svr4_do_putmsg(td, uap, fp) + struct thread *td; + struct svr4_sys_putmsg_args *uap; struct file *fp; +{ struct svr4_strbuf dat, ctl; struct svr4_strmcmd sc; struct sockaddr_in sain; struct sockaddr_un saun; void *skp, *sup; int sasize, *retval; struct svr4_strm *st; int error; caddr_t sg; retval = td->td_retval; - fp = fdp->fd_ofiles[SCARG(uap, fd)]; - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) { #ifdef DEBUG_SVR4 - uprintf("putmsg: bad fp\n"); -#endif - return EBADF; - } - -#ifdef DEBUG_SVR4 show_msg(">putmsg", SCARG(uap, fd), SCARG(uap, ctl), SCARG(uap, dat), SCARG(uap, flags)); #endif /* DEBUG_SVR4 */ - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) { -#ifdef DEBUG_SVR4 - uprintf("putmsg: bad fp(2)\n"); -#endif - return EBADF; - } + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); if (SCARG(uap, ctl) != NULL) { if ((error = copyin(SCARG(uap, ctl), &ctl, sizeof(ctl))) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: copyin(): %d\n", error); #endif return error; } } else ctl.len = -1; if (SCARG(uap, dat) != NULL) { if ((error = copyin(SCARG(uap, dat), &dat, 
sizeof(dat))) != 0) { #ifdef DEBUG_SVR4 uprintf("putmsg: copyin(): %d (2)\n", error); #endif return error; } } else dat.len = -1; /* * Only for sockets for now. */ if ((st = svr4_stream_get(fp)) == NULL) { DPRINTF(("putmsg: bad file type\n")); return EINVAL; } if (ctl.len > sizeof(sc)) { DPRINTF(("putmsg: Bad control size %d != %d\n", ctl.len, sizeof(struct svr4_strmcmd))); return EINVAL; } if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) return error; switch (st->s_family) { case AF_INET: if (sc.len != sizeof(sain)) { if (sc.cmd == SVR4_TI_DATA_REQUEST) { struct write_args wa; /* Solaris seems to use sc.cmd = 3 to * send "expedited" data. telnet uses * this for options processing, sending EOF, * etc. I'm sure other things use it too. * I don't have any documentation * on it, so I'm making a guess that this * is how it works. newton@atdot.dotat.org XXX */ DPRINTF(("sending expedited data ??\n")); SCARG(&wa, fd) = SCARG(uap, fd); SCARG(&wa, buf) = dat.buf; SCARG(&wa, nbyte) = dat.len; return write(td, &wa); } DPRINTF(("putmsg: Invalid inet length %ld\n", sc.len)); return EINVAL; } netaddr_to_sockaddr_in(&sain, &sc); skp = &sain; sasize = sizeof(sain); error = sain.sin_family != st->s_family; break; case AF_LOCAL: if (ctl.len == 8) { /* We are doing an accept; succeed */ DPRINTF(("putmsg: Do nothing\n")); *retval = 0; return 0; } else { /* Maybe we've been given a device/inode pair */ udev_t *dev = SVR4_ADDROF(&sc); ino_t *ino = (ino_t *) &dev[1]; skp = svr4_find_socket(td, fp, *dev, *ino); if (skp == NULL) { skp = &saun; /* I guess we have it by name */ netaddr_to_sockaddr_un(skp, &sc); } sasize = sizeof(saun); } break; default: DPRINTF(("putmsg: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); if ((error = copyout(skp, sup, sasize)) != 0) return error; switch (st->s_cmd = sc.cmd) { case SVR4_TI_CONNECT_REQUEST: /* connect */ { struct connect_args co; SCARG(&co, s) = SCARG(uap, fd); 
SCARG(&co, name) = (void *) sup; SCARG(&co, namelen) = (int) sasize; return connect(td, &co); } case SVR4_TI_SENDTO_REQUEST: /* sendto */ { struct msghdr msg; struct iovec aiov; msg.msg_name = (caddr_t) sup; msg.msg_namelen = sasize; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; msg.msg_flags = 0; aiov.iov_base = dat.buf; aiov.iov_len = dat.len; #if 0 error = so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, uio->uio_td); #endif error = svr4_sendit(td, SCARG(uap, fd), &msg, SCARG(uap, flags)); DPRINTF(("sendto_request error: %d\n", error)); *retval = 0; return error; } default: DPRINTF(("putmsg: Unimplemented command %lx\n", sc.cmd)); return ENOSYS; } } int +svr4_sys_getmsg(p, uap) + struct proc *p; + struct svr4_sys_getmsg_args *uap; +{ + struct file *fp; + int error; + + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { +#ifdef DEBUG_SVR4 + uprintf("getmsg: bad fp\n"); +#endif + return EBADF; + } + error = svr4_do_getmsg(p, uap, fp); + fdrop(fp, td); + return (error); +} + +int svr4_sys_getmsg(td, uap) register struct thread *td; struct svr4_sys_getmsg_args *uap; + struct file *fp; { - struct filedesc *fdp = td->td_proc->p_fd; - struct file *fp; struct getpeername_args ga; struct accept_args aa; struct svr4_strbuf dat, ctl; struct svr4_strmcmd sc; int error, *retval; struct msghdr msg; struct iovec aiov; struct sockaddr_in sain; struct sockaddr_un saun; void *skp, *sup; int sasize; struct svr4_strm *st; int *flen; int fl; caddr_t sg; retval = td->td_retval; - fp = fdp->fd_ofiles[SCARG(uap, fd)]; - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) - return EBADF; + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); memset(&sc, 0, sizeof(sc)); #ifdef DEBUG_SVR4 show_msg(">getmsg", SCARG(uap, fd), SCARG(uap, ctl), SCARG(uap, dat), 0); #endif /* DEBUG_SVR4 */ - - if (((u_int)SCARG(uap, fd) >= fdp->fd_nfiles) || (fp == NULL)) - return EBADF; if (SCARG(uap, ctl) != NULL) { if ((error = copyin(SCARG(uap, ctl), &ctl, sizeof(ctl))) != 0) return error; 
} else { ctl.len = -1; ctl.maxlen = 0; } if (SCARG(uap, dat) != NULL) { if ((error = copyin(SCARG(uap, dat), &dat, sizeof(dat))) != 0) return error; } else { dat.len = -1; dat.maxlen = 0; } /* * Only for sockets for now. */ if ((st = svr4_stream_get(fp)) == NULL) { DPRINTF(("getmsg: bad file type\n")); return EINVAL; } if (ctl.maxlen == -1 || dat.maxlen == -1) { DPRINTF(("getmsg: Cannot handle -1 maxlen (yet)\n")); return ENOSYS; } switch (st->s_family) { case AF_INET: skp = &sain; sasize = sizeof(sain); break; case AF_LOCAL: skp = &saun; sasize = sizeof(saun); break; default: DPRINTF(("getmsg: Unsupported address family %d\n", st->s_family)); return ENOSYS; } sg = stackgap_init(); sup = stackgap_alloc(&sg, sasize); flen = (int *) stackgap_alloc(&sg, sizeof(*flen)); fl = sasize; if ((error = copyout(&fl, flen, sizeof(fl))) != 0) return error; switch (st->s_cmd) { case SVR4_TI_CONNECT_REQUEST: DPRINTF(("getmsg: TI_CONNECT_REQUEST\n")); /* * We do the connect in one step, so the putmsg should * have gotten the error. */ sc.cmd = SVR4_TI_OK_REPLY; sc.len = 0; ctl.len = 8; dat.len = -1; fl = 1; st->s_cmd = sc.cmd; break; case SVR4_TI_OK_REPLY: DPRINTF(("getmsg: TI_OK_REPLY\n")); /* * We are immediately after a connect reply, so we send * a connect verification. 
*/ SCARG(&ga, fdes) = SCARG(uap, fd); SCARG(&ga, asa) = (void *) sup; SCARG(&ga, alen) = flen; if ((error = getpeername(td, &ga)) != 0) { DPRINTF(("getmsg: getpeername failed %d\n", error)); return error; } if ((error = copyin(sup, skp, sasize)) != 0) return error; sc.cmd = SVR4_TI_CONNECT_REPLY; sc.pad[0] = 0x4; sc.offs = 0x18; sc.pad[1] = 0x14; sc.pad[2] = 0x04000402; switch (st->s_family) { case AF_INET: sc.len = sasize; sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sc.len = sasize + 4; sockaddr_to_netaddr_un(&sc, &saun); break; default: return ENOSYS; } ctl.len = 40; dat.len = -1; fl = 0; st->s_cmd = sc.cmd; break; case SVR4_TI__ACCEPT_OK: DPRINTF(("getmsg: TI__ACCEPT_OK\n")); /* * We do the connect in one step, so the putmsg should * have gotten the error. */ sc.cmd = SVR4_TI_OK_REPLY; sc.len = 1; ctl.len = 8; dat.len = -1; fl = 1; st->s_cmd = SVR4_TI__ACCEPT_WAIT; break; case SVR4_TI__ACCEPT_WAIT: DPRINTF(("getmsg: TI__ACCEPT_WAIT\n")); /* * We are after a listen, so we try to accept... 
*/ SCARG(&aa, s) = SCARG(uap, fd); SCARG(&aa, name) = (void *) sup; SCARG(&aa, anamelen) = flen; if ((error = accept(td, &aa)) != 0) { DPRINTF(("getmsg: accept failed %d\n", error)); return error; } st->s_afd = *retval; DPRINTF(("getmsg: Accept fd = %d\n", st->s_afd)); if ((error = copyin(sup, skp, sasize)) != 0) return error; sc.cmd = SVR4_TI_ACCEPT_REPLY; sc.offs = 0x18; sc.pad[0] = 0x0; switch (st->s_family) { case AF_INET: sc.pad[1] = 0x28; sockaddr_to_netaddr_in(&sc, &sain); ctl.len = 40; sc.len = sasize; break; case AF_LOCAL: sc.pad[1] = 0x00010000; sc.pad[2] = 0xf6bcdaa0; /* I don't know what that is */ sc.pad[3] = 0x00010000; ctl.len = 134; sc.len = sasize + 4; break; default: return ENOSYS; } dat.len = -1; fl = 0; st->s_cmd = SVR4_TI__ACCEPT_OK; break; case SVR4_TI_SENDTO_REQUEST: DPRINTF(("getmsg: TI_SENDTO_REQUEST\n")); if (ctl.maxlen > 36 && ctl.len < 36) ctl.len = 36; if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) return error; switch (st->s_family) { case AF_INET: sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sockaddr_to_netaddr_un(&sc, &saun); break; default: return ENOSYS; } msg.msg_name = (caddr_t) sup; msg.msg_namelen = sasize; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; aiov.iov_base = dat.buf; aiov.iov_len = dat.maxlen; msg.msg_flags = 0; error = svr4_recvit(td, SCARG(uap, fd), &msg, (caddr_t) flen); if (error) { DPRINTF(("getmsg: recvit failed %d\n", error)); return error; } if ((error = copyin(msg.msg_name, skp, sasize)) != 0) return error; sc.cmd = SVR4_TI_RECVFROM_IND; switch (st->s_family) { case AF_INET: sc.len = sasize; sockaddr_to_netaddr_in(&sc, &sain); break; case AF_LOCAL: sc.len = sasize + 4; sockaddr_to_netaddr_un(&sc, &saun); break; default: return ENOSYS; } dat.len = *retval; fl = 0; st->s_cmd = sc.cmd; break; default: st->s_cmd = sc.cmd; if (st->s_cmd == SVR4_TI_CONNECT_REQUEST) { struct read_args ra; /* More weirdness: Again, I can't find documentation * to back this up, but when a process does a 
generic * "getmsg()" call it seems that the command field is * zero and the length of the data area is zero. I * think processes expect getmsg() to fill in dat.len * after reading at most dat.maxlen octets from the * stream. Since we're using sockets I can let * read() look after it and frob return values * appropriately (or inappropriately :-) * -- newton@atdot.dotat.org XXX */ SCARG(&ra, fd) = SCARG(uap, fd); SCARG(&ra, buf) = dat.buf; SCARG(&ra, nbyte) = dat.maxlen; if ((error = read(td, &ra)) != 0) { return error; } dat.len = *retval; *retval = 0; st->s_cmd = SVR4_TI_SENDTO_REQUEST; break; } DPRINTF(("getmsg: Unknown state %x\n", st->s_cmd)); return EINVAL; } if (SCARG(uap, ctl)) { if (ctl.len != -1) if ((error = copyout(&sc, ctl.buf, ctl.len)) != 0) return error; if ((error = copyout(&ctl, SCARG(uap, ctl), sizeof(ctl))) != 0) return error; } if (SCARG(uap, dat)) { if ((error = copyout(&dat, SCARG(uap, dat), sizeof(dat))) != 0) return error; } if (SCARG(uap, flags)) { /* XXX: Need translation */ if ((error = copyout(&fl, SCARG(uap, flags), sizeof(fl))) != 0) return error; } *retval = 0; #ifdef DEBUG_SVR4 show_msg(" */ #include #include #include #include #include #include #include #if __FreeBSD_version >= 500005 #include #else #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void aac_startup(void *arg); static void aac_add_container(struct aac_softc *sc, struct aac_mntinforesponse *mir, int f); /* Command Processing */ static void aac_startio(struct aac_softc *sc); static void aac_timeout(struct aac_softc *sc); static int aac_start(struct aac_command *cm); static void aac_complete(void *context, int pending); static int aac_bio_command(struct aac_softc *sc, struct aac_command **cmp); static void aac_bio_complete(struct aac_command *cm); static int aac_wait_command(struct aac_command *cm, int timeout); static void aac_host_command(struct aac_softc 
*sc); static void aac_host_response(struct aac_softc *sc); /* Command Buffer Management */ static int aac_alloc_command(struct aac_softc *sc, struct aac_command **cmp); static void aac_release_command(struct aac_command *cm); static void aac_map_command_helper(void *arg, bus_dma_segment_t *segs, int nseg, int error); static int aac_alloc_commands(struct aac_softc *sc); static void aac_free_commands(struct aac_softc *sc); static void aac_map_command(struct aac_command *cm); static void aac_unmap_command(struct aac_command *cm); /* Hardware Interface */ static void aac_common_map(void *arg, bus_dma_segment_t *segs, int nseg, int error); static int aac_init(struct aac_softc *sc); static int aac_sync_command(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3, u_int32_t *sp); static int aac_sync_fib(struct aac_softc *sc, u_int32_t command, u_int32_t xferstate, void *data, u_int16_t datasize, void *result, u_int16_t *resultsize); static int aac_enqueue_fib(struct aac_softc *sc, int queue, struct aac_command *cm); static int aac_dequeue_fib(struct aac_softc *sc, int queue, u_int32_t *fib_size, struct aac_fib **fib_addr); static int aac_enqueue_response(struct aac_softc *sc, int queue, struct aac_fib *fib); /* Falcon/PPC interface */ static int aac_fa_get_fwstatus(struct aac_softc *sc); static void aac_fa_qnotify(struct aac_softc *sc, int qbit); static int aac_fa_get_istatus(struct aac_softc *sc); static void aac_fa_clear_istatus(struct aac_softc *sc, int mask); static void aac_fa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3); static int aac_fa_get_mailboxstatus(struct aac_softc *sc); static void aac_fa_set_interrupts(struct aac_softc *sc, int enable); struct aac_interface aac_fa_interface = { aac_fa_get_fwstatus, aac_fa_qnotify, aac_fa_get_istatus, aac_fa_clear_istatus, aac_fa_set_mailbox, aac_fa_get_mailboxstatus, aac_fa_set_interrupts }; /* 
StrongARM interface */ static int aac_sa_get_fwstatus(struct aac_softc *sc); static void aac_sa_qnotify(struct aac_softc *sc, int qbit); static int aac_sa_get_istatus(struct aac_softc *sc); static void aac_sa_clear_istatus(struct aac_softc *sc, int mask); static void aac_sa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3); static int aac_sa_get_mailboxstatus(struct aac_softc *sc); static void aac_sa_set_interrupts(struct aac_softc *sc, int enable); struct aac_interface aac_sa_interface = { aac_sa_get_fwstatus, aac_sa_qnotify, aac_sa_get_istatus, aac_sa_clear_istatus, aac_sa_set_mailbox, aac_sa_get_mailboxstatus, aac_sa_set_interrupts }; /* i960Rx interface */ static int aac_rx_get_fwstatus(struct aac_softc *sc); static void aac_rx_qnotify(struct aac_softc *sc, int qbit); static int aac_rx_get_istatus(struct aac_softc *sc); static void aac_rx_clear_istatus(struct aac_softc *sc, int mask); static void aac_rx_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3); static int aac_rx_get_mailboxstatus(struct aac_softc *sc); static void aac_rx_set_interrupts(struct aac_softc *sc, int enable); struct aac_interface aac_rx_interface = { aac_rx_get_fwstatus, aac_rx_qnotify, aac_rx_get_istatus, aac_rx_clear_istatus, aac_rx_set_mailbox, aac_rx_get_mailboxstatus, aac_rx_set_interrupts }; /* Debugging and Diagnostics */ static void aac_describe_controller(struct aac_softc *sc); static char *aac_describe_code(struct aac_code_lookup *table, u_int32_t code); /* Management Interface */ static d_open_t aac_open; static d_close_t aac_close; static d_ioctl_t aac_ioctl; static d_poll_t aac_poll; static int aac_ioctl_sendfib(struct aac_softc *sc, caddr_t ufib); static void aac_handle_aif(struct aac_softc *sc, struct aac_fib *fib); static int aac_rev_check(struct aac_softc *sc, caddr_t udata); static int aac_getnext_aif(struct aac_softc *sc, caddr_t arg); 
static int aac_return_aif(struct aac_softc *sc, caddr_t uptr); static int aac_query_disk(struct aac_softc *sc, caddr_t uptr); #define AAC_CDEV_MAJOR 150 static struct cdevsw aac_cdevsw = { aac_open, /* open */ aac_close, /* close */ noread, /* read */ nowrite, /* write */ aac_ioctl, /* ioctl */ aac_poll, /* poll */ nommap, /* mmap */ nostrategy, /* strategy */ "aac", /* name */ AAC_CDEV_MAJOR, /* major */ nodump, /* dump */ nopsize, /* psize */ 0, /* flags */ #if __FreeBSD_version < 500005 -1, /* bmaj */ #endif }; MALLOC_DEFINE(M_AACBUF, "aacbuf", "Buffers for the AAC driver"); /* sysctl node */ SYSCTL_NODE(_hw, OID_AUTO, aac, CTLFLAG_RD, 0, "AAC driver parameters"); /* * Device Interface */ /* * Initialise the controller and softc */ int aac_attach(struct aac_softc *sc) { int error, unit; debug_called(1); /* * Initialise per-controller queues. */ aac_initq_free(sc); aac_initq_ready(sc); aac_initq_busy(sc); aac_initq_complete(sc); aac_initq_bio(sc); #if __FreeBSD_version >= 500005 /* * Initialise command-completion task. */ TASK_INIT(&sc->aac_task_complete, 0, aac_complete, sc); #endif /* disable interrupts before we enable anything */ AAC_MASK_INTERRUPTS(sc); /* mark controller as suspended until we get ourselves organised */ sc->aac_state |= AAC_STATE_SUSPEND; /* * Allocate command structures. */ if ((error = aac_alloc_commands(sc)) != 0) return(error); /* * Initialise the adapter. */ if ((error = aac_init(sc)) != 0) return(error); /* * Print a little information about the controller. */ aac_describe_controller(sc); /* * Register to probe our containers later. 
*/ TAILQ_INIT(&sc->aac_container_tqh); AAC_LOCK_INIT(&sc->aac_container_lock, "AAC container lock"); /* * Lock for the AIF queue */ AAC_LOCK_INIT(&sc->aac_aifq_lock, "AAC AIF lock"); sc->aac_ich.ich_func = aac_startup; sc->aac_ich.ich_arg = sc; if (config_intrhook_establish(&sc->aac_ich) != 0) { device_printf(sc->aac_dev, "can't establish configuration hook\n"); return(ENXIO); } /* * Make the control device. */ unit = device_get_unit(sc->aac_dev); sc->aac_dev_t = make_dev(&aac_cdevsw, unit, UID_ROOT, GID_WHEEL, 0644, "aac%d", unit); #if __FreeBSD_version > 500005 (void)make_dev_alias(sc->aac_dev_t, "afa%d", unit); (void)make_dev_alias(sc->aac_dev_t, "hpn%d", unit); #endif sc->aac_dev_t->si_drv1 = sc; /* Create the AIF thread */ #if __FreeBSD_version > 500005 if (kthread_create((void(*)(void *))aac_host_command, sc, &sc->aifthread, 0, "aac%daif", unit)) #else if (kthread_create((void(*)(void *))aac_host_command, sc, &sc->aifthread, "aac%daif", unit)) #endif panic("Could not create AIF thread\n"); /* Register the shutdown method to only be called post-dump */ if ((EVENTHANDLER_REGISTER(shutdown_final, aac_shutdown, sc->aac_dev, SHUTDOWN_PRI_DEFAULT)) == NULL) device_printf(sc->aac_dev, "shutdown event registration failed\n"); return(0); } /* * Probe for containers, create disks. 
static void
aac_startup(void *arg)
{
	struct aac_softc *sc;
	struct aac_mntinfo mi;
	struct aac_mntinforesponse mir;
	u_int16_t rsize;
	int i = 0;

	debug_called(1);

	sc = (struct aac_softc *)arg;

	/* disconnect ourselves from the intrhook chain */
	config_intrhook_disestablish(&sc->aac_ich);

	/* loop over possible containers */
	mi.Command = VM_NameServe;
	mi.MntType = FT_FILESYS;
	do {
		/* request information on this container */
		mi.MntCount = i;
		rsize = sizeof(mir);
		if (aac_sync_fib(sc, ContainerCommand, 0, &mi,
				 sizeof(struct aac_mntinfo), &mir, &rsize)) {
			/*
			 * XXX `continue' here re-tests the while condition
			 * without incrementing i, and on the very first probe
			 * failure `mir' is still uninitialized — verify.
			 */
			debug(2, "error probing container %d", i);
			continue;
		}
		/* check response size */
		if (rsize != sizeof(mir)) {
			debug(2, "container info response wrong size "
			      "(%d should be %d)", rsize, sizeof(mir));
			continue;
		}

		aac_add_container(sc, &mir, 0);
		i++;
	} while ((i < mir.MntRespCount) && (i < AAC_MAX_CONTAINERS));

	/* poke the bus to actually attach the child devices */
	if (bus_generic_attach(sc->aac_dev))
		device_printf(sc->aac_dev, "bus_generic_attach failed\n");

	/* mark the controller up */
	sc->aac_state &= ~AAC_STATE_SUSPEND;

	/* enable interrupts now */
	AAC_UNMASK_INTERRUPTS(sc);

	/* enable the timeout watchdog */
	timeout((timeout_t*)aac_timeout, sc, AAC_PERIODIC_INTERVAL * hz);
}

/*
 * Create a device to respresent a new container
 */
static void
aac_add_container(struct aac_softc *sc, struct aac_mntinforesponse *mir, int f)
{
	struct aac_container *co;
	device_t child;

	/*
	 * Check container volume type for validity.  Note that many of
	 * the possible types may never show up.
*/
	if ((mir->Status == ST_OK) && (mir->MntTable[0].VolType != CT_NONE)) {
		MALLOC(co, struct aac_container *, sizeof *co, M_AACBUF,
		       M_NOWAIT);
		if (co == NULL)
			panic("Out of memory?!\n");
		debug(1, "id %x  name '%.16s'  size %u  type %d",
		      mir->MntTable[0].ObjectId,
		      mir->MntTable[0].FileSystemName,
		      mir->MntTable[0].Capacity, mir->MntTable[0].VolType);

		if ((child = device_add_child(sc->aac_dev, NULL, -1)) == NULL)
			device_printf(sc->aac_dev, "device_add_child failed\n");
		else
			device_set_ivars(child, co);
		/*
		 * XXX this runs even when device_add_child failed and
		 * child is NULL — verify against newbus semantics.
		 */
		device_set_desc(child, aac_describe_code(aac_container_types,
				mir->MntTable[0].VolType));
		co->co_disk = child;
		co->co_found = f;
		bcopy(&mir->MntTable[0], &co->co_mntobj,
		      sizeof(struct aac_mntobj));
		AAC_LOCK_ACQUIRE(&sc->aac_container_lock);
		TAILQ_INSERT_TAIL(&sc->aac_container_tqh, co, co_link);
		AAC_LOCK_RELEASE(&sc->aac_container_lock);
	}
}

/*
 * Free all of the resources associated with (sc)
 *
 * Should not be called if the controller is active.
 */
void
aac_free(struct aac_softc *sc)
{
	debug_called(1);

	/* remove the control device */
	if (sc->aac_dev_t != NULL)
		destroy_dev(sc->aac_dev_t);

	/* throw away any FIB buffers, discard the FIB DMA tag */
	if (sc->aac_fibs != NULL)
		aac_free_commands(sc);
	if (sc->aac_fib_dmat)
		bus_dma_tag_destroy(sc->aac_fib_dmat);

	/* destroy the common area */
	if (sc->aac_common) {
		bus_dmamap_unload(sc->aac_common_dmat, sc->aac_common_dmamap);
		bus_dmamem_free(sc->aac_common_dmat, sc->aac_common,
				sc->aac_common_dmamap);
	}
	if (sc->aac_common_dmat)
		bus_dma_tag_destroy(sc->aac_common_dmat);

	/* disconnect the interrupt handler */
	if (sc->aac_intr)
		bus_teardown_intr(sc->aac_dev, sc->aac_irq, sc->aac_intr);
	if (sc->aac_irq != NULL)
		bus_release_resource(sc->aac_dev, SYS_RES_IRQ, sc->aac_irq_rid,
				     sc->aac_irq);

	/* destroy data-transfer DMA tag */
	if (sc->aac_buffer_dmat)
		bus_dma_tag_destroy(sc->aac_buffer_dmat);

	/* destroy the parent DMA tag */
	if (sc->aac_parent_dmat)
		bus_dma_tag_destroy(sc->aac_parent_dmat);

	/* release the register window mapping */
	if
(sc->aac_regs_resource != NULL)
		bus_release_resource(sc->aac_dev, SYS_RES_MEMORY,
				     sc->aac_regs_rid, sc->aac_regs_resource);
}

/*
 * Disconnect from the controller completely, in preparation for unload.
 */
int
aac_detach(device_t dev)
{
	struct aac_softc *sc;
#if AAC_BROKEN
	int error;
#endif

	debug_called(1);

	sc = device_get_softc(dev);

	/* Refuse to detach while the control device is open. */
	if (sc->aac_state & AAC_STATE_OPEN)
		return(EBUSY);

#if AAC_BROKEN
	/* Ask the AIF thread to exit, and wait briefly for it. */
	if (sc->aifflags & AAC_AIFFLAGS_RUNNING) {
		sc->aifflags |= AAC_AIFFLAGS_EXIT;
		wakeup(sc->aifthread);
		tsleep(sc->aac_dev, PUSER | PCATCH, "aacdch", 30 * hz);
	}

	if (sc->aifflags & AAC_AIFFLAGS_RUNNING)
		panic("Cannot shutdown AIF thread\n");

	if ((error = aac_shutdown(dev)))
		return(error);

	aac_free(sc);

	return(0);
#else
	/* Detach is disabled while the driver is marked broken. */
	return (EBUSY);
#endif
}

/*
 * Bring the controller down to a dormant state and detach all child devices.
 *
 * This function is called before detach or system shutdown.
 *
 * Note that we can assume that the bioq on the controller is empty, as we won't
 * allow shutdown if any device is open.
 */
int
aac_shutdown(device_t dev)
{
	struct aac_softc *sc;
	struct aac_close_command cc;
	int s, i;

	debug_called(1);

	sc = device_get_softc(dev);

	s = splbio();
	sc->aac_state |= AAC_STATE_SUSPEND;

	/*
	 * Send a Container shutdown followed by a HostShutdown FIB to the
	 * controller to convince it that we don't want to talk to it anymore.
	 * We've been closed and all I/O completed already
	 */
	device_printf(sc->aac_dev, "shutting down controller...");

	cc.Command = VM_CloseAll;
	cc.ContainerId = 0xffffffff;
	if (aac_sync_fib(sc, ContainerCommand, 0, &cc, sizeof(cc),
	    NULL, NULL))
		printf("FAILED.\n");
	else {
		i = 0;
		/*
		 * XXX Issuing this command to the controller makes it shut down
		 * but also keeps it from coming back up without a reset of the
		 * PCI bus.  This is not desirable if you are just unloading the
		 * driver module with the intent to reload it later.
*/
		if (aac_sync_fib(sc, FsaHostShutdown, AAC_FIBSTATE_SHUTDOWN,
		    &i, sizeof(i), NULL, NULL)) {
			printf("FAILED.\n");
		} else {
			printf("done.\n");
		}
	}

	AAC_MASK_INTERRUPTS(sc);
	splx(s);
	return(0);
}

/*
 * Bring the controller to a quiescent state, ready for system suspend.
 */
int
aac_suspend(device_t dev)
{
	struct aac_softc *sc;
	int s;

	debug_called(1);

	sc = device_get_softc(dev);

	s = splbio();
	sc->aac_state |= AAC_STATE_SUSPEND;

	AAC_MASK_INTERRUPTS(sc);
	splx(s);
	return(0);
}

/*
 * Bring the controller back to a state ready for operation.
 */
int
aac_resume(device_t dev)
{
	struct aac_softc *sc;

	debug_called(1);

	sc = device_get_softc(dev);

	sc->aac_state &= ~AAC_STATE_SUSPEND;
	AAC_UNMASK_INTERRUPTS(sc);
	return(0);
}

/*
 * Take an interrupt.
 */
void
aac_intr(void *arg)
{
	struct aac_softc *sc;
	u_int16_t reason;

	debug_called(2);

	sc = (struct aac_softc *)arg;

	reason = AAC_GET_ISTATUS(sc);

	/* controller wants to talk to the log */
	if (reason & AAC_DB_PRINTF) {
		AAC_CLEAR_ISTATUS(sc, AAC_DB_PRINTF);
		aac_print_printf(sc);
	}

	/* controller has a message for us? */
	if (reason & AAC_DB_COMMAND_READY) {
		AAC_CLEAR_ISTATUS(sc, AAC_DB_COMMAND_READY);
		/* XXX What happens if the thread is already awake? */
		if (sc->aifflags & AAC_AIFFLAGS_RUNNING) {
			sc->aifflags |= AAC_AIFFLAGS_PENDING;
			wakeup(sc->aifthread);
		}
	}

	/* controller has a response for us?
*/ if (reason & AAC_DB_RESPONSE_READY) { AAC_CLEAR_ISTATUS(sc, AAC_DB_RESPONSE_READY); aac_host_response(sc); } /* * spurious interrupts that we don't use - reset the mask and clear the * interrupts */ if (reason & (AAC_DB_COMMAND_NOT_FULL | AAC_DB_RESPONSE_NOT_FULL)) { AAC_UNMASK_INTERRUPTS(sc); AAC_CLEAR_ISTATUS(sc, AAC_DB_COMMAND_NOT_FULL | AAC_DB_RESPONSE_NOT_FULL); } }; /* * Command Processing */ /* * Start as much queued I/O as possible on the controller */ static void aac_startio(struct aac_softc *sc) { struct aac_command *cm; debug_called(2); for (;;) { /* * Try to get a command that's been put off for lack of * resources */ cm = aac_dequeue_ready(sc); /* * Try to build a command off the bio queue (ignore error * return) */ if (cm == NULL) aac_bio_command(sc, &cm); /* nothing to do? */ if (cm == NULL) break; /* try to give the command to the controller */ if (aac_start(cm) == EBUSY) { /* put it on the ready queue for later */ aac_requeue_ready(cm); break; } } } /* * Deliver a command to the controller; allocate controller resources at the * last moment when possible. */ static int aac_start(struct aac_command *cm) { struct aac_softc *sc; int error; debug_called(2); sc = cm->cm_sc; /* get the command mapped */ aac_map_command(cm); /* fix up the address values in the FIB */ cm->cm_fib->Header.SenderFibAddress = (u_int32_t)cm->cm_fib; cm->cm_fib->Header.ReceiverFibAddress = cm->cm_fibphys; /* save a pointer to the command for speedy reverse-lookup */ cm->cm_fib->Header.SenderData = (u_int32_t)cm; /* XXX 64-bit physical * address issue */ /* put the FIB on the outbound queue */ error = aac_enqueue_fib(sc, cm->cm_queue, cm); return(error); } /* * Handle notification of one or more FIBs coming from the controller. 
 */
static void
aac_host_command(struct aac_softc *sc)
{
	struct aac_fib *fib;
	u_int32_t fib_size;
	int size;

	debug_called(2);

	/* AIF kernel thread main loop; exits when AAC_AIFFLAGS_EXIT is set */
	sc->aifflags |= AAC_AIFFLAGS_RUNNING;
	while (!(sc->aifflags & AAC_AIFFLAGS_EXIT)) {
		if (!(sc->aifflags & AAC_AIFFLAGS_PENDING))
			tsleep(sc->aifthread, PRIBIO, "aifthd", 15 * hz);
		sc->aifflags &= ~AAC_AIFFLAGS_PENDING;
		/* drain every FIB the adapter has queued for the host */
		for (;;) {
			if (aac_dequeue_fib(sc, AAC_HOST_NORM_CMD_QUEUE,
			    &fib_size, &fib))
				break;	/* nothing to do */
			AAC_PRINT_FIB(sc, fib);
			switch (fib->Header.Command) {
			case AifRequest:
				aac_handle_aif(sc, fib);
				break;
			default:
				device_printf(sc->aac_dev, "unknown command "
				    "from controller\n");
				break;
			}

			/* Return the AIF to the controller. */
			if ((fib->Header.XferState == 0) ||
			    (fib->Header.StructType != AAC_FIBTYPE_TFIB))
				break;

			if (fib->Header.XferState & AAC_FIBSTATE_FROMADAP) {
				fib->Header.XferState |= AAC_FIBSTATE_DONEHOST;
				*(AAC_FSAStatus*)fib->data = ST_OK;

				/* XXX Compute the Size field? */
				size = fib->Header.Size;
				if (size > sizeof(struct aac_fib)) {
					size = sizeof(struct aac_fib);
					fib->Header.Size = size;
				}
				/*
				 * Since we did not generate this command, it
				 * cannot go through the normal
				 * enqueue->startio chain.
				 */
				aac_enqueue_response(sc,
				    AAC_ADAP_NORM_RESP_QUEUE, fib);
			}
		}
	}
	sc->aifflags &= ~AAC_AIFFLAGS_RUNNING;
	/* let aac_detach's tsleep on aac_dev proceed */
	wakeup(sc->aac_dev);

#if __FreeBSD_version > 500005
	mtx_lock(&Giant);
#endif
	kthread_exit(0);
}

/*
 * Handle notification of one or more FIBs completed by the controller
 */
static void
aac_host_response(struct aac_softc *sc)
{
	struct aac_command *cm;
	struct aac_fib *fib;
	u_int32_t fib_size;

	debug_called(2);

	for (;;) {
		/* look for completed FIBs on our queue */
		if (aac_dequeue_fib(sc, AAC_HOST_NORM_RESP_QUEUE, &fib_size,
		    &fib))
			break;	/* nothing to do */

		/* get the command, unmap and queue for later processing */
		cm = (struct aac_command *)fib->Header.SenderData;
		if (cm == NULL) {
			/* no reverse-lookup pointer: orphaned FIB, just dump it */
			AAC_PRINT_FIB(sc, fib);
		} else {
			aac_remove_busy(cm);
			aac_unmap_command(cm);		/* XXX defer?
							 */
			aac_enqueue_complete(cm);
		}
	}

	/* handle completion processing */
#if __FreeBSD_version >= 500005
	taskqueue_enqueue(taskqueue_swi, &sc->aac_task_complete);
#else
	aac_complete(sc, 0);
#endif
}

/*
 * Process completed commands.  Runs from the taskqueue (5.x) or directly
 * from the interrupt path (4.x); 'pending' is unused here.
 */
static void
aac_complete(void *context, int pending)
{
	struct aac_softc *sc;
	struct aac_command *cm;

	debug_called(2);

	sc = (struct aac_softc *)context;

	/* pull completed commands off the queue */
	for (;;) {
		cm = aac_dequeue_complete(sc);
		if (cm == NULL)
			break;
		cm->cm_flags |= AAC_CMD_COMPLETED;

		/* is there a completion handler? */
		if (cm->cm_complete != NULL) {
			cm->cm_complete(cm);
		} else {
			/* assume that someone is sleeping on this command */
			wakeup(cm);
		}
	}

	/* see if we can start some more I/O */
	aac_startio(sc);
}

/*
 * Handle a bio submitted from a disk device.
 */
void
aac_submit_bio(struct bio *bp)
{
	struct aac_disk *ad;
	struct aac_softc *sc;

	debug_called(2);

	ad = (struct aac_disk *)bp->bio_dev->si_drv1;
	sc = ad->ad_controller;

	/* queue the BIO and try to get some work done */
	aac_enqueue_bio(sc, bp);
	aac_startio(sc);
}

/*
 * Get a bio and build a command to go with it.
 */
static int
aac_bio_command(struct aac_softc *sc, struct aac_command **cmp)
{
	struct aac_command *cm;
	struct aac_fib *fib;
	struct aac_blockread *br;
	struct aac_blockwrite *bw;
	struct aac_disk *ad;
	struct bio *bp;

	debug_called(2);

	/* get the resources we will need */
	cm = NULL;
	if ((bp = aac_dequeue_bio(sc)) == NULL)
		goto fail;
	if (aac_alloc_command(sc, &cm))	/* get a command */
		goto fail;

	/* fill out the command */
	cm->cm_data = (void *)bp->bio_data;
	cm->cm_datalen = bp->bio_bcount;
	cm->cm_complete = aac_bio_complete;
	cm->cm_private = bp;
	cm->cm_timestamp = time_second;	/* for aac_timeout's late-command scan */
	cm->cm_queue = AAC_ADAP_NORM_CMD_QUEUE;

	/* build the FIB */
	fib = cm->cm_fib;
	fib->Header.XferState =
		AAC_FIBSTATE_HOSTOWNED |
		AAC_FIBSTATE_INITIALISED |
		AAC_FIBSTATE_FROMHOST |
		AAC_FIBSTATE_REXPECTED |
		AAC_FIBSTATE_NORM;
	fib->Header.Command = ContainerCommand;
	fib->Header.Size = sizeof(struct aac_fib_header);

	/* build the read/write request */
	ad = (struct aac_disk *)bp->bio_dev->si_drv1;
	if (BIO_IS_READ(bp)) {
		br = (struct aac_blockread *)&fib->data[0];
		br->Command = VM_CtBlockRead;
		br->ContainerId = ad->ad_container->co_mntobj.ObjectId;
		br->BlockNumber = bp->bio_pblkno;
		br->ByteCount = bp->bio_bcount;
		fib->Header.Size += sizeof(struct aac_blockread);
		cm->cm_sgtable = &br->SgMap;
		cm->cm_flags |= AAC_CMD_DATAIN;
	} else {
		bw = (struct aac_blockwrite *)&fib->data[0];
		bw->Command = VM_CtBlockWrite;
		bw->ContainerId = ad->ad_container->co_mntobj.ObjectId;
		bw->BlockNumber = bp->bio_pblkno;
		bw->ByteCount = bp->bio_bcount;
		bw->Stable = CUNSTABLE;	/* XXX what's appropriate here? */
		fib->Header.Size += sizeof(struct aac_blockwrite);
		cm->cm_flags |= AAC_CMD_DATAOUT;
		cm->cm_sgtable = &bw->SgMap;
	}

	*cmp = cm;
	return(0);

fail:
	/* put the bio back so nothing is lost; caller will retry later */
	if (bp != NULL)
		aac_enqueue_bio(sc, bp);
	if (cm != NULL)
		aac_release_command(cm);
	return(ENOMEM);
}

/*
 * Handle a bio-instigated command that has been completed.
 */
static void
aac_bio_complete(struct aac_command *cm)
{
	struct aac_blockread_response *brr;
	struct aac_blockwrite_response *bwr;
	struct bio *bp;
	AAC_FSAStatus status;

	/* fetch relevant status and then release the command */
	bp = (struct bio *)cm->cm_private;
	if (BIO_IS_READ(bp)) {
		brr = (struct aac_blockread_response *)&cm->cm_fib->data[0];
		status = brr->Status;
	} else {
		bwr = (struct aac_blockwrite_response *)&cm->cm_fib->data[0];
		status = bwr->Status;
	}
	aac_release_command(cm);

	/* fix up the bio based on status */
	if (status == ST_OK) {
		bp->bio_resid = 0;
	} else {
		bp->bio_error = EIO;
		bp->bio_flags |= BIO_ERROR;
		/* pass an error string out to the disk layer */
		bp->bio_driver1 = aac_describe_code(aac_command_status_table,
		    status);
	}
	aac_biodone(bp);
}

/*
 * Dump a block of data to the controller.  If the queue is full, tell the
 * caller to hold off and wait for the queue to drain.
 */
int
aac_dump_enqueue(struct aac_disk *ad, u_int32_t lba, void *data, int dumppages)
{
	struct aac_softc *sc;
	struct aac_command *cm;
	struct aac_fib *fib;
	struct aac_blockwrite *bw;

	sc = ad->ad_controller;
	cm = NULL;

	if (aac_alloc_command(sc, &cm))
		return (EBUSY);

	/* fill out the command; no completion handler, dump path polls */
	cm->cm_data = data;
	cm->cm_datalen = dumppages * PAGE_SIZE;
	cm->cm_complete = NULL;
	cm->cm_private = NULL;
	cm->cm_timestamp = time_second;
	cm->cm_queue = AAC_ADAP_NORM_CMD_QUEUE;

	/* build the FIB */
	fib = cm->cm_fib;
	fib->Header.XferState =
		AAC_FIBSTATE_HOSTOWNED |
		AAC_FIBSTATE_INITIALISED |
		AAC_FIBSTATE_FROMHOST |
		AAC_FIBSTATE_REXPECTED |
		AAC_FIBSTATE_NORM;
	fib->Header.Command = ContainerCommand;
	fib->Header.Size = sizeof(struct aac_fib_header);

	bw = (struct aac_blockwrite *)&fib->data[0];
	bw->Command = VM_CtBlockWrite;
	bw->ContainerId = ad->ad_container->co_mntobj.ObjectId;
	bw->BlockNumber = lba;
	bw->ByteCount = dumppages * PAGE_SIZE;
	bw->Stable = CUNSTABLE;	/* XXX what's appropriate here?
				 */
	fib->Header.Size += sizeof(struct aac_blockwrite);
	cm->cm_flags |= AAC_CMD_DATAOUT;
	cm->cm_sgtable = &bw->SgMap;

	return (aac_start(cm));
}

/*
 * Wait for the card's queue to drain when dumping.  Also check for monitor
 * printf's.  Polls (no interrupts during a crash dump) until the adapter's
 * normal command queue is empty.
 */
void
aac_dump_complete(struct aac_softc *sc)
{
	struct aac_fib *fib;
	struct aac_command *cm;
	u_int16_t reason;
	u_int32_t pi, ci, fib_size;

	do {
		reason = AAC_GET_ISTATUS(sc);
		if (reason & AAC_DB_RESPONSE_READY) {
			AAC_CLEAR_ISTATUS(sc, AAC_DB_RESPONSE_READY);
			for (;;) {
				if (aac_dequeue_fib(sc,
				    AAC_HOST_NORM_RESP_QUEUE,
				    &fib_size, &fib))
					break;
				cm = (struct aac_command *)
					fib->Header.SenderData;
				if (cm == NULL)
					AAC_PRINT_FIB(sc, fib);
				else {
					aac_remove_busy(cm);
					aac_unmap_command(cm);
					aac_enqueue_complete(cm);
					aac_release_command(cm);
				}
			}
		}
		if (reason & AAC_DB_PRINTF) {
			AAC_CLEAR_ISTATUS(sc, AAC_DB_PRINTF);
			aac_print_printf(sc);
		}
		pi = sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][
			AAC_PRODUCER_INDEX];
		ci = sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][
			AAC_CONSUMER_INDEX];
	} while (ci != pi);

	return;
}

/*
 * Submit a command to the controller, return when it completes.
 * XXX This is very dangerous!  If the card has gone out to lunch, we could
 *     be stuck here forever.  At the same time, signals are not caught
 *     because there is a risk that a signal could wakeup the tsleep before
 *     the card has a chance to complete the command.  The passed in timeout
 *     is ignored for the same reason.  Since there is no way to cancel a
 *     command in progress, we should probably create a 'dead' queue where
 *     commands go that have been interrupted/timed-out/etc, that keeps them
 *     out of the free pool.  That way, if the card is just slow, it won't
 *     spam the memory of a command that has been recycled.
 */
static int
aac_wait_command(struct aac_command *cm, int timeout)
{
	/* NOTE(review): 'timeout' is deliberately ignored — see XXX above */
	int s, error = 0;

	debug_called(2);

	/* Put the command on the ready queue and get things going */
	cm->cm_queue = AAC_ADAP_NORM_CMD_QUEUE;
	aac_enqueue_ready(cm);
	aac_startio(cm->cm_sc);
	s = splbio();
	while (!(cm->cm_flags & AAC_CMD_COMPLETED) && (error != EWOULDBLOCK)) {
		error = tsleep(cm, PRIBIO, "aacwait", 0);
	}
	splx(s);
	return(error);
}

/*
 * Command Buffer Management
 */

/*
 * Allocate a command.  Returns ENOMEM if the freelist is empty.
 */
static int
aac_alloc_command(struct aac_softc *sc, struct aac_command **cmp)
{
	struct aac_command *cm;

	debug_called(3);

	if ((cm = aac_dequeue_free(sc)) == NULL)
		return(ENOMEM);

	*cmp = cm;
	return(0);
}

/*
 * Release a command back to the freelist.
 */
static void
aac_release_command(struct aac_command *cm)
{
	debug_called(3);

	/* (re)initialise the command/FIB */
	cm->cm_sgtable = NULL;
	cm->cm_flags = 0;
	cm->cm_complete = NULL;
	cm->cm_private = NULL;
	cm->cm_fib->Header.XferState = AAC_FIBSTATE_EMPTY;
	cm->cm_fib->Header.StructType = AAC_FIBTYPE_TFIB;
	cm->cm_fib->Header.Flags = 0;
	cm->cm_fib->Header.SenderSize = sizeof(struct aac_fib);

	/*
	 * These are duplicated in aac_start to cover the case where an
	 * intermediate stage may have destroyed them.  They're left
	 * initialised here for debugging purposes only.
	 */
	cm->cm_fib->Header.SenderFibAddress = (u_int32_t)cm->cm_fib;
	cm->cm_fib->Header.ReceiverFibAddress = cm->cm_fibphys;

	aac_enqueue_free(cm);
}

/*
 * Map helper for command/FIB allocation: records the bus address of the
 * FIB array loaded by bus_dmamap_load in aac_alloc_commands.
 */
static void
aac_map_command_helper(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct aac_softc *sc;

	sc = (struct aac_softc *)arg;

	debug_called(3);

	sc->aac_fibphys = segs[0].ds_addr;
}

/*
 * Allocate and initialise commands/FIBs for this adapter.
 */
static int
aac_alloc_commands(struct aac_softc *sc)
{
	struct aac_command *cm;
	int i;

	debug_called(1);

	/* allocate the FIBs in DMAable memory and load them */
	if (bus_dmamem_alloc(sc->aac_fib_dmat, (void **)&sc->aac_fibs,
	    BUS_DMA_NOWAIT, &sc->aac_fibmap)) {
		return(ENOMEM);
	}
	bus_dmamap_load(sc->aac_fib_dmat, sc->aac_fibmap, sc->aac_fibs,
	    AAC_FIB_COUNT * sizeof(struct aac_fib), aac_map_command_helper,
	    sc, 0);

	/* initialise constant fields in the command structure */
	for (i = 0; i < AAC_FIB_COUNT; i++) {
		cm = &sc->aac_command[i];
		cm->cm_sc = sc;
		cm->cm_fib = sc->aac_fibs + i;
		cm->cm_fibphys = sc->aac_fibphys +
		    (i * sizeof(struct aac_fib));

		/*
		 * bus_dmamap_create returns 0 on success, so only commands
		 * whose data map was created go onto the free list.
		 */
		if (!bus_dmamap_create(sc->aac_buffer_dmat, 0,
		    &cm->cm_datamap))
			aac_release_command(cm);
	}
	return(0);
}

/*
 * Free FIBs owned by this adapter.
 */
static void
aac_free_commands(struct aac_softc *sc)
{
	int i;

	debug_called(1);

	for (i = 0; i < AAC_FIB_COUNT; i++)
		bus_dmamap_destroy(sc->aac_buffer_dmat,
		    sc->aac_command[i].cm_datamap);

	bus_dmamap_unload(sc->aac_fib_dmat, sc->aac_fibmap);
	bus_dmamem_free(sc->aac_fib_dmat, sc->aac_fibs, sc->aac_fibmap);
}

/*
 * Command-mapping helper function - populate this command's s/g table.
 */
static void
aac_map_command_sg(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct aac_command *cm;
	struct aac_fib *fib;
	struct aac_sg_table *sg;
	int i;

	debug_called(3);

	cm = (struct aac_command *)arg;
	fib = cm->cm_fib;

	/* find the s/g table */
	sg = cm->cm_sgtable;

	/* copy into the FIB */
	if (sg != NULL) {
		sg->SgCount = nseg;
		for (i = 0; i < nseg; i++) {
			sg->SgEntry[i].SgAddress = segs[i].ds_addr;
			sg->SgEntry[i].SgByteCount = segs[i].ds_len;
		}
		/* update the FIB size for the s/g count */
		fib->Header.Size += nseg * sizeof(struct aac_sg_entry);
	}
}

/*
 * Map a command into controller-visible space.
 */
static void
aac_map_command(struct aac_command *cm)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = cm->cm_sc;

	/* don't map more than once */
	if (cm->cm_flags & AAC_CMD_MAPPED)
		return;

	if (cm->cm_datalen != 0) {
		bus_dmamap_load(sc->aac_buffer_dmat, cm->cm_datamap,
		    cm->cm_data, cm->cm_datalen, aac_map_command_sg, cm, 0);

		if (cm->cm_flags & AAC_CMD_DATAIN)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_PREREAD);
		if (cm->cm_flags & AAC_CMD_DATAOUT)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_PREWRITE);
	}
	cm->cm_flags |= AAC_CMD_MAPPED;
}

/*
 * Unmap a command from controller-visible space.
 */
static void
aac_unmap_command(struct aac_command *cm)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = cm->cm_sc;

	if (!(cm->cm_flags & AAC_CMD_MAPPED))
		return;

	if (cm->cm_datalen != 0) {
		if (cm->cm_flags & AAC_CMD_DATAIN)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_POSTREAD);
		if (cm->cm_flags & AAC_CMD_DATAOUT)
			bus_dmamap_sync(sc->aac_buffer_dmat, cm->cm_datamap,
			    BUS_DMASYNC_POSTWRITE);

		bus_dmamap_unload(sc->aac_buffer_dmat, cm->cm_datamap);
	}
	cm->cm_flags &= ~AAC_CMD_MAPPED;
}

/*
 * Hardware Interface
 */

/*
 * Initialise the adapter.
 */
static void
aac_common_map(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
	struct aac_softc *sc;

	debug_called(1);

	sc = (struct aac_softc *)arg;

	/* record the bus address of the shared 'common' structure */
	sc->aac_common_busaddr = segs[0].ds_addr;
}

static int
aac_init(struct aac_softc *sc)
{
	struct aac_adapter_init *ip;
	time_t then;
	u_int32_t code;
	u_int8_t *qaddr;

	debug_called(1);

	/*
	 * First wait for the adapter to come ready.
	 */
	then = time_second;
	do {
		code = AAC_GET_FWSTATUS(sc);
		if (code & AAC_SELF_TEST_FAILED) {
			device_printf(sc->aac_dev, "FATAL: selftest failed\n");
			return(ENXIO);
		}
		if (code & AAC_KERNEL_PANIC) {
			device_printf(sc->aac_dev,
			    "FATAL: controller kernel panic\n");
			return(ENXIO);
		}
		if (time_second > (then + AAC_BOOT_TIMEOUT)) {
			device_printf(sc->aac_dev,
			    "FATAL: controller not coming ready, "
			    "status %x\n", code);
			return(ENXIO);
		}
	} while (!(code & AAC_UP_AND_RUNNING));

	/*
	 * Create DMA tag for the common structure and allocate it.
	 */
	if (bus_dma_tag_create(sc->aac_parent_dmat,	/* parent */
			       1, 0,			/* algnmnt, boundary */
			       BUS_SPACE_MAXADDR,	/* lowaddr */
			       BUS_SPACE_MAXADDR,	/* highaddr */
			       NULL, NULL,		/* filter, filterarg */
			       sizeof(struct aac_common), /* maxsize */
			       1,			/* nsegments */
			       BUS_SPACE_MAXSIZE_32BIT,	/* maxsegsize */
			       0,			/* flags */
			       &sc->aac_common_dmat)) {
		device_printf(sc->aac_dev,
		    "can't allocate common structure DMA tag\n");
		return(ENOMEM);
	}
	if (bus_dmamem_alloc(sc->aac_common_dmat, (void **)&sc->aac_common,
	    BUS_DMA_NOWAIT, &sc->aac_common_dmamap)) {
		device_printf(sc->aac_dev, "can't allocate common structure\n");
		return(ENOMEM);
	}
	bus_dmamap_load(sc->aac_common_dmat, sc->aac_common_dmamap,
	    sc->aac_common, sizeof(*sc->aac_common), aac_common_map, sc, 0);
	bzero(sc->aac_common, sizeof(*sc->aac_common));

	/*
	 * Fill in the init structure.  This tells the adapter about the
	 * physical location of various important shared data structures.
	 */
	ip = &sc->aac_common->ac_init;
	ip->InitStructRevision = AAC_INIT_STRUCT_REVISION;

	ip->AdapterFibsPhysicalAddress = sc->aac_common_busaddr +
	    offsetof(struct aac_common, ac_fibs);
	ip->AdapterFibsVirtualAddress = &sc->aac_common->ac_fibs[0];
	ip->AdapterFibsSize = AAC_ADAPTER_FIBS * sizeof(struct aac_fib);
	ip->AdapterFibAlign = sizeof(struct aac_fib);

	ip->PrintfBufferAddress = sc->aac_common_busaddr +
	    offsetof(struct aac_common, ac_printf);
	ip->PrintfBufferSize = AAC_PRINTF_BUFSIZE;

	ip->HostPhysMemPages = 0;	/* not used?
					 */
	ip->HostElapsedSeconds = time_second;	/* reset later if invalid */

	/*
	 * Initialise FIB queues.  Note that it appears that the layout of
	 * the indexes and the segmentation of the entries may be mandated by
	 * the adapter, which is only told about the base of the queue index
	 * fields.
	 *
	 * The initial values of the indices are assumed to inform the
	 * adapter of the sizes of the respective queues, and theoretically
	 * it could work out the entire layout of the queue structures from
	 * this.  We take the easy route and just lay this area out like
	 * everyone else does.
	 *
	 * The Linux driver uses a much more complex scheme whereby several
	 * header records are kept for each queue.  We use a couple of
	 * generic list manipulation functions which 'know' the size of each
	 * list by virtue of a table.
	 */
	qaddr = &sc->aac_common->ac_qbuf[0] + AAC_QUEUE_ALIGN;
	qaddr -= (u_int32_t)qaddr % AAC_QUEUE_ALIGN;	/* align down */
	sc->aac_queues = (struct aac_queue_table *)qaddr;
	ip->CommHeaderAddress = sc->aac_common_busaddr +
	    ((u_int32_t)sc->aac_queues - (u_int32_t)sc->aac_common);
	bzero(sc->aac_queues, sizeof(struct aac_queue_table));

	/* seed each queue's producer/consumer index with its size */
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_HOST_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_HOST_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_HOST_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_HOST_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_ADAP_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_ADAP_NORM_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_CMD_QUEUE][AAC_PRODUCER_INDEX] =
		AAC_ADAP_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_CMD_QUEUE][AAC_CONSUMER_INDEX] =
		AAC_ADAP_HIGH_CMD_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_HOST_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_NORM_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_HOST_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_HOST_HIGH_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_HOST_HIGH_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_HOST_HIGH_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_ADAP_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_NORM_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_ADAP_NORM_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_RESP_QUEUE][AAC_PRODUCER_INDEX]=
		AAC_ADAP_HIGH_RESP_ENTRIES;
	sc->aac_queues->qt_qindex[AAC_ADAP_HIGH_RESP_QUEUE][AAC_CONSUMER_INDEX]=
		AAC_ADAP_HIGH_RESP_ENTRIES;
	sc->aac_qentries[AAC_HOST_NORM_CMD_QUEUE] =
		&sc->aac_queues->qt_HostNormCmdQueue[0];
	sc->aac_qentries[AAC_HOST_HIGH_CMD_QUEUE] =
		&sc->aac_queues->qt_HostHighCmdQueue[0];
	sc->aac_qentries[AAC_ADAP_NORM_CMD_QUEUE] =
		&sc->aac_queues->qt_AdapNormCmdQueue[0];
	sc->aac_qentries[AAC_ADAP_HIGH_CMD_QUEUE] =
		&sc->aac_queues->qt_AdapHighCmdQueue[0];
	sc->aac_qentries[AAC_HOST_NORM_RESP_QUEUE] =
		&sc->aac_queues->qt_HostNormRespQueue[0];
	sc->aac_qentries[AAC_HOST_HIGH_RESP_QUEUE] =
		&sc->aac_queues->qt_HostHighRespQueue[0];
	sc->aac_qentries[AAC_ADAP_NORM_RESP_QUEUE] =
		&sc->aac_queues->qt_AdapNormRespQueue[0];
	sc->aac_qentries[AAC_ADAP_HIGH_RESP_QUEUE] =
		&sc->aac_queues->qt_AdapHighRespQueue[0];

	/*
	 * Do controller-type-specific initialisation
	 */
	switch (sc->aac_hwif) {
	case AAC_HWIF_I960RX:
		AAC_SETREG4(sc, AAC_RX_ODBR, ~0);
		break;
	}

	/*
	 * Give the init structure to the controller.
	 */
	if (aac_sync_command(sc, AAC_MONKER_INITSTRUCT,
	    sc->aac_common_busaddr + offsetof(struct aac_common, ac_init),
	    0, 0, 0, NULL)) {
		device_printf(sc->aac_dev,
		    "error establishing init structure\n");
		return(EIO);
	}

	return(0);
}

/*
 * Send a synchronous command to the controller and wait for a result.
 */
static int
aac_sync_command(struct aac_softc *sc, u_int32_t command, u_int32_t arg0,
		 u_int32_t arg1, u_int32_t arg2, u_int32_t arg3, u_int32_t *sp)
{
	time_t then;
	u_int32_t status;

	debug_called(3);

	/* populate the mailbox */
	AAC_SET_MAILBOX(sc, command, arg0, arg1, arg2, arg3);

	/* ensure the sync command doorbell flag is cleared */
	AAC_CLEAR_ISTATUS(sc, AAC_DB_SYNC_COMMAND);

	/* then set it to signal the adapter */
	AAC_QNOTIFY(sc, AAC_DB_SYNC_COMMAND);

	/* spin waiting for the command to complete (busy-wait, no sleep) */
	then = time_second;
	do {
		if (time_second > (then + AAC_IMMEDIATE_TIMEOUT)) {
			debug(2, "timed out");
			return(EIO);
		}
	} while (!(AAC_GET_ISTATUS(sc) & AAC_DB_SYNC_COMMAND));

	/* clear the completion flag */
	AAC_CLEAR_ISTATUS(sc, AAC_DB_SYNC_COMMAND);

	/* get the command status */
	status = AAC_GET_MAILBOXSTATUS(sc);
	if (sp != NULL)
		*sp = status;
	return(0);
}

/*
 * Send a synchronous FIB to the controller and wait for a result.
 */
static int
aac_sync_fib(struct aac_softc *sc, u_int32_t command, u_int32_t xferstate,
	     void *data, u_int16_t datasize, void *result,
	     u_int16_t *resultsize)
{
	struct aac_fib *fib;

	debug_called(3);

	/* single pre-allocated sync FIB in the common structure */
	fib = &sc->aac_common->ac_sync_fib;

	if (datasize > AAC_FIB_DATASIZE)
		return(EINVAL);

	/*
	 * Set up the sync FIB
	 */
	fib->Header.XferState = AAC_FIBSTATE_HOSTOWNED |
		AAC_FIBSTATE_INITIALISED | AAC_FIBSTATE_EMPTY;
	fib->Header.XferState |= xferstate;
	fib->Header.Command = command;
	fib->Header.StructType = AAC_FIBTYPE_TFIB;
	fib->Header.Size = sizeof(struct aac_fib) + datasize;
	fib->Header.SenderSize = sizeof(struct aac_fib);
	fib->Header.SenderFibAddress = (u_int32_t)fib;
	fib->Header.ReceiverFibAddress = sc->aac_common_busaddr +
		offsetof(struct aac_common, ac_sync_fib);

	/*
	 * Copy in data.
	 */
	if (data != NULL) {
		KASSERT(datasize <= sizeof(fib->data),
			("aac_sync_fib: datasize to large"));
		bcopy(data, fib->data, datasize);
		fib->Header.XferState |= AAC_FIBSTATE_FROMHOST |
			AAC_FIBSTATE_NORM;
	}

	/*
	 * Give the FIB to the controller, wait for a response.
	 */
	if (aac_sync_command(sc, AAC_MONKER_SYNCFIB,
	    fib->Header.ReceiverFibAddress, 0, 0, 0, NULL)) {
		debug(2, "IO error");
		return(EIO);
	}

	/*
	 * Copy out the result
	 */
	if (result != NULL) {
		u_int copysize;

		/* truncate to the caller's buffer but report the full size */
		copysize = fib->Header.Size - sizeof(struct aac_fib_header);
		if (copysize > *resultsize)
			copysize = *resultsize;
		*resultsize = fib->Header.Size - sizeof(struct aac_fib_header);
		bcopy(fib->data, result, copysize);
	}
	return(0);
}

/*
 * Adapter-space FIB queue manipulation
 *
 * Note that the queue implementation here is a little funky; neither the PI
 * or CI will ever be zero.  This behaviour is a controller feature.
 */
static struct {
	int		size;		/* number of entries in the queue */
	int		notify;		/* doorbell bit to ring, 0 = none */
} aac_qinfo[] = {
	{AAC_HOST_NORM_CMD_ENTRIES, AAC_DB_COMMAND_NOT_FULL},
	{AAC_HOST_HIGH_CMD_ENTRIES, 0},
	{AAC_ADAP_NORM_CMD_ENTRIES, AAC_DB_COMMAND_READY},
	{AAC_ADAP_HIGH_CMD_ENTRIES, 0},
	{AAC_HOST_NORM_RESP_ENTRIES, AAC_DB_RESPONSE_NOT_FULL},
	{AAC_HOST_HIGH_RESP_ENTRIES, 0},
	{AAC_ADAP_NORM_RESP_ENTRIES, AAC_DB_RESPONSE_READY},
	{AAC_ADAP_HIGH_RESP_ENTRIES, 0}
};

/*
 * Atomically insert an entry into the nominated queue, returns 0 on success
 * or EBUSY if the queue is full.
 *
 * Note: it would be more efficient to defer notifying the controller in
 *       the case where we may be inserting several entries in rapid
 *       succession, but implementing this usefully may be difficult (it
 *       would involve a separate queue/notify interface).
 */
static int
aac_enqueue_fib(struct aac_softc *sc, int queue, struct aac_command *cm)
{
	u_int32_t pi, ci;
	int s, error;
	u_int32_t fib_size;
	u_int32_t fib_addr;

	debug_called(3);

	fib_size = cm->cm_fib->Header.Size;
	fib_addr = cm->cm_fib->Header.ReceiverFibAddress;

	s = splbio();

	/* get the producer/consumer indices */
	pi = sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX];
	ci = sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX];

	/* wrap the queue?
 */
	if (pi >= aac_qinfo[queue].size)
		pi = 0;

	/* check for queue full */
	if ((pi + 1) == ci) {
		error = EBUSY;
		goto out;
	}

	/* populate queue entry */
	(sc->aac_qentries[queue] + pi)->aq_fib_size = fib_size;
	(sc->aac_qentries[queue] + pi)->aq_fib_addr = fib_addr;

	/* update producer index */
	sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX] = pi + 1;

	/*
	 * To avoid a race with its completion interrupt, place this command
	 * on the busy queue prior to advertising it to the controller.
	 */
	aac_enqueue_busy(cm);

	/* notify the adapter if we know how */
	if (aac_qinfo[queue].notify != 0)
		AAC_QNOTIFY(sc, aac_qinfo[queue].notify);

	error = 0;

out:
	splx(s);
	return(error);
}

/*
 * Atomically remove one entry from the nominated queue, returns 0 on
 * success or ENOENT if the queue is empty.
 */
static int
aac_dequeue_fib(struct aac_softc *sc, int queue, u_int32_t *fib_size,
		struct aac_fib **fib_addr)
{
	u_int32_t pi, ci;
	int s, error;
	int notify;

	debug_called(3);

	s = splbio();

	/* get the producer/consumer indices */
	pi = sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX];
	ci = sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX];

	/* check for queue empty */
	if (ci == pi) {
		error = ENOENT;
		goto out;
	}

	/* taking the last entry un-fills the queue; remember to notify */
	notify = 0;
	if (ci == pi + 1)
		notify++;

	/* wrap the queue?
	 */
	if (ci >= aac_qinfo[queue].size)
		ci = 0;

	/* fetch the entry */
	*fib_size = (sc->aac_qentries[queue] + ci)->aq_fib_size;
	*fib_addr = (struct aac_fib *)(sc->aac_qentries[queue] +
	    ci)->aq_fib_addr;

	/* update consumer index */
	sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX] = ci + 1;

	/* if we have made the queue un-full, notify the adapter */
	if (notify && (aac_qinfo[queue].notify != 0))
		AAC_QNOTIFY(sc, aac_qinfo[queue].notify);
	error = 0;

out:
	splx(s);
	return(error);
}

/*
 * Put our response to an Adapter Initialed Fib on the response queue
 */
static int
aac_enqueue_response(struct aac_softc *sc, int queue, struct aac_fib *fib)
{
	u_int32_t pi, ci;
	int s, error;
	u_int32_t fib_size;
	u_int32_t fib_addr;

	debug_called(1);

	/* Tell the adapter where the FIB is */
	fib_size = fib->Header.Size;
	fib_addr = fib->Header.SenderFibAddress;
	fib->Header.ReceiverFibAddress = fib_addr;

	s = splbio();

	/* get the producer/consumer indices */
	pi = sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX];
	ci = sc->aac_queues->qt_qindex[queue][AAC_CONSUMER_INDEX];

	/* wrap the queue? */
	if (pi >= aac_qinfo[queue].size)
		pi = 0;

	/* check for queue full */
	if ((pi + 1) == ci) {
		error = EBUSY;
		goto out;
	}

	/* populate queue entry */
	(sc->aac_qentries[queue] + pi)->aq_fib_size = fib_size;
	(sc->aac_qentries[queue] + pi)->aq_fib_addr = fib_addr;

	/* update producer index */
	sc->aac_queues->qt_qindex[queue][AAC_PRODUCER_INDEX] = pi + 1;

	/* notify the adapter if we know how */
	if (aac_qinfo[queue].notify != 0)
		AAC_QNOTIFY(sc, aac_qinfo[queue].notify);

	error = 0;

out:
	splx(s);
	return(error);
}

/*
 * Check for commands that have been outstanding for a suspiciously long
 * time, and complain about them.
 */
static void
aac_timeout(struct aac_softc *sc)
{
	int s;
	struct aac_command *cm;
	time_t deadline;

#if 0
	/* simulate an interrupt to handle possibly-missed interrupts */
	/*
	 * XXX This was done to work around another bug which has since been
It is dangerous anyways because you don't want multiple * threads in the interrupt handler at the same time! If calling * is deamed neccesary in the future, proper mutexes must be used. */ s = splbio(); aac_intr(sc); splx(s); /* kick the I/O queue to restart it in the case of deadlock */ aac_startio(sc); #endif /* * traverse the busy command list, bitch about late commands once * only. */ deadline = time_second - AAC_CMD_TIMEOUT; s = splbio(); TAILQ_FOREACH(cm, &sc->aac_busy, cm_link) { if ((cm->cm_timestamp < deadline) /* && !(cm->cm_flags & AAC_CMD_TIMEDOUT) */) { cm->cm_flags |= AAC_CMD_TIMEDOUT; device_printf(sc->aac_dev, "COMMAND %p TIMEOUT AFTER %d SECONDS\n", cm, (int)(time_second-cm->cm_timestamp)); AAC_PRINT_FIB(sc, cm->cm_fib); } } splx(s); /* reset the timer for next time */ timeout((timeout_t*)aac_timeout, sc, AAC_PERIODIC_INTERVAL * hz); return; } /* * Interface Function Vectors */ /* * Read the current firmware status word. */ static int aac_sa_get_fwstatus(struct aac_softc *sc) { debug_called(3); return(AAC_GETREG4(sc, AAC_SA_FWSTATUS)); } static int aac_rx_get_fwstatus(struct aac_softc *sc) { debug_called(3); return(AAC_GETREG4(sc, AAC_RX_FWSTATUS)); } static int aac_fa_get_fwstatus(struct aac_softc *sc) { int val; debug_called(3); val = AAC_GETREG4(sc, AAC_FA_FWSTATUS); return (val); } /* * Notify the controller of a change in a given queue */ static void aac_sa_qnotify(struct aac_softc *sc, int qbit) { debug_called(3); AAC_SETREG2(sc, AAC_SA_DOORBELL1_SET, qbit); } static void aac_rx_qnotify(struct aac_softc *sc, int qbit) { debug_called(3); AAC_SETREG4(sc, AAC_RX_IDBR, qbit); } static void aac_fa_qnotify(struct aac_softc *sc, int qbit) { debug_called(3); AAC_SETREG2(sc, AAC_FA_DOORBELL1, qbit); AAC_FA_HACK(sc); } /* * Get the interrupt reason bits */ static int aac_sa_get_istatus(struct aac_softc *sc) { debug_called(3); return(AAC_GETREG2(sc, AAC_SA_DOORBELL0)); } static int aac_rx_get_istatus(struct aac_softc *sc) { debug_called(3); 
return(AAC_GETREG4(sc, AAC_RX_ODBR)); } static int aac_fa_get_istatus(struct aac_softc *sc) { int val; debug_called(3); val = AAC_GETREG2(sc, AAC_FA_DOORBELL0); return (val); } /* * Clear some interrupt reason bits */ static void aac_sa_clear_istatus(struct aac_softc *sc, int mask) { debug_called(3); AAC_SETREG2(sc, AAC_SA_DOORBELL0_CLEAR, mask); } static void aac_rx_clear_istatus(struct aac_softc *sc, int mask) { debug_called(3); AAC_SETREG4(sc, AAC_RX_ODBR, mask); } static void aac_fa_clear_istatus(struct aac_softc *sc, int mask) { debug_called(3); AAC_SETREG2(sc, AAC_FA_DOORBELL0_CLEAR, mask); AAC_FA_HACK(sc); } /* * Populate the mailbox and set the command word */ static void aac_sa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3) { debug_called(4); AAC_SETREG4(sc, AAC_SA_MAILBOX, command); AAC_SETREG4(sc, AAC_SA_MAILBOX + 4, arg0); AAC_SETREG4(sc, AAC_SA_MAILBOX + 8, arg1); AAC_SETREG4(sc, AAC_SA_MAILBOX + 12, arg2); AAC_SETREG4(sc, AAC_SA_MAILBOX + 16, arg3); } static void aac_rx_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3) { debug_called(4); AAC_SETREG4(sc, AAC_RX_MAILBOX, command); AAC_SETREG4(sc, AAC_RX_MAILBOX + 4, arg0); AAC_SETREG4(sc, AAC_RX_MAILBOX + 8, arg1); AAC_SETREG4(sc, AAC_RX_MAILBOX + 12, arg2); AAC_SETREG4(sc, AAC_RX_MAILBOX + 16, arg3); } static void aac_fa_set_mailbox(struct aac_softc *sc, u_int32_t command, u_int32_t arg0, u_int32_t arg1, u_int32_t arg2, u_int32_t arg3) { debug_called(4); AAC_SETREG4(sc, AAC_FA_MAILBOX, command); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 4, arg0); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 8, arg1); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 12, arg2); AAC_FA_HACK(sc); AAC_SETREG4(sc, AAC_FA_MAILBOX + 16, arg3); AAC_FA_HACK(sc); } /* * Fetch the immediate command status word */ static int aac_sa_get_mailboxstatus(struct aac_softc *sc) { 
	debug_called(4);

	return(AAC_GETREG4(sc, AAC_SA_MAILBOX));
}

static int
aac_rx_get_mailboxstatus(struct aac_softc *sc)
{
	debug_called(4);

	return(AAC_GETREG4(sc, AAC_RX_MAILBOX));
}

static int
aac_fa_get_mailboxstatus(struct aac_softc *sc)
{
	int val;

	debug_called(4);

	val = AAC_GETREG4(sc, AAC_FA_MAILBOX);
	return (val);
}

/*
 * Set/clear interrupt masks
 */
static void
aac_sa_set_interrupts(struct aac_softc *sc, int enable)
{
	debug(2, "%sable interrupts", enable ? "en" : "dis");

	if (enable) {
		/* unmask only the doorbell interrupts we service */
		AAC_SETREG2((sc), AAC_SA_MASK0_CLEAR, AAC_DB_INTERRUPTS);
	} else {
		/* mask everything */
		AAC_SETREG2((sc), AAC_SA_MASK0_SET, ~0);
	}
}

static void
aac_rx_set_interrupts(struct aac_softc *sc, int enable)
{
	debug(2, "%sable interrupts", enable ? "en" : "dis");

	if (enable) {
		AAC_SETREG4(sc, AAC_RX_OIMR, ~AAC_DB_INTERRUPTS);
	} else {
		AAC_SETREG4(sc, AAC_RX_OIMR, ~0);
	}
}

static void
aac_fa_set_interrupts(struct aac_softc *sc, int enable)
{
	debug(2, "%sable interrupts", enable ? "en" : "dis");

	if (enable) {
		AAC_SETREG2((sc), AAC_FA_MASK0_CLEAR, AAC_DB_INTERRUPTS);
		AAC_FA_HACK(sc);
	} else {
		AAC_SETREG2((sc), AAC_FA_MASK0, ~0);
		AAC_FA_HACK(sc);
	}
}

/*
 * Debugging and Diagnostics
 */

/*
 * Print some information about the controller.
 */
static void
aac_describe_controller(struct aac_softc *sc)
{
	u_int8_t buf[AAC_FIB_DATASIZE];	/* XXX really a bit big
					 * for the stack */
	u_int16_t bufsize;
	struct aac_adapter_info *info;
	u_int8_t arg;

	debug_called(2);

	/* ask the adapter for its description via a synchronous FIB */
	arg = 0;
	bufsize = sizeof(buf);
	if (aac_sync_fib(sc, RequestAdapterInfo, 0, &arg, sizeof(arg), &buf,
	    &bufsize)) {
		device_printf(sc->aac_dev, "RequestAdapterInfo failed\n");
		return;
	}
	if (bufsize != sizeof(*info)) {
		/* unexpected size: warn but keep going with what we got */
		device_printf(sc->aac_dev,
			      "RequestAdapterInfo returned wrong data size "
			      "(%d != %d)\n", bufsize, sizeof(*info));
		/*return;*/
	}
	info = (struct aac_adapter_info *)&buf[0];
	device_printf(sc->aac_dev, "%s %dMHz, %dMB cache memory, %s\n",
		      aac_describe_code(aac_cpu_variant, info->CpuVariant),
		      info->ClockSpeed, info->BufferMem / (1024 * 1024),
		      aac_describe_code(aac_battery_platform,
					info->batteryPlatform));

	/* save the kernel revision structure for later use */
	sc->aac_revision = info->KernelRevision;
	device_printf(sc->aac_dev, "Kernel %d.%d-%d, Build %d, S/N %6X\n",
		      info->KernelRevision.external.comp.major,
		      info->KernelRevision.external.comp.minor,
		      info->KernelRevision.external.comp.dash,
		      info->KernelRevision.buildNumber,
		      (u_int32_t)(info->SerialNumber & 0xffffff));
}

/*
 * Look up a text description of a numeric error code and return a pointer to
 * same.
 */
static char *
aac_describe_code(struct aac_code_lookup *table, u_int32_t code)
{
	int i;

	for (i = 0; table[i].string != NULL; i++)
		if (table[i].code == code)
			return(table[i].string);
	/*
	 * Not found: the loop stopped on the NULL-string terminator, so
	 * return the string of the entry just past it (presumably the
	 * table's default "unknown" text -- TODO confirm table layout).
	 */
	return(table[i + 1].string);
}

/*
 * Management Interface
 */

static int
aac_open(dev_t dev, int flags, int fmt, d_thread_t *td)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = dev->si_drv1;

	/* Check to make sure the device isn't already open */
	if (sc->aac_state & AAC_STATE_OPEN) {
		return EBUSY;
	}
	sc->aac_state |= AAC_STATE_OPEN;

	return 0;
}

static int
aac_close(dev_t dev, int flags, int fmt, d_thread_t *td)
{
	struct aac_softc *sc;

	debug_called(2);

	sc = dev->si_drv1;

	/* Mark this unit as no longer open */
	sc->aac_state &= ~AAC_STATE_OPEN;

	return 0;
}

static int
aac_ioctl(dev_t dev, u_long cmd, caddr_t arg, int flag, d_thread_t *td)
{
	union aac_statrequest *as;
	struct aac_softc *sc;
	int error = 0;
	int i;

	debug_called(2);

	as = (union aac_statrequest *)arg;
	sc = dev->si_drv1;

	switch (cmd) {
	case AACIO_STATS:
		switch (as->as_item) {
		case AACQ_FREE:
		case AACQ_BIO:
		case AACQ_READY:
		case AACQ_BUSY:
		case AACQ_COMPLETE:
			bcopy(&sc->aac_qstat[as->as_item], &as->as_qstat,
			      sizeof(struct aac_qstat));
			break;
		default:
			error = ENOENT;
			break;
		}
		break;

	/*
	 * For each native FSACTL_* ioctl below, 'arg' holds a pointer to
	 * the real user argument; dereference it once and then fall
	 * through to the Linux-compatible handler, which takes the
	 * argument directly.
	 */
	case FSACTL_SENDFIB:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_SENDFIB:
		debug(1, "FSACTL_SENDFIB");
		error = aac_ioctl_sendfib(sc, arg);
		break;
	case FSACTL_AIF_THREAD:
	case FSACTL_LNX_AIF_THREAD:
		debug(1, "FSACTL_AIF_THREAD");
		error = EINVAL;
		break;
	case FSACTL_OPEN_GET_ADAPTER_FIB:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_OPEN_GET_ADAPTER_FIB:
		debug(1, "FSACTL_OPEN_GET_ADAPTER_FIB");
		/*
		 * Pass the caller out an AdapterFibContext.
		 *
		 * Note that because we only support one opener, we
		 * basically ignore this.  Set the caller's context to a magic
		 * number just in case.
		 *
		 * The Linux code hands the driver a pointer into kernel space,
		 * and then trusts it when the caller hands it back.  Aiee!
		 * Here, we give it the proc pointer of the per-adapter aif
		 * thread.  It's only used as a sanity check in other calls.
		 */
		i = (int)sc->aifthread;
		error = copyout(&i, arg, sizeof(i));
		break;
	case FSACTL_GET_NEXT_ADAPTER_FIB:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_GET_NEXT_ADAPTER_FIB:
		debug(1, "FSACTL_GET_NEXT_ADAPTER_FIB");
		error = aac_getnext_aif(sc, arg);
		break;
	case FSACTL_CLOSE_GET_ADAPTER_FIB:
	case FSACTL_LNX_CLOSE_GET_ADAPTER_FIB:
		debug(1, "FSACTL_CLOSE_GET_ADAPTER_FIB");
		/* don't do anything here */
		break;
	case FSACTL_MINIPORT_REV_CHECK:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_MINIPORT_REV_CHECK:
		debug(1, "FSACTL_MINIPORT_REV_CHECK");
		error = aac_rev_check(sc, arg);
		break;
	case FSACTL_QUERY_DISK:
		arg = *(caddr_t*)arg;
		/* FALLTHROUGH */
	case FSACTL_LNX_QUERY_DISK:
		debug(1, "FSACTL_QUERY_DISK");
		error = aac_query_disk(sc, arg);
		break;
	case FSACTL_DELETE_DISK:
	case FSACTL_LNX_DELETE_DISK:
		/*
		 * We don't trust userland to tell us when to delete a
		 * container, rather we rely on an AIF coming from the
		 * controller
		 */
		error = 0;
		break;
	default:
		debug(1, "unsupported cmd 0x%lx\n", cmd);
		error = EINVAL;
		break;
	}
	return(error);
}

static int
aac_poll(dev_t dev, int poll_events, d_thread_t *td)
{
	struct aac_softc *sc;
	int revents;

	sc = dev->si_drv1;
	revents = 0;

	/* readable iff the AIF queue is non-empty */
	AAC_LOCK_ACQUIRE(&sc->aac_aifq_lock);
	if ((poll_events & (POLLRDNORM | POLLIN)) != 0) {
		if (sc->aac_aifq_tail != sc->aac_aifq_head)
			revents |= poll_events & (POLLIN | POLLRDNORM);
	}
	AAC_LOCK_RELEASE(&sc->aac_aifq_lock);

	if (revents == 0) {
		/* nothing ready; register for wakeup on arrival */
		if (poll_events & (POLLIN | POLLRDNORM))
			selrecord(td, &sc->rcv_select);
	}

	return (revents);
}

/*
 * Send a FIB supplied from userspace
 */
static int
aac_ioctl_sendfib(struct aac_softc *sc, caddr_t ufib)
{
	struct aac_command *cm;
	int size, error;

	debug_called(2);

	cm = NULL;

	/*
	 * Get a command
	 */
	if (aac_alloc_command(sc, &cm)) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Fetch the FIB header, then re-copy to get data as well.
*/ if ((error = copyin(ufib, cm->cm_fib, sizeof(struct aac_fib_header))) != 0) goto out; size = cm->cm_fib->Header.Size + sizeof(struct aac_fib_header); if (size > sizeof(struct aac_fib)) { device_printf(sc->aac_dev, "incoming FIB oversized (%d > %d)\n", size, sizeof(struct aac_fib)); size = sizeof(struct aac_fib); } if ((error = copyin(ufib, cm->cm_fib, size)) != 0) goto out; cm->cm_fib->Header.Size = size; cm->cm_timestamp = time_second; /* * Pass the FIB to the controller, wait for it to complete. */ if ((error = aac_wait_command(cm, 30)) != 0) { /* XXX user timeout? */ printf("aac_wait_command return %d\n", error); goto out; } /* * Copy the FIB and data back out to the caller. */ size = cm->cm_fib->Header.Size; if (size > sizeof(struct aac_fib)) { device_printf(sc->aac_dev, "outbound FIB oversized (%d > %d)\n", size, sizeof(struct aac_fib)); size = sizeof(struct aac_fib); } error = copyout(cm->cm_fib, ufib, size); out: if (cm != NULL) { aac_release_command(cm); } return(error); } /* * Handle an AIF sent to us by the controller; queue it for later reference. * If the queue fills up, then drop the older entries. */ static void aac_handle_aif(struct aac_softc *sc, struct aac_fib *fib) { struct aac_aif_command *aif; struct aac_container *co, *co_next; struct aac_mntinfo mi; struct aac_mntinforesponse mir; u_int16_t rsize; int next, found; int added = 0, i = 0; debug_called(2); aif = (struct aac_aif_command*)&fib->data[0]; aac_print_aif(sc, aif); /* Is it an event that we should care about? */ switch (aif->command) { case AifCmdEventNotify: switch (aif->data.EN.type) { case AifEnAddContainer: case AifEnDeleteContainer: /* * A container was added or deleted, but the message * doesn't tell us anything else! Re-enumerate the * containers and sort things out. */ mi.Command = VM_NameServe; mi.MntType = FT_FILESYS; do { /* * Ask the controller for its containers one at * a time. * XXX What if the controller's list changes * midway through this enumaration? 
* XXX This should be done async. */ mi.MntCount = i; rsize = sizeof(mir); if (aac_sync_fib(sc, ContainerCommand, 0, &mi, sizeof(mi), &mir, &rsize)) { debug(2, "Error probing container %d\n", i); continue; } if (rsize != sizeof(mir)) { debug(2, "Container response size too " "large\n"); continue; } /* * Check the container against our list. * co->co_found was already set to 0 in a * previous run. */ if ((mir.Status == ST_OK) && (mir.MntTable[0].VolType != CT_NONE)) { found = 0; TAILQ_FOREACH(co, &sc->aac_container_tqh, co_link) { if (co->co_mntobj.ObjectId == mir.MntTable[0].ObjectId) { co->co_found = 1; found = 1; break; } } /* * If the container matched, continue * in the list. */ if (found) { i++; continue; } /* * This is a new container. Do all the * appropriate things to set it up. */ aac_add_container(sc, &mir, 1); added = 1; } i++; } while ((i < mir.MntRespCount) && (i < AAC_MAX_CONTAINERS)); /* * Go through our list of containers and see which ones * were not marked 'found'. Since the controller didn't * list them they must have been deleted. Do the * appropriate steps to destroy the device. Also reset * the co->co_found field. 
*/ co = TAILQ_FIRST(&sc->aac_container_tqh); while (co != NULL) { if (co->co_found == 0) { device_delete_child(sc->aac_dev, co->co_disk); co_next = TAILQ_NEXT(co, co_link); AAC_LOCK_ACQUIRE(&sc-> aac_container_lock); TAILQ_REMOVE(&sc->aac_container_tqh, co, co_link); AAC_LOCK_RELEASE(&sc-> aac_container_lock); FREE(co, M_AACBUF); co = co_next; } else { co->co_found = 0; co = TAILQ_NEXT(co, co_link); } } /* Attach the newly created containers */ if (added) bus_generic_attach(sc->aac_dev); break; default: break; } default: break; } /* Copy the AIF data to the AIF queue for ioctl retrieval */ AAC_LOCK_ACQUIRE(&sc->aac_aifq_lock); next = (sc->aac_aifq_head + 1) % AAC_AIFQ_LENGTH; if (next != sc->aac_aifq_tail) { bcopy(aif, &sc->aac_aifq[next], sizeof(struct aac_aif_command)); sc->aac_aifq_head = next; /* On the off chance that someone is sleeping for an aif... */ if (sc->aac_state & AAC_STATE_AIF_SLEEPER) wakeup(sc->aac_aifq); /* Wakeup any poll()ers */ selwakeup(&sc->rcv_select); } AAC_LOCK_RELEASE(&sc->aac_aifq_lock); return; } /* * Linux Management Interface * This is soon to be removed! 
*/ #ifdef AAC_COMPAT_LINUX #include #include #include #include /* There are multiple ioctl number ranges that need to be handled */ #define AAC_LINUX_IOCTL_MIN 0x0000 #define AAC_LINUX_IOCTL_MAX 0x21ff static linux_ioctl_function_t aac_linux_ioctl; static struct linux_ioctl_handler aac_handler = {aac_linux_ioctl, AAC_LINUX_IOCTL_MIN, AAC_LINUX_IOCTL_MAX}; SYSINIT (aac_register, SI_SUB_KLD, SI_ORDER_MIDDLE, linux_ioctl_register_handler, &aac_handler); SYSUNINIT(aac_unregister, SI_SUB_KLD, SI_ORDER_MIDDLE, linux_ioctl_unregister_handler, &aac_handler); MODULE_DEPEND(aac, linux, 1, 1, 1); static int aac_linux_ioctl(struct thread *td, struct linux_ioctl_args *args) { struct file *fp; u_long cmd; + int error; debug_called(2); - fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); cmd = args->cmd; /* * Pass the ioctl off to our standard handler. */ - return(fo_ioctl(fp, cmd, (caddr_t)args->arg, td)); + error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td)); + fdrop(fp, td); + return (error); } #endif /* * Return the Revision of the driver to userspace and check to see if the * userspace app is possibly compatible. This is extremely bogus since * our driver doesn't follow Adaptec's versioning system. Cheat by just * returning what the card reported. */ static int aac_rev_check(struct aac_softc *sc, caddr_t udata) { struct aac_rev_check rev_check; struct aac_rev_check_resp rev_check_resp; int error = 0; debug_called(2); /* * Copyin the revision struct from userspace */ if ((error = copyin(udata, (caddr_t)&rev_check, sizeof(struct aac_rev_check))) != 0) { return error; } debug(2, "Userland revision= %d\n", rev_check.callingRevision.buildNumber); /* * Doctor up the response struct. 
*/ rev_check_resp.possiblyCompatible = 1; rev_check_resp.adapterSWRevision.external.ul = sc->aac_revision.external.ul; rev_check_resp.adapterSWRevision.buildNumber = sc->aac_revision.buildNumber; return(copyout((caddr_t)&rev_check_resp, udata, sizeof(struct aac_rev_check_resp))); } /* * Pass the caller the next AIF in their queue */ static int aac_getnext_aif(struct aac_softc *sc, caddr_t arg) { struct get_adapter_fib_ioctl agf; int error, s; debug_called(2); if ((error = copyin(arg, &agf, sizeof(agf))) == 0) { /* * Check the magic number that we gave the caller. */ if (agf.AdapterFibContext != (int)sc->aifthread) { error = EFAULT; } else { s = splbio(); error = aac_return_aif(sc, agf.AifFib); if ((error == EAGAIN) && (agf.Wait)) { sc->aac_state |= AAC_STATE_AIF_SLEEPER; while (error == EAGAIN) { error = tsleep(sc->aac_aifq, PRIBIO | PCATCH, "aacaif", 0); if (error == 0) error = aac_return_aif(sc, agf.AifFib); } sc->aac_state &= ~AAC_STATE_AIF_SLEEPER; } splx(s); } } return(error); } /* * Hand the next AIF off the top of the queue out to userspace. */ static int aac_return_aif(struct aac_softc *sc, caddr_t uptr) { int error; debug_called(2); AAC_LOCK_ACQUIRE(&sc->aac_aifq_lock); if (sc->aac_aifq_tail == sc->aac_aifq_head) { error = EAGAIN; } else { error = copyout(&sc->aac_aifq[sc->aac_aifq_tail], uptr, sizeof(struct aac_aif_command)); if (error) printf("aac_return_aif: copyout returned %d\n", error); if (!error) sc->aac_aifq_tail = (sc->aac_aifq_tail + 1) % AAC_AIFQ_LENGTH; } AAC_LOCK_RELEASE(&sc->aac_aifq_lock); return(error); } /* * Give the userland some information about the container. The AAC arch * expects the driver to be a SCSI passthrough type driver, so it expects * the containers to have b:t:l numbers. Fake it. 
*/ static int aac_query_disk(struct aac_softc *sc, caddr_t uptr) { struct aac_query_disk query_disk; struct aac_container *co; struct aac_disk *disk; int error, id; debug_called(2); disk = NULL; error = copyin(uptr, (caddr_t)&query_disk, sizeof(struct aac_query_disk)); if (error) return (error); id = query_disk.ContainerNumber; if (id == -1) return (EINVAL); AAC_LOCK_ACQUIRE(&sc->aac_container_lock); TAILQ_FOREACH(co, &sc->aac_container_tqh, co_link) { if (co->co_mntobj.ObjectId == id) break; } if (co == NULL) { query_disk.Valid = 0; query_disk.Locked = 0; query_disk.Deleted = 1; /* XXX is this right? */ } else { disk = device_get_softc(co->co_disk); query_disk.Valid = 1; query_disk.Locked = (disk->ad_flags & AAC_DISK_OPEN) ? 1 : 0; query_disk.Deleted = 0; query_disk.Bus = device_get_unit(sc->aac_dev); query_disk.Target = disk->unit; query_disk.Lun = 0; query_disk.UnMapped = 0; bcopy(disk->ad_dev_t->si_name, &query_disk.diskDeviceName[0], 10); } AAC_LOCK_RELEASE(&sc->aac_container_lock); error = copyout((caddr_t)&query_disk, uptr, sizeof(struct aac_query_disk)); return (error); } Index: head/sys/dev/streams/streams.c =================================================================== --- head/sys/dev/streams/streams.c (revision 89305) +++ head/sys/dev/streams/streams.c (revision 89306) @@ -1,410 +1,428 @@ /* * Copyright (c) 1998 Mark Newton * Copyright (c) 1994 Christos Zoulas * Copyright (c) 1997 Todd Vierling * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
The names of the authors may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Stolen from NetBSD /sys/compat/svr4/svr4_net.c. Pseudo-device driver * skeleton produced from /usr/share/examples/drivers/make_pseudo_driver.sh * in 3.0-980524-SNAP then hacked a bit (but probably not enough :-). 
* * $FreeBSD$ */ #include #include #include /* SYSINIT stuff */ #include /* cdevsw stuff */ #include /* malloc region definitions */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int svr4_soo_close __P((struct file *, struct thread *)); static int svr4_ptm_alloc __P((struct thread *)); static d_open_t streamsopen; struct svr4_sockcache_entry { struct proc *p; /* Process for the socket */ void *cookie; /* Internal cookie used for matching */ struct sockaddr_un sock;/* Pathname for the socket */ dev_t dev; /* Device where the socket lives on */ ino_t ino; /* Inode where the socket lives on */ TAILQ_ENTRY(svr4_sockcache_entry) entries; }; TAILQ_HEAD(svr4_sockcache_head, svr4_sockcache_entry) svr4_head; /* Initialization flag (set/queried by svr4_mod LKM) */ int svr4_str_initialized = 0; /* * Device minor numbers */ enum { dev_ptm = 10, dev_arp = 26, dev_icmp = 27, dev_ip = 28, dev_tcp = 35, dev_udp = 36, dev_rawip = 37, dev_unix_dgram = 38, dev_unix_stream = 39, dev_unix_ord_stream = 40 }; static dev_t dt_ptm, dt_arp, dt_icmp, dt_ip, dt_tcp, dt_udp, dt_rawip, dt_unix_dgram, dt_unix_stream, dt_unix_ord_stream; static struct fileops svr4_netops = { soo_read, soo_write, soo_ioctl, soo_poll, sokqfilter, soo_stat, svr4_soo_close }; #define CDEV_MAJOR 103 static struct cdevsw streams_cdevsw = { /* open */ streamsopen, /* close */ noclose, /* read */ noread, /* write */ nowrite, /* ioctl */ noioctl, /* poll */ nopoll, /* mmap */ nommap, /* strategy */ nostrategy, /* name */ "streams", /* maj */ CDEV_MAJOR, /* dump */ nodump, /* psize */ nopsize, /* flags */ 0, }; struct streams_softc { struct isa_device *dev; } ; #define UNIT(dev) minor(dev) /* assume one minor number per unit */ typedef struct streams_softc *sc_p; static int streams_modevent(module_t mod, int type, void *unused) { switch (type) { case MOD_LOAD: /* XXX should make sure 
it isn't already loaded first */ dt_ptm = make_dev(&streams_cdevsw, dev_ptm, 0, 0, 0666, "ptm"); dt_arp = make_dev(&streams_cdevsw, dev_arp, 0, 0, 0666, "arp"); dt_icmp = make_dev(&streams_cdevsw, dev_icmp, 0, 0, 0666, "icmp"); dt_ip = make_dev(&streams_cdevsw, dev_ip, 0, 0, 0666, "ip"); dt_tcp = make_dev(&streams_cdevsw, dev_tcp, 0, 0, 0666, "tcp"); dt_udp = make_dev(&streams_cdevsw, dev_udp, 0, 0, 0666, "udp"); dt_rawip = make_dev(&streams_cdevsw, dev_rawip, 0, 0, 0666, "rawip"); dt_unix_dgram = make_dev(&streams_cdevsw, dev_unix_dgram, 0, 0, 0666, "ticlts"); dt_unix_stream = make_dev(&streams_cdevsw, dev_unix_stream, 0, 0, 0666, "ticots"); dt_unix_ord_stream = make_dev(&streams_cdevsw, dev_unix_ord_stream, 0, 0, 0666, "ticotsord"); if (! (dt_ptm && dt_arp && dt_icmp && dt_ip && dt_tcp && dt_udp && dt_rawip && dt_unix_dgram && dt_unix_stream && dt_unix_ord_stream)) { printf("WARNING: device config for STREAMS failed\n"); printf("Suggest unloading streams KLD\n"); } return 0; case MOD_UNLOAD: /* XXX should check to see if it's busy first */ destroy_dev(dt_ptm); destroy_dev(dt_arp); destroy_dev(dt_icmp); destroy_dev(dt_ip); destroy_dev(dt_tcp); destroy_dev(dt_udp); destroy_dev(dt_rawip); destroy_dev(dt_unix_dgram); destroy_dev(dt_unix_stream); destroy_dev(dt_unix_ord_stream); return 0; default: break; } return 0; } static moduledata_t streams_mod = { "streams", streams_modevent, 0 }; DECLARE_MODULE(streams, streams_mod, SI_SUB_DRIVERS, SI_ORDER_ANY); MODULE_VERSION(streams, 1); /* * We only need open() and close() routines. open() calls socreate() * to allocate a "real" object behind the stream and mallocs some state * info for use by the svr4 emulator; close() deallocates the state * information and passes the underlying object to the normal socket close * routine. 
*/ static int streamsopen(dev_t dev, int oflags, int devtype, struct thread *td) { int type, protocol; int fd; struct file *fp; struct socket *so; int error; int family; struct proc *p = td->td_proc; PROC_LOCK(p); if (td->td_dupfd >= 0) { PROC_UNLOCK(p); return ENODEV; } PROC_UNLOCK(p); switch (minor(dev)) { case dev_udp: family = AF_INET; type = SOCK_DGRAM; protocol = IPPROTO_UDP; break; case dev_tcp: family = AF_INET; type = SOCK_STREAM; protocol = IPPROTO_TCP; break; case dev_ip: case dev_rawip: family = AF_INET; type = SOCK_RAW; protocol = IPPROTO_IP; break; case dev_icmp: family = AF_INET; type = SOCK_RAW; protocol = IPPROTO_ICMP; break; case dev_unix_dgram: family = AF_LOCAL; type = SOCK_DGRAM; protocol = 0; break; case dev_unix_stream: case dev_unix_ord_stream: family = AF_LOCAL; type = SOCK_STREAM; protocol = 0; break; case dev_ptm: return svr4_ptm_alloc(td); default: return EOPNOTSUPP; } if ((error = falloc(td, &fp, &fd)) != 0) return error; if ((error = socreate(family, &so, type, protocol, td->td_proc->p_ucred, td)) != 0) { + FILEDESC_LOCK(p->p_fd); p->p_fd->fd_ofiles[fd] = 0; + FILEDESC_UNLOCK(p->p_fd); ffree(fp); return error; } + FILEDESC_LOCK(p->p_fd); fp->f_data = (caddr_t)so; fp->f_flag = FREAD|FWRITE; fp->f_ops = &svr4_netops; fp->f_type = DTYPE_SOCKET; + FILEDESC_UNLOCK(p->p_fd); (void)svr4_stream_get(fp); PROC_LOCK(p); td->td_dupfd = fd; PROC_UNLOCK(p); return ENXIO; } static int svr4_ptm_alloc(td) struct thread *td; { struct proc *p = td->td_proc; /* * XXX this is very, very ugly. But I can't find a better * way that won't duplicate a big amount of code from * sys_open(). Ho hum... * * Fortunately for us, Solaris (at least 2.5.1) makes the * /dev/ptmx open automatically just open a pty, that (after * STREAMS I_PUSHes), is just a plain pty. fstat() is used * to get the minor device number to map to a tty. * * Cycle through the names. If sys_open() returns ENOENT (or * ENXIO), short circuit the cycle and exit. 
*/ static char ptyname[] = "/dev/ptyXX"; static char ttyletters[] = "pqrstuwxyzPQRST"; static char ttynumbers[] = "0123456789abcdef"; caddr_t sg = stackgap_init(); char *path = stackgap_alloc(&sg, sizeof(ptyname)); struct open_args oa; int l = 0, n = 0; register_t fd = -1; int error; SCARG(&oa, path) = path; SCARG(&oa, flags) = O_RDWR; SCARG(&oa, mode) = 0; while (fd == -1) { ptyname[8] = ttyletters[l]; ptyname[9] = ttynumbers[n]; if ((error = copyout(ptyname, path, sizeof(ptyname))) != 0) return error; switch (error = open(td, &oa)) { case ENOENT: case ENXIO: return error; case 0: PROC_LOCK(p); td->td_dupfd = td->td_retval[0]; PROC_UNLOCK(p); return ENXIO; default: if (ttynumbers[++n] == '\0') { if (ttyletters[++l] == '\0') break; n = 0; } } } return ENOENT; } struct svr4_strm * svr4_stream_get(fp) struct file *fp; { struct socket *so; struct svr4_strm *st; if (fp == NULL || fp->f_type != DTYPE_SOCKET) return NULL; so = (struct socket *) fp->f_data; - if (so->so_emuldata) + /* + * mpfixme: lock socketbuffer here + */ + if (so->so_emuldata) { return so->so_emuldata; + } /* Allocate a new one. */ st = malloc(sizeof(struct svr4_strm), M_TEMP, M_WAITOK); st->s_family = so->so_proto->pr_domain->dom_family; st->s_cmd = ~0; st->s_afd = -1; st->s_eventmask = 0; - so->so_emuldata = st; - fp->f_ops = &svr4_netops; + /* + * avoid a race where we loose due to concurrancy issues + * of two threads trying to allocate the so_emuldata. 
+ */ + if (so->so_emuldata) { + /* lost the race, use the existing emuldata */ + FREE(st, M_TEMP); + st = so->so_emuldata; + } else { + /* we won, or there was no race, use our copy */ + so->so_emuldata = st; + fp->f_ops = &svr4_netops; + } return st; } void svr4_delete_socket(p, fp) struct proc *p; struct file *fp; { struct svr4_sockcache_entry *e; void *cookie = ((struct socket *) fp->f_data)->so_emuldata; while (svr4_str_initialized != 2) { if (atomic_cmpset_acq_int(&svr4_str_initialized, 0, 1)) { TAILQ_INIT(&svr4_head); atomic_store_rel_int(&svr4_str_initialized, 2); } return; } TAILQ_FOREACH(e, &svr4_head, entries) if (e->p == p && e->cookie == cookie) { TAILQ_REMOVE(&svr4_head, e, entries); DPRINTF(("svr4_delete_socket: %s [%p,%d,%d]\n", e->sock.sun_path, p, (int)e->dev, e->ino)); free(e, M_TEMP); return; } } static int svr4_soo_close(struct file *fp, struct thread *td) { struct socket *so = (struct socket *)fp->f_data; /* CHECKUNIT_DIAG(ENXIO);*/ svr4_delete_socket(td->td_proc, fp); free(so->so_emuldata, M_TEMP); return soo_close(fp, td); - return (0); } Index: head/sys/dev/tdfx/tdfx_pci.c =================================================================== --- head/sys/dev/tdfx/tdfx_pci.c (revision 89305) +++ head/sys/dev/tdfx/tdfx_pci.c (revision 89306) @@ -1,866 +1,870 @@ /* * Copyright (c) 2000-2001 by Coleman Kane * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Gardner Buchanan. * 4. The name of Gardner Buchanan may not be used to endorse or promote * products derived from this software without specific prior written * permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /* 3dfx driver for FreeBSD 4.x - Finished 11 May 2000, 12:25AM ET * * Copyright (C) 2000-2001, by Coleman Kane , * based upon the 3dfx driver written for linux, by Daryll Straus, Jon Taylor, * and Jens Axboe, located at http://linux.3dfx.com. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* rman.h depends on machine/bus.h */ #include #include #include /* This must come first */ #include "opt_tdfx.h" #ifdef TDFX_LINUX #include #endif #include #include #include static devclass_t tdfx_devclass; static int tdfx_count = 0; /* Set up the boot probe/attach routines */ static device_method_t tdfx_methods[] = { DEVMETHOD(device_probe, tdfx_probe), DEVMETHOD(device_attach, tdfx_attach), DEVMETHOD(device_detach, tdfx_detach), DEVMETHOD(device_shutdown, tdfx_shutdown), { 0, 0 } }; MALLOC_DEFINE(M_TDFX,"TDFX Driver","3DFX Graphics[/2D]/3D Accelerator(s)"); #ifdef TDFX_LINUX MODULE_DEPEND(tdfx, linux, 1, 1, 1); LINUX_IOCTL_SET(tdfx, LINUX_IOCTL_TDFX_MIN, LINUX_IOCTL_TDFX_MAX); #endif /* Char. Dev. file operations structure */ static struct cdevsw tdfx_cdev = { tdfx_open, /* open */ tdfx_close, /* close */ noread, /* read */ nowrite, /* write */ tdfx_ioctl, /* ioctl */ nopoll, /* poll */ tdfx_mmap, /* mmap */ nostrategy, /* strategy */ "tdfx", /* dev name */ CDEV_MAJOR, /* char major */ nodump, /* dump */ nopsize, /* size */ 0, /* flags (no set flags) */ }; static int tdfx_probe(device_t dev) { /* * probe routine called on kernel boot to register supported devices. We get * a device structure to work with, and we can test the VENDOR/DEVICE IDs to * see if this PCI device is one that we support. Return 0 if yes, ENXIO if * not. 
*/ switch(pci_get_devid(dev)) { case PCI_DEVICE_ALLIANCE_AT3D: device_set_desc(dev, "ProMotion At3D 3D Accelerator"); return 0; case PCI_DEVICE_3DFX_VOODOO2: device_set_desc(dev, "3DFX Voodoo II 3D Accelerator"); return 0; /*case PCI_DEVICE_3DFX_BANSHEE: device_set_desc(dev, "3DFX Voodoo Banshee 2D/3D Graphics Accelerator"); return 0; case PCI_DEVICE_3DFX_VOODOO3: device_set_desc(dev, "3DFX Voodoo3 2D/3D Graphics Accelerator"); return 0;*/ case PCI_DEVICE_3DFX_VOODOO1: device_set_desc(dev, "3DFX Voodoo Graphics 3D Accelerator"); return 0;; }; return ENXIO; } static int tdfx_attach(device_t dev) { /* * The attach routine is called after the probe routine successfully says it * supports a given card. We now proceed to initialize this card for use with * the system. I want to map the device memory for userland allocation and * fill an information structure with information on this card. I'd also like * to set Write Combining with the MTRR code so that we can hopefully speed * up memory writes. The last thing is to register the character device * interface to the card, so we can open it from /dev/3dfxN, where N is a * small, whole number. 
*/ struct tdfx_softc *tdfx_info; u_long val; /* rid value tells bus_alloc_resource where to find the addresses of ports or * of memory ranges in the PCI config space*/ int rid = PCIR_MAPS; /* Increment the card counter (for the ioctl code) */ tdfx_count++; /* Enable MemMap on Voodoo */ val = pci_read_config(dev, PCIR_COMMAND, 2); val |= (PCIM_CMD_MEMEN); pci_write_config(dev, PCIR_COMMAND, val, 2); val = pci_read_config(dev, PCIR_COMMAND, 2); /* Fill the soft config struct with info about this device*/ tdfx_info = device_get_softc(dev); tdfx_info->dev = dev; tdfx_info->vendor = pci_get_vendor(dev); tdfx_info->type = pci_get_devid(dev) >> 16; tdfx_info->bus = pci_get_bus(dev); tdfx_info->dv = pci_get_slot(dev); tdfx_info->curFile = NULL; /* * Get the Memory Location from the PCI Config, mask out lower word, since * the config space register is only one word long (this is nicer than a * bitshift). */ tdfx_info->addr0 = (pci_read_config(dev, 0x10, 4) & 0xffff0000); #ifdef DEBUG device_printf(dev, "Base0 @ 0x%x\n", tdfx_info->addr0); #endif /* Notify the VM that we will be mapping some memory later */ tdfx_info->memrange = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, ~0, 1, RF_ACTIVE | RF_SHAREABLE); if(tdfx_info->memrange == NULL) { #ifdef DEBUG device_printf(dev, "Error mapping mem, won't be able to use mmap()\n"); #endif tdfx_info->memrid = 0; } else { tdfx_info->memrid = rid; #ifdef DEBUG device_printf(dev, "Mapped to: 0x%x\n", (unsigned int)rman_get_start(tdfx_info->memrange)); #endif } /* Setup for Voodoo3 and Banshee, PIO and an extram Memrange */ if(pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO3 || pci_get_devid(dev) == PCI_DEVICE_3DFX_BANSHEE) { rid = 0x14; /* 2nd mem map */ tdfx_info->addr1 = (pci_read_config(dev, 0x14, 4) & 0xffff0000); #ifdef DEBUG device_printf(dev, "Base1 @ 0x%x\n", tdfx_info->addr1); #endif tdfx_info->memrange2 = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, ~0, 1, RF_ACTIVE | RF_SHAREABLE); if(tdfx_info->memrange2 == NULL) { 
#ifdef DEBUG device_printf(dev, "Mem1 couldn't be allocated, glide may not work."); #endif tdfx_info->memrid2 = 0; } else { tdfx_info->memrid2 = rid; } /* Now to map the PIO stuff */ rid = PCIR_IOBASE0_2; tdfx_info->pio0 = pci_read_config(dev, 0x2c, 2); tdfx_info->pio0max = pci_read_config(dev, 0x30, 2) + tdfx_info->pio0; tdfx_info->piorange = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid, 0, ~0, 1, RF_ACTIVE | RF_SHAREABLE); if(tdfx_info->piorange == NULL) { #ifdef DEBUG device_printf(dev, "Couldn't map PIO range."); #endif tdfx_info->piorid = 0; } else { tdfx_info->piorid = rid; } } else { tdfx_info->addr1 = 0; tdfx_info->memrange2 = NULL; tdfx_info->piorange = NULL; } /* * Set Writecombining, or at least Uncacheable for the memory region, if we * are able to */ if(tdfx_setmtrr(dev) != 0) { #ifdef DEBUG device_printf(dev, "Some weird error setting MTRRs"); #endif return -1; } /* * make_dev registers the cdev to access the 3dfx card from /dev * use hex here for the dev num, simply to provide better support if > 10 * voodoo cards, for the mad. The user must set the link, or use MAKEDEV. * Why would we want that many voodoo cards anyhow? 
*/ tdfx_info->devt = make_dev(&tdfx_cdev, dev->unit, 0, 0, 02660, "3dfx%x", dev->unit); return 0; } static int tdfx_detach(device_t dev) { struct tdfx_softc* tdfx_info; int retval; tdfx_info = device_get_softc(dev); /* Delete allocated resource, of course */ bus_release_resource(dev, SYS_RES_MEMORY, tdfx_info->memrid, tdfx_info->memrange); /* Release extended Voodoo3/Banshee resources */ if(pci_get_devid(dev) == PCI_DEVICE_3DFX_BANSHEE || pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO3) { if(tdfx_info->memrange2 != NULL) bus_release_resource(dev, SYS_RES_MEMORY, tdfx_info->memrid2, tdfx_info->memrange); /* if(tdfx_info->piorange != NULL) bus_release_resource(dev, SYS_RES_IOPORT, tdfx_info->piorid, tdfx_info->piorange);*/ } /* Though it is safe to leave the WRCOMB support since the mem driver checks for it, we should remove it in order to free an MTRR for another device */ retval = tdfx_clrmtrr(dev); #ifdef DEBUG if(retval != 0) printf("tdfx: For some reason, I couldn't clear the mtrr\n"); #endif /* Remove device entry when it can no longer be accessed */ destroy_dev(tdfx_info->devt); return(0); } static int tdfx_shutdown(device_t dev) { #ifdef DEBUG device_printf(dev, "tdfx: Device Shutdown\n"); #endif return 0; } static int tdfx_clrmtrr(device_t dev) { /* This function removes the MTRR set by the attach call, so it can be used * in the future by other drivers. */ int retval, act; struct tdfx_softc *tdfx_info = device_get_softc(dev); act = MEMRANGE_SET_REMOVE; retval = mem_range_attr_set(&tdfx_info->mrdesc, &act); return retval; } static int tdfx_setmtrr(device_t dev) { /* * This is the MTRR setting function for the 3dfx card. It is called from * tdfx_attach. If we can't set the MTRR properly, it's not the end of the * world. We can still continue, just with slightly (very slightly) degraded * performance. 
 */
	int retval = 0, act;
	struct tdfx_softc *tdfx_info = device_get_softc(dev);

	/* The older Voodoo cards have a shorter memrange than the newer ones */
	if((pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO1) ||
	   (pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO2)) {
		tdfx_info->mrdesc.mr_len = 0x400000;
		/* The memory descriptor is described as the top 15 bits of the real
		   address */
		tdfx_info->mrdesc.mr_base = tdfx_info->addr0 & 0xfffe0000;
	}
	else if((pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO3) ||
	        (pci_get_devid(dev) == PCI_DEVICE_3DFX_BANSHEE)) {
		tdfx_info->mrdesc.mr_len = 0x1000000;
		/* The Voodoo3 and Banshee LFB is the second memory address */
		/* The memory descriptor is described as the top 15 bits of the real
		   address */
		tdfx_info->mrdesc.mr_base = tdfx_info->addr1 & 0xfffe0000;
	}
	else
		return 0;
	/*
	 * The Alliance Pro Motion AT3D was not mentioned in the linux
	 * driver as far as MTRR support goes, so I just won't put the
	 * code in here for it. This is where it should go, though.
	 */

	/* Firstly, try to set write combining */
	tdfx_info->mrdesc.mr_flags = MDF_WRITECOMBINE;
	bcopy("tdfx", &tdfx_info->mrdesc.mr_owner, 4);
	act = MEMRANGE_SET_UPDATE;
	retval = mem_range_attr_set(&tdfx_info->mrdesc, &act);

	if(retval == 0) {
#ifdef DEBUG
		device_printf(dev, "MTRR Set Correctly for tdfx\n");
#endif
	} else if((pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO2) ||
	          (pci_get_devid(dev) == PCI_DEVICE_3DFX_VOODOO1)) {
		/* if, for some reason we can't set the WRCOMB range with the V1/V2, we
		 * can still possibly use the UNCACHEABLE region for it instead, and help
		 * out in a small way */
		tdfx_info->mrdesc.mr_flags = MDF_UNCACHEABLE;
		/* This length of 1000h was taken from the linux device driver... */
		tdfx_info->mrdesc.mr_len = 0x1000;
		/*
		 * NOTE(review): no second mem_range_attr_set() call is made
		 * after switching to MDF_UNCACHEABLE, so the fallback
		 * descriptor prepared above is never actually installed and
		 * retval still holds the earlier failure code when the DEBUG
		 * block below tests it -- TODO confirm whether a retry call
		 * was intended here.
		 */
		/*
		 * If, for some reason, we can't set the MTRR (N/A?)
		 * we may still continue
		 */
#ifdef DEBUG
		if(retval == 0) {
			device_printf(dev, "MTRR Set Type Uncacheable %x\n",
			    (u_int32_t)tdfx_info->mrdesc.mr_base);
		} else {
			device_printf(dev, "Couldn't Set MTRR\n");
		}
#endif
	}
	/* NOTE(review): this else is only compiled in under DEBUG, so the
	 * statement structure differs between DEBUG and non-DEBUG builds. */
#ifdef DEBUG
	else {
		device_printf(dev, "Couldn't Set MTRR\n");
		return 0;
	}
#endif
	return 0;
}

static int
tdfx_open(dev_t dev, int flags, int fmt, struct thread *td)
{
	/*
	 * The open cdev method handles open(2) calls to /dev/3dfx[n].
	 * Only one opener is allowed at a time (busy flag).
	 */
	struct tdfx_softc *tdfx_info = devclass_get_softc(tdfx_devclass,
	    UNIT(minor(dev)));
	if(tdfx_info->busy != 0) return EBUSY;
#ifdef DEBUG
	printf("3dfx: Opened by #%d\n", td->td_proc->p_pid);
#endif
	/* Set the driver as busy */
	tdfx_info->busy++;
	return 0;
}

static int
tdfx_close(dev_t dev, int fflag, int devtype, struct thread *td)
{
	/*
	 * The close cdev method handles close(2) calls to /dev/3dfx[n].
	 * We'll always want to close the device when it's called; clears the
	 * busy flag set in tdfx_open().
	 */
	struct tdfx_softc *tdfx_info = devclass_get_softc(tdfx_devclass,
	    UNIT(minor(dev)));
	if(tdfx_info->busy == 0) return EBADF;
	tdfx_info->busy = 0;
#ifdef DEBUG
	printf("Closed by #%d\n", td->td_proc->p_pid);
#endif
	return 0;
}

static int
tdfx_mmap(dev_t dev, vm_offset_t offset, int nprot)
{
	/*
	 * mmap(2) is called by a user process to request that an area of memory
	 * associated with this device be mapped for the process to work with. Nprot
	 * holds the protections requested, PROT_READ, PROT_WRITE, or both.
	 */

	/**** OLD GET CONFIG ****/
	/* struct tdfx_softc* tdfx_info; */

	/* Get the configuration for our card XXX*/
	/*tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass,
	    UNIT(minor(dev)));*/
	/************************/

	/* Hardwired to at most two cards: probe softc unit 0 first, then
	 * unit 1 if a second card was counted at attach time. */
	struct tdfx_softc* tdfx_info[2];
	tdfx_info[0] = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, 0);

	/* If, for some reason, its not configured, we bail out */
	if(tdfx_info[0] == NULL) {
#ifdef DEBUG
		printf("tdfx: tdfx_info (softc) is NULL\n");
#endif
		return -1;
	}

	/* We must stay within the bound of our address space */
	/* Match the top byte of the requested offset against card 0's BAR0. */
	if((offset & 0xff000000) == tdfx_info[0]->addr0) {
		offset &= 0xffffff;
		return atop(rman_get_start(tdfx_info[0]->memrange) + offset);
	}

	if(tdfx_count > 1) {
		tdfx_info[1] = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, 1);
		if((offset & 0xff000000) == tdfx_info[1]->addr0) {
			offset &= 0xffffff;
			return atop(rman_get_start(tdfx_info[1]->memrange) + offset);
		}
	}

	/* See if the Banshee/V3 LFB is being requested */
	/*if(tdfx_info->memrange2 != NULL && (offset & 0xff000000) ==
	      tdfx_info->addr1) {
		offset &= 0xffffff;
		return atop(rman_get_start(tdfx_info[1]->memrange2) + offset);
	}*/ /* VoodooNG code */

	/* The ret call */
	/* atop -> address to page
	 * rman_get_start, get the (struct resource*)->r_start member,
	 * the mapping base address.
*/ return -1; } static int tdfx_query_boards(void) { /* * This returns the number of installed tdfx cards, we have been keeping * count, look at tdfx_attach */ return tdfx_count; } static int tdfx_query_fetch(u_int cmd, struct tdfx_pio_data *piod) { /* XXX Comment this later, after careful inspection and spring cleaning :) */ /* Various return values 8bit-32bit */ u_int8_t ret_byte; u_int16_t ret_word; u_int32_t ret_dword; struct tdfx_softc* tdfx_info = NULL; /* This one depend on the tdfx_* structs being properly initialized */ /*piod->device &= 0xf;*/ if((piod == NULL) ||(tdfx_count <= piod->device) || (piod->device < 0)) { #ifdef DEBUG printf("tdfx: Bad device or internal struct in tdfx_query_fetch\n"); #endif return -EINVAL; } tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); if(tdfx_info == NULL) return -ENXIO; /* We must restrict the size reads from the port, since to high or low of a * size witll result in wrong data being passed, and that's bad */ /* A few of these were pulled during the attach phase */ switch(piod->port) { case PCI_VENDOR_ID_FREEBSD: if(piod->size != 2) return -EINVAL; copyout(&tdfx_info->vendor, piod->value, piod->size); return 0; case PCI_DEVICE_ID_FREEBSD: if(piod->size != 2) return -EINVAL; copyout(&tdfx_info->type, piod->value, piod->size); return 0; case PCI_BASE_ADDRESS_0_FREEBSD: if(piod->size != 4) return -EINVAL; copyout(&tdfx_info->addr0, piod->value, piod->size); return 0; case PCI_BASE_ADDRESS_1_FREEBSD: if(piod->size != 4) return -EINVAL; copyout(&tdfx_info->addr1, piod->value, piod->size); return 0; case PCI_PRIBUS_FREEBSD: if(piod->size != 1) return -EINVAL; break; case PCI_IOBASE_0_FREEBSD: if(piod->size != 2) return -EINVAL; break; case PCI_IOLIMIT_0_FREEBSD: if(piod->size != 2) return -EINVAL; break; case SST1_PCI_SPECIAL1_FREEBSD: if(piod->size != 4) return -EINVAL; break; case PCI_REVISION_ID_FREEBSD: if(piod->size != 1) return -EINVAL; break; case SST1_PCI_SPECIAL4_FREEBSD: if(piod->size 
!= 4) return -EINVAL; break; default: return -EINVAL; } /* Read the value and return */ switch(piod->size) { case 1: ret_byte = pci_read_config(tdfx_info[piod->device].dev, piod->port, 1); copyout(&ret_byte, piod->value, 1); break; case 2: ret_word = pci_read_config(tdfx_info[piod->device].dev, piod->port, 2); copyout(&ret_word, piod->value, 2); break; case 4: ret_dword = pci_read_config(tdfx_info[piod->device].dev, piod->port, 4); copyout(&ret_dword, piod->value, 4); break; default: return -EINVAL; } return 0; } static int tdfx_query_update(u_int cmd, struct tdfx_pio_data *piod) { /* XXX Comment this later, after careful inspection and spring cleaning :) */ /* Return vals */ u_int8_t ret_byte; u_int16_t ret_word; u_int32_t ret_dword; /* Port vals, mask */ u_int32_t retval, preval, mask; struct tdfx_softc* tdfx_info = NULL; if((piod == NULL) || (piod->device >= (tdfx_count & 0xf))) { #ifdef DEBUG printf("tdfx: Bad struct or device in tdfx_query_update\n"); #endif return -EINVAL; } tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); if(tdfx_info == NULL) return -ENXIO; /* Code below this line in the fuction was taken from the * Linux driver and converted for freebsd. 
	 */

	/* Check the size for all the ports, to make sure stuff doesn't get messed up
	 * by poorly written clients */
	switch(piod->port) {
	case PCI_COMMAND_FREEBSD:
		if(piod->size != 2) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL1_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL2_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL3_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	case SST1_PCI_SPECIAL4_FREEBSD:
		if(piod->size != 4) return -EINVAL;
		break;
	default:
		return -EINVAL;
	}
	/* Read the current value (the config register containing the
	 * aligned dword that holds piod->port) */
	retval = pci_read_config(tdfx_info->dev, piod->port & ~3, 4);

	/* These set up a mask to use, since apparently they wanted to write 4 bytes
	 * at once to the ports */
	switch (piod->size) {
	case 1:
		copyin(piod->value, &ret_byte, 1);
		/* Shift the byte into its lane within the aligned dword */
		preval = ret_byte << (8 * (piod->port & 0x3));
		mask = 0xff << (8 * (piod->port & 0x3));
		break;
	case 2:
		copyin(piod->value, &ret_word, 2);
		preval = ret_word << (8 * (piod->port & 0x3));
		mask = 0xffff << (8 * (piod->port & 0x3));
		break;
	case 4:
		copyin(piod->value, &ret_dword, 4);
		preval = ret_dword;
		mask = ~0;
		break;
	default:
		return -EINVAL;
	}
	/* Finally, combine the values and write it to the port */
	retval = (retval & ~mask) | preval;
	pci_write_config(tdfx_info->dev, piod->port & ~3, retval, 4);

	return 0;
}

/* For both of these, I added a variable named workport of type u_int so
 * that I could eliminate the warning about my data type size.
The * applications expect the port to be of type short, so I needed to change * this within the function */ static int tdfx_do_pio_rd(struct tdfx_pio_data *piod) { /* Return val */ u_int8_t ret_byte; u_int workport; struct tdfx_softc *tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); /* Restricts the access of ports other than those we use */ if(((piod->port != VGA_INPUT_STATUS_1C) || (piod->port != SC_INDEX) || (piod->port != SC_DATA) || (piod->port != VGA_MISC_OUTPUT_READ)) && (piod->port < tdfx_info->pio0) && (piod->port > tdfx_info->pio0max)) return -EPERM; /* All VGA STATUS REGS are byte registers, size should never be > 1 */ if(piod->size != 1) { return -EINVAL; } /* Write the data to the intended port */ workport = piod->port; ret_byte = inb(workport); copyout(&ret_byte, piod->value, sizeof(u_int8_t)); return 0; } static int tdfx_do_pio_wt(struct tdfx_pio_data *piod) { /* return val */ u_int8_t ret_byte; u_int workport; struct tdfx_softc *tdfx_info = (struct tdfx_softc*)devclass_get_softc(tdfx_devclass, piod->device); /* Replace old switch w/ massive if(...) */ /* Restricts the access of ports other than those we use */ if(((piod->port != SC_INDEX) && (piod->port != SC_DATA) && (piod->port != VGA_MISC_OUTPUT_READ)) /* Can't write VGA_ST_1C */ && (piod->port < tdfx_info->pio0) && (piod->port > tdfx_info->pio0max)) return -EPERM; /* All VGA STATUS REGS are byte registers, size should never be > 1 */ if(piod->size != 1) { return -EINVAL; } /* Write the data to the intended port */ copyin(piod->value, &ret_byte, sizeof(u_int8_t)); workport = piod->port; outb(workport, ret_byte); return 0; } static int tdfx_do_query(u_int cmd, struct tdfx_pio_data *piod) { /* There are three sub-commands to the query 0x33 */ switch(_IOC_NR(cmd)) { case 2: return tdfx_query_boards(); break; case 3: return tdfx_query_fetch(cmd, piod); break; case 4: return tdfx_query_update(cmd, piod); break; default: /* In case we are thrown a bogus sub-command! 
*/ #ifdef DEBUG printf("Bad Sub-cmd: 0x%x\n", _IOC_NR(cmd)); #endif return -EINVAL; }; } static int tdfx_do_pio(u_int cmd, struct tdfx_pio_data *piod) { /* Two types of PIO, INPUT and OUTPUT, as the name suggests */ switch(_IOC_DIR(cmd)) { case IOCV_OUT: return tdfx_do_pio_rd(piod); break; case IOCV_IN: return tdfx_do_pio_wt(piod); break; default: return -EINVAL; }; } /* Calls to ioctl(2) eventually end up here. Unhandled ioctls return an ENXIO, * normally, you would read in the data pointed to by data, then write your * output to it. The ioctl *should* normally return zero if everything is * alright, but 3dfx didn't make it that way... * * For all of the ioctl code, in the event of a real error, * we return -Exxxx rather than simply Exxxx. The reason for this * is that the ioctls actually RET information back to the program * sometimes, rather than filling it in the passed structure. We * want to distinguish errors from useful data, and maintain compatibility. * * There is this portion of the proc struct called p_retval[], we can store a * return value in td->td_retval[0] and place the return value if it is positive * in there, then we can return 0 (good). If the return value is negative, we * can return -retval and the error should be properly handled. 
*/ static int tdfx_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td) { int retval = 0; struct tdfx_pio_data *piod = (struct tdfx_pio_data*)data; #ifdef DEBUG printf("IOCTL'd by #%d, cmd: 0x%x, data: 0x%x\n", td->td_proc->p_pid, (u_int32_t)cmd, (unsigned int)piod); #endif switch(_IOC_TYPE(cmd)) { /* Return the real error if negative, or simply stick the valid return * in td->td_retval */ case 0x33: /* The '3'(0x33) type IOCTL is for querying the installed cards */ if((retval = tdfx_do_query(cmd, piod)) > 0) td->td_retval[0] = retval; else return -retval; break; case 0: /* The 0 type IOCTL is for programmed I/O methods */ if((tdfx_do_pio(cmd, piod)) > 0) td->td_retval[0] = retval; else return -retval; break; default: /* Technically, we won't reach this from linux emu, but when glide * finally gets ported, watch out! */ #ifdef DEBUG printf("Bad IOCTL from #%d\n", td->td_proc->p_pid); #endif return ENXIO; } return 0; } #ifdef TDFX_LINUX /* * Linux emulation IOCTL for /dev/tdfx */ static int linux_ioctl_tdfx(struct thread *td, struct linux_ioctl_args* args) { int error = 0; u_long cmd = args->cmd & 0xffff; /* The structure passed to ioctl has two shorts, one int and one void*. */ char d_pio[2*sizeof(short) + sizeof(int) + sizeof(void*)]; - struct file *fp = td->td_proc->p_fd->fd_ofiles[args->fd]; + struct file *fp; + fp = ffind_hold(td, args->fd); + if (fp == NULL) + return (EBADF); /* We simply copy the data and send it right to ioctl */ copyin((caddr_t)args->arg, &d_pio, sizeof(d_pio)); error = fo_ioctl(fp, cmd, (caddr_t)&d_pio, td); + fdrop(fp, td); return error; } #endif /* TDFX_LINUX */ /* This is the device driver struct. This is sent to the driver subsystem to * register the method structure and the info strcut space for this particular * instance of the driver. */ static driver_t tdfx_driver = { "tdfx", tdfx_methods, sizeof(struct tdfx_softc), }; /* Tell Mr. Kernel about us! 
*/ DRIVER_MODULE(tdfx, pci, tdfx_driver, tdfx_devclass, 0, 0); Index: head/sys/fs/fdescfs/fdesc_vfsops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vfsops.c (revision 89305) +++ head/sys/fs/fdescfs/fdesc_vfsops.c (revision 89306) @@ -1,225 +1,227 @@ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vfsops.c 8.4 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FDESCMNT, "FDESC mount", "FDESC mount structure"); static int fdesc_mount __P((struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct thread *td)); static int fdesc_unmount __P((struct mount *mp, int mntflags, struct thread *td)); static int fdesc_statfs __P((struct mount *mp, struct statfs *sbp, struct thread *td)); /* * Mount the per-process file descriptors (/dev/fd) */ static int fdesc_mount(mp, path, data, ndp, td) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct thread *td; { int error = 0; struct fdescmount *fmp; struct vnode *rvp; /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); error = fdesc_allocvp(Froot, FD_ROOT, mp, &rvp, td); if (error) return (error); MALLOC(fmp, struct fdescmount *, sizeof(struct fdescmount), M_FDESCMNT, M_WAITOK); /* XXX */ rvp->v_type = VDIR; rvp->v_flag |= VROOT; fmp->f_root = rvp; /* XXX -- don't mark as local to work around fts() problems */ /*mp->mnt_flag |= MNT_LOCAL;*/ mp->mnt_data = (qaddr_t) fmp; vfs_getnewfsid(mp); bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); bcopy("fdesc", mp->mnt_stat.f_mntfromname, sizeof("fdesc")); (void)fdesc_statfs(mp, &mp->mnt_stat, td); return (0); } static int fdesc_unmount(mp, mntflags, td) struct mount 
*mp; int mntflags; struct thread *td; { int error; int flags = 0; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Clear out buffer cache. I don't think we * ever get anything cached at this level at the * moment, but who knows... * * There is 1 extra root vnode reference corresponding * to f_root. */ if ((error = vflush(mp, 1, flags)) != 0) return (error); /* * Finally, throw away the fdescmount structure */ free(mp->mnt_data, M_FDESCMNT); /* XXX */ mp->mnt_data = 0; return (0); } int fdesc_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct thread *td = curthread; /* XXX */ struct vnode *vp; /* * Return locked reference to root. */ vp = VFSTOFDESC(mp)->f_root; VREF(vp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); *vpp = vp; return (0); } static int fdesc_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { struct filedesc *fdp; int lim; int i; int last; int freefd; /* * Compute number of free file descriptors. * [ Strange results will ensue if the open file * limit is ever reduced below the current number * of open files... ] */ lim = td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur; fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); last = min(fdp->fd_nfiles, lim); freefd = 0; for (i = fdp->fd_freefile; i < last; i++) if (fdp->fd_ofiles[i] == NULL) freefd++; /* * Adjust for the fact that the fdesc array may not * have been fully allocated yet. */ if (fdp->fd_nfiles < lim) freefd += (lim - fdp->fd_nfiles); + FILEDESC_UNLOCK(fdp); sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = lim + 1; /* Allow for "." 
*/ sbp->f_ffree = freefd; /* See comments above */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return (0); } static struct vfsops fdesc_vfsops = { fdesc_mount, vfs_stdstart, fdesc_unmount, fdesc_root, vfs_stdquotactl, fdesc_statfs, vfs_stdsync, vfs_stdvget, vfs_stdfhtovp, vfs_stdcheckexp, vfs_stdvptofh, fdesc_init, vfs_stduninit, vfs_stdextattrctl, }; VFS_SET(fdesc_vfsops, fdescfs, VFCF_SYNTHETIC); Index: head/sys/fs/fdescfs/fdesc_vnops.c =================================================================== --- head/sys/fs/fdescfs/fdesc_vnops.c (revision 89305) +++ head/sys/fs/fdescfs/fdesc_vnops.c (revision 89306) @@ -1,563 +1,575 @@ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fdesc_vnops.c 8.9 (Berkeley) 1/21/94 * * $FreeBSD$ */ /* * /dev/fd Filesystem */ #include #include #include #include #include #include /* boottime */ #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #define FDL_WANT 0x01 #define FDL_LOCKED 0x02 static int fdcache_lock; static vop_t **fdesc_vnodeop_p; #define NFDCACHE 4 #define FD_NHASH(ix) \ (&fdhashtbl[(ix) & fdhash]) static LIST_HEAD(fdhashhead, fdescnode) *fdhashtbl; static u_long fdhash; static int fdesc_getattr __P((struct vop_getattr_args *ap)); static int fdesc_inactive __P((struct vop_inactive_args *ap)); static int fdesc_lookup __P((struct vop_lookup_args *ap)); static int fdesc_open __P((struct vop_open_args *ap)); static int fdesc_print __P((struct vop_print_args *ap)); static int fdesc_readdir __P((struct vop_readdir_args *ap)); static int fdesc_reclaim __P((struct vop_reclaim_args *ap)); static int fdesc_poll __P((struct vop_poll_args *ap)); static int fdesc_setattr __P((struct vop_setattr_args *ap)); /* * Initialise cache headers */ int fdesc_init(vfsp) struct vfsconf *vfsp; { fdhashtbl = hashinit(NFDCACHE, M_CACHE, &fdhash); return (0); } int fdesc_allocvp(ftype, ix, mp, vpp, 
td) fdntype ftype; int ix; struct mount *mp; struct vnode **vpp; struct thread *td; { struct fdhashhead *fc; struct fdescnode *fd; int error = 0; fc = FD_NHASH(ix); loop: LIST_FOREACH(fd, fc, fd_hash) { if (fd->fd_ix == ix && fd->fd_vnode->v_mount == mp) { if (vget(fd->fd_vnode, 0, td)) goto loop; *vpp = fd->fd_vnode; return (error); } } /* * otherwise lock the array while we call getnewvnode * since that can block. */ if (fdcache_lock & FDL_LOCKED) { fdcache_lock |= FDL_WANT; (void) tsleep((caddr_t) &fdcache_lock, PINOD, "fdalvp", 0); goto loop; } fdcache_lock |= FDL_LOCKED; /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. */ MALLOC(fd, struct fdescnode *, sizeof(struct fdescnode), M_TEMP, M_WAITOK); error = getnewvnode(VT_FDESC, mp, fdesc_vnodeop_p, vpp); if (error) { FREE(fd, M_TEMP); goto out; } (*vpp)->v_data = fd; fd->fd_vnode = *vpp; fd->fd_type = ftype; fd->fd_fd = -1; fd->fd_ix = ix; LIST_INSERT_HEAD(fc, fd, fd_hash); out: fdcache_lock &= ~FDL_LOCKED; if (fdcache_lock & FDL_WANT) { fdcache_lock &= ~FDL_WANT; wakeup((caddr_t) &fdcache_lock); } return (error); } /* * vp is the current namei directory * ndp is the name to locate in that directory... 
*/ static int fdesc_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; struct componentname *cnp = ap->a_cnp; char *pname = cnp->cn_nameptr; struct thread *td = cnp->cn_thread; + struct file *fp; int nlen = cnp->cn_namelen; - int nfiles = td->td_proc->p_fd->fd_nfiles; u_int fd; int error; struct vnode *fvp; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) { error = EROFS; goto bad; } VOP_UNLOCK(dvp, 0, td); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); vn_lock(dvp, LK_SHARED | LK_RETRY, td); return (0); } if (VTOFDESC(dvp)->fd_type != Froot) { error = ENOTDIR; goto bad; } fd = 0; /* the only time a leading 0 is acceptable is if it's "0" */ if (*pname == '0' && nlen != 1) { error = ENOENT; goto bad; } while (nlen--) { if (*pname < '0' || *pname > '9') { error = ENOENT; goto bad; } fd = 10 * fd + *pname++ - '0'; } - if (fd >= nfiles || td->td_proc->p_fd->fd_ofiles[fd] == NULL) { + fp = ffind_hold(td, fd); + if (fp == NULL) { error = EBADF; goto bad; } error = fdesc_allocvp(Fdesc, FD_DESC+fd, dvp->v_mount, &fvp, td); + fdrop(fp, td); if (error) goto bad; VTOFDESC(fvp)->fd_fd = fd; vn_lock(fvp, LK_SHARED | LK_RETRY, td); *vpp = fvp; return (0); bad: vn_lock(dvp, LK_SHARED | LK_RETRY, td); *vpp = NULL; return (error); } static int fdesc_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; if (VTOFDESC(vp)->fd_type == Froot) return (0); /* * XXX Kludge: set td->td_proc->p_dupfd to contain the value of the the file * descriptor being sought for duplication. The error return ensures * that the vnode for this device will be released by vn_open. Open * will detect this special error and take the actions in dupfdopen. * Other callers of vn_open or VOP_OPEN will simply report the * error. 
*/ ap->a_td->td_dupfd = VTOFDESC(vp)->fd_fd; /* XXX */ return (ENODEV); } static int fdesc_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; - struct filedesc *fdp = ap->a_td->td_proc->p_fd; struct file *fp; struct stat stb; u_int fd; int error = 0; switch (VTOFDESC(vp)->fd_type) { case Froot: VATTR_NULL(vap); vap->va_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH; vap->va_type = VDIR; vap->va_nlink = 2; vap->va_size = DEV_BSIZE; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_uid = 0; vap->va_gid = 0; vap->va_blocksize = DEV_BSIZE; vap->va_atime.tv_sec = boottime.tv_sec; vap->va_atime.tv_nsec = 0; vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; vap->va_bytes = 0; break; case Fdesc: fd = VTOFDESC(vp)->fd_fd; - if (fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) + fp = ffind_hold(ap->a_td, fd); + if (fp == NULL) return (EBADF); bzero(&stb, sizeof(stb)); error = fo_stat(fp, &stb, ap->a_td); + fdrop(fp, ap->a_td); if (error == 0) { VATTR_NULL(vap); vap->va_type = IFTOVT(stb.st_mode); vap->va_mode = stb.st_mode; #define FDRX (VREAD|VEXEC) if (vap->va_type == VDIR) vap->va_mode &= ~((FDRX)|(FDRX>>3)|(FDRX>>6)); #undef FDRX vap->va_nlink = 1; vap->va_flags = 0; vap->va_bytes = stb.st_blocks * stb.st_blksize; vap->va_fileid = VTOFDESC(vp)->fd_ix; vap->va_size = stb.st_size; vap->va_blocksize = stb.st_blksize; vap->va_rdev = stb.st_rdev; /* * If no time data is provided, use the current time. 
*/ if (stb.st_atimespec.tv_sec == 0 && stb.st_atimespec.tv_nsec == 0) nanotime(&stb.st_atimespec); if (stb.st_ctimespec.tv_sec == 0 && stb.st_ctimespec.tv_nsec == 0) nanotime(&stb.st_ctimespec); if (stb.st_mtimespec.tv_sec == 0 && stb.st_mtimespec.tv_nsec == 0) nanotime(&stb.st_mtimespec); vap->va_atime = stb.st_atimespec; vap->va_mtime = stb.st_mtimespec; vap->va_ctime = stb.st_ctimespec; vap->va_uid = stb.st_uid; vap->va_gid = stb.st_gid; } break; default: panic("fdesc_getattr"); break; } if (error == 0) vp->v_type = vap->va_type; return (error); } static int fdesc_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vattr *vap = ap->a_vap; struct vnode *vp; struct mount *mp; struct file *fp; unsigned fd; int error; /* * Can't mess with the root vnode */ if (VTOFDESC(ap->a_vp)->fd_type == Froot) return (EACCES); fd = VTOFDESC(ap->a_vp)->fd_fd; /* * Allow setattr where there is an underlying vnode. */ error = getvnode(ap->a_td->td_proc->p_fd, fd, &fp); if (error) { /* * getvnode() returns EINVAL if the file descriptor is not * backed by a vnode. Silently drop all changes except * chflags(2) in this case. 
*/ if (error == EINVAL) { if (vap->va_flags != VNOVAL) error = EOPNOTSUPP; else error = 0; } return (error); } vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, ap->a_td); return (error); + } error = VOP_SETATTR(vp, ap->a_vap, ap->a_cred, ap->a_td); vn_finished_write(mp); + fdrop(fp, ap->a_td); return (error); } #define UIO_MX 16 static int fdesc_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { struct uio *uio = ap->a_uio; struct filedesc *fdp; struct dirent d; struct dirent *dp = &d; int error, i, off, fcnt; /* * We don't allow exporting fdesc mounts, and currently local * requests do not need cookies. */ if (ap->a_ncookies) panic("fdesc_readdir: not hungry"); if (VTOFDESC(ap->a_vp)->fd_type != Froot) panic("fdesc_readdir: not dir"); off = (int)uio->uio_offset; if (off != uio->uio_offset || off < 0 || (u_int)off % UIO_MX != 0 || uio->uio_resid < UIO_MX) return (EINVAL); i = (u_int)off / UIO_MX; fdp = uio->uio_td->td_proc->p_fd; error = 0; fcnt = i - 2; /* The first two nodes are `.' and `..' */ + FILEDESC_LOCK(fdp); while (i < fdp->fd_nfiles + 2 && uio->uio_resid >= UIO_MX) { switch (i) { case 0: /* `.' */ case 1: /* `..' 
*/ bzero((caddr_t)dp, UIO_MX); dp->d_fileno = i + FD_ROOT; dp->d_namlen = i + 1; dp->d_reclen = UIO_MX; bcopy("..", dp->d_name, dp->d_namlen); dp->d_name[i + 1] = '\0'; dp->d_type = DT_DIR; break; default: - if (fdp->fd_ofiles[fcnt] == NULL) + if (fdp->fd_ofiles[fcnt] == NULL) { + FILEDESC_UNLOCK(fdp); goto done; + } bzero((caddr_t) dp, UIO_MX); dp->d_namlen = sprintf(dp->d_name, "%d", fcnt); dp->d_reclen = UIO_MX; dp->d_type = DT_UNKNOWN; dp->d_fileno = i + FD_DESC; break; } /* * And ship to userland */ + FILEDESC_UNLOCK(fdp); error = uiomove((caddr_t) dp, UIO_MX, uio); if (error) - break; + goto done; + FILEDESC_LOCK(fdp); i++; fcnt++; } + FILEDESC_UNLOCK(fdp); done: uio->uio_offset = i * UIO_MX; return (error); } static int fdesc_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { return seltrue(0, ap->a_events, ap->a_td); } static int fdesc_inactive(ap) struct vop_inactive_args /* { struct vnode *a_vp; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; /* * Clear out the v_type field to avoid * nasty things happening in vgone(). */ VOP_UNLOCK(vp, 0, ap->a_td); vp->v_type = VNON; return (0); } static int fdesc_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct vnode *vp = ap->a_vp; struct fdescnode *fd = VTOFDESC(vp); LIST_REMOVE(fd, fd_hash); FREE(vp->v_data, M_TEMP); vp->v_data = 0; return (0); } /* * Print out the contents of a /dev/fd vnode. 
*/ /* ARGSUSED */ static int fdesc_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_NON, fdesc vnode\n"); return (0); } static struct vnodeopv_entry_desc fdesc_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_null }, { &vop_getattr_desc, (vop_t *) fdesc_getattr }, { &vop_inactive_desc, (vop_t *) fdesc_inactive }, { &vop_lookup_desc, (vop_t *) fdesc_lookup }, { &vop_open_desc, (vop_t *) fdesc_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_poll_desc, (vop_t *) fdesc_poll }, { &vop_print_desc, (vop_t *) fdesc_print }, { &vop_readdir_desc, (vop_t *) fdesc_readdir }, { &vop_reclaim_desc, (vop_t *) fdesc_reclaim }, { &vop_setattr_desc, (vop_t *) fdesc_setattr }, { NULL, NULL } }; static struct vnodeopv_desc fdesc_vnodeop_opv_desc = { &fdesc_vnodeop_p, fdesc_vnodeop_entries }; VNODEOP_SET(fdesc_vnodeop_opv_desc); Index: head/sys/fs/fifofs/fifo_vnops.c =================================================================== --- head/sys/fs/fifofs/fifo_vnops.c (revision 89305) +++ head/sys/fs/fifofs/fifo_vnops.c (revision 89306) @@ -1,601 +1,610 @@ /* * Copyright (c) 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)fifo_vnops.c 8.10 (Berkeley) 5/27/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include /* XXXKSE */ #include #include #include #include #include #include #include #include #include /* * This structure is associated with the FIFO vnode and stores * the state associated with the FIFO. 
*/ struct fifoinfo { struct socket *fi_readsock; struct socket *fi_writesock; long fi_readers; long fi_writers; }; static int fifo_badop __P((void)); static int fifo_print __P((struct vop_print_args *)); static int fifo_lookup __P((struct vop_lookup_args *)); static int fifo_open __P((struct vop_open_args *)); static int fifo_close __P((struct vop_close_args *)); static int fifo_read __P((struct vop_read_args *)); static int fifo_write __P((struct vop_write_args *)); static int fifo_ioctl __P((struct vop_ioctl_args *)); static int fifo_poll __P((struct vop_poll_args *)); static int fifo_kqfilter __P((struct vop_kqfilter_args *)); static int fifo_pathconf __P((struct vop_pathconf_args *)); static int fifo_advlock __P((struct vop_advlock_args *)); static void filt_fifordetach(struct knote *kn); static int filt_fiforead(struct knote *kn, long hint); static void filt_fifowdetach(struct knote *kn); static int filt_fifowrite(struct knote *kn, long hint); static struct filterops fiforead_filtops = { 1, NULL, filt_fifordetach, filt_fiforead }; static struct filterops fifowrite_filtops = { 1, NULL, filt_fifowdetach, filt_fifowrite }; vop_t **fifo_vnodeop_p; static struct vnodeopv_entry_desc fifo_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_ebadf }, { &vop_advlock_desc, (vop_t *) fifo_advlock }, { &vop_close_desc, (vop_t *) fifo_close }, { &vop_create_desc, (vop_t *) fifo_badop }, { &vop_getattr_desc, (vop_t *) vop_ebadf }, { &vop_getwritemount_desc, (vop_t *) vop_stdgetwritemount }, { &vop_ioctl_desc, (vop_t *) fifo_ioctl }, { &vop_kqfilter_desc, (vop_t *) fifo_kqfilter }, { &vop_lease_desc, (vop_t *) vop_null }, { &vop_link_desc, (vop_t *) fifo_badop }, { &vop_lookup_desc, (vop_t *) fifo_lookup }, { &vop_mkdir_desc, (vop_t *) fifo_badop }, { &vop_mknod_desc, (vop_t *) fifo_badop }, { &vop_open_desc, (vop_t *) fifo_open }, { &vop_pathconf_desc, (vop_t *) fifo_pathconf }, { &vop_poll_desc, (vop_t *) fifo_poll }, { 
&vop_print_desc, (vop_t *) fifo_print }, { &vop_read_desc, (vop_t *) fifo_read }, { &vop_readdir_desc, (vop_t *) fifo_badop }, { &vop_readlink_desc, (vop_t *) fifo_badop }, { &vop_reallocblks_desc, (vop_t *) fifo_badop }, { &vop_reclaim_desc, (vop_t *) vop_null }, { &vop_remove_desc, (vop_t *) fifo_badop }, { &vop_rename_desc, (vop_t *) fifo_badop }, { &vop_rmdir_desc, (vop_t *) fifo_badop }, { &vop_setattr_desc, (vop_t *) vop_ebadf }, { &vop_symlink_desc, (vop_t *) fifo_badop }, { &vop_write_desc, (vop_t *) fifo_write }, { NULL, NULL } }; static struct vnodeopv_desc fifo_vnodeop_opv_desc = { &fifo_vnodeop_p, fifo_vnodeop_entries }; VNODEOP_SET(fifo_vnodeop_opv_desc); int fifo_vnoperate(ap) struct vop_generic_args /* { struct vnodeop_desc *a_desc; } */ *ap; { return (VOCALL(fifo_vnodeop_p, ap->a_desc->vdesc_offset, ap)); } /* * Trivial lookup routine that always fails. */ /* ARGSUSED */ static int fifo_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { *ap->a_vpp = NULL; return (ENOTDIR); } /* * Open called to set up a new instance of a fifo or * to find an active instance of a fifo. 
*/ /* ARGSUSED */ static int fifo_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct fifoinfo *fip; struct thread *td = ap->a_td; struct socket *rso, *wso; int error; if ((fip = vp->v_fifoinfo) == NULL) { MALLOC(fip, struct fifoinfo *, sizeof(*fip), M_VNODE, M_WAITOK); vp->v_fifoinfo = fip; error = socreate(AF_LOCAL, &rso, SOCK_STREAM, 0, ap->a_td->td_proc->p_ucred, ap->a_td); if (error) { free(fip, M_VNODE); vp->v_fifoinfo = NULL; return (error); } fip->fi_readsock = rso; error = socreate(AF_LOCAL, &wso, SOCK_STREAM, 0, ap->a_td->td_proc->p_ucred, ap->a_td); if (error) { (void)soclose(rso); free(fip, M_VNODE); vp->v_fifoinfo = NULL; return (error); } fip->fi_writesock = wso; error = unp_connect2(wso, rso); if (error) { (void)soclose(wso); (void)soclose(rso); free(fip, M_VNODE); vp->v_fifoinfo = NULL; return (error); } fip->fi_readers = fip->fi_writers = 0; wso->so_snd.sb_lowat = PIPE_BUF; } if (ap->a_mode & FREAD) { fip->fi_readers++; if (fip->fi_readers == 1) { fip->fi_writesock->so_state &= ~SS_CANTSENDMORE; if (fip->fi_writers > 0) { wakeup((caddr_t)&fip->fi_writers); sowwakeup(fip->fi_writesock); } } } if (ap->a_mode & FWRITE) { fip->fi_writers++; if (fip->fi_writers == 1) { fip->fi_readsock->so_state &= ~SS_CANTRCVMORE; if (fip->fi_readers > 0) { wakeup((caddr_t)&fip->fi_readers); sorwakeup(fip->fi_writesock); } } } if ((ap->a_mode & FREAD) && (ap->a_mode & O_NONBLOCK) == 0) { while (fip->fi_writers == 0) { VOP_UNLOCK(vp, 0, td); error = tsleep((caddr_t)&fip->fi_readers, PCATCH | PSOCK, "fifoor", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (error) goto bad; } } if (ap->a_mode & FWRITE) { if (ap->a_mode & O_NONBLOCK) { if (fip->fi_readers == 0) { error = ENXIO; goto bad; } } else { while (fip->fi_readers == 0) { VOP_UNLOCK(vp, 0, td); error = tsleep((caddr_t)&fip->fi_writers, PCATCH | PSOCK, "fifoow", 0); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if 
(error) goto bad; } } } return (0); bad: VOP_CLOSE(vp, ap->a_mode, ap->a_cred, td); return (error); } /* * Vnode op for read */ /* ARGSUSED */ static int fifo_read(ap) struct vop_read_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct uio *uio = ap->a_uio; struct socket *rso = ap->a_vp->v_fifoinfo->fi_readsock; struct thread *td = uio->uio_td; int error, startresid; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) panic("fifo_read mode"); #endif if (uio->uio_resid == 0) return (0); if (ap->a_ioflag & IO_NDELAY) rso->so_state |= SS_NBIO; startresid = uio->uio_resid; VOP_UNLOCK(ap->a_vp, 0, td); error = soreceive(rso, (struct sockaddr **)0, uio, (struct mbuf **)0, (struct mbuf **)0, (int *)0); /* * Clear EOF indication after first such return. */ if (uio->uio_resid == startresid) rso->so_state &= ~SS_CANTRCVMORE; vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, td); if (ap->a_ioflag & IO_NDELAY) rso->so_state &= ~SS_NBIO; return (error); } /* * Vnode op for write */ /* ARGSUSED */ static int fifo_write(ap) struct vop_write_args /* { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; } */ *ap; { struct socket *wso = ap->a_vp->v_fifoinfo->fi_writesock; struct thread *td = ap->a_uio->uio_td; int error; #ifdef DIAGNOSTIC if (ap->a_uio->uio_rw != UIO_WRITE) panic("fifo_write mode"); #endif if (ap->a_ioflag & IO_NDELAY) wso->so_state |= SS_NBIO; VOP_UNLOCK(ap->a_vp, 0, td); error = sosend(wso, (struct sockaddr *)0, ap->a_uio, 0, (struct mbuf *)0, 0, td); vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY, td); if (ap->a_ioflag & IO_NDELAY) wso->so_state &= ~SS_NBIO; return (error); } /* * Device ioctl operation. 
*/ /* ARGSUSED */ static int fifo_ioctl(ap) struct vop_ioctl_args /* { struct vnode *a_vp; int a_command; caddr_t a_data; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct file filetmp; - int error; + int error = 0; if (ap->a_command == FIONBIO) return (0); + mtx_init(&filetmp.f_mtx, "struct file", MTX_DEF); + filetmp.f_count = 1; if (ap->a_fflag & FREAD) { + /* filetmp is local, hence need not be locked. */ filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_td); if (error) - return (error); + goto err; } if (ap->a_fflag & FWRITE) { + /* filetmp is local, hence need not be locked. */ filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; error = soo_ioctl(&filetmp, ap->a_command, ap->a_data, ap->a_td); if (error) - return (error); + goto err; } - return (0); +err: + mtx_destroy(&filetmp.f_mtx); + return (error); } /* ARGSUSED */ static int fifo_kqfilter(ap) struct vop_kqfilter_args /* { struct vnode *a_vp; struct knote *a_kn; } */ *ap; { struct fifoinfo *fi = ap->a_vp->v_fifoinfo; struct socket *so; struct sockbuf *sb; switch (ap->a_kn->kn_filter) { case EVFILT_READ: ap->a_kn->kn_fop = &fiforead_filtops; so = fi->fi_readsock; sb = &so->so_rcv; break; case EVFILT_WRITE: ap->a_kn->kn_fop = &fifowrite_filtops; so = fi->fi_writesock; sb = &so->so_snd; break; default: return (1); } ap->a_kn->kn_hook = (caddr_t)so; SLIST_INSERT_HEAD(&sb->sb_sel.si_note, ap->a_kn, kn_selnext); sb->sb_flags |= SB_KNOTE; return (0); } static void filt_fifordetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SLIST_REMOVE(&so->so_rcv.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_rcv.sb_sel.si_note)) so->so_rcv.sb_flags &= ~SB_KNOTE; } static int filt_fiforead(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; kn->kn_data = so->so_rcv.sb_cc; if (so->so_state & SS_CANTRCVMORE) { kn->kn_flags |= EV_EOF; return (1); } 
kn->kn_flags &= ~EV_EOF; return (kn->kn_data > 0); } static void filt_fifowdetach(struct knote *kn) { struct socket *so = (struct socket *)kn->kn_hook; SLIST_REMOVE(&so->so_snd.sb_sel.si_note, kn, knote, kn_selnext); if (SLIST_EMPTY(&so->so_snd.sb_sel.si_note)) so->so_snd.sb_flags &= ~SB_KNOTE; } static int filt_fifowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_hook; kn->kn_data = sbspace(&so->so_snd); if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; return (1); } kn->kn_flags &= ~EV_EOF; return (kn->kn_data >= so->so_snd.sb_lowat); } /* ARGSUSED */ static int fifo_poll(ap) struct vop_poll_args /* { struct vnode *a_vp; int a_events; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct file filetmp; int revents = 0; + mtx_init(&filetmp.f_mtx, "struct file", MTX_DEF); + filetmp.f_count = 1; if (ap->a_events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_readsock; if (filetmp.f_data) revents |= soo_poll(&filetmp, ap->a_events, ap->a_cred, ap->a_td); } if (ap->a_events & (POLLOUT | POLLWRNORM | POLLWRBAND)) { filetmp.f_data = (caddr_t)ap->a_vp->v_fifoinfo->fi_writesock; if (filetmp.f_data) revents |= soo_poll(&filetmp, ap->a_events, ap->a_cred, ap->a_td); } + mtx_destroy(&filetmp.f_mtx); return (revents); } /* * Device close routine */ /* ARGSUSED */ static int fifo_close(ap) struct vop_close_args /* { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; } */ *ap; { register struct vnode *vp = ap->a_vp; register struct fifoinfo *fip = vp->v_fifoinfo; int error1, error2; if (ap->a_fflag & FREAD) { fip->fi_readers--; if (fip->fi_readers == 0) socantsendmore(fip->fi_writesock); } if (ap->a_fflag & FWRITE) { fip->fi_writers--; if (fip->fi_writers == 0) socantrcvmore(fip->fi_readsock); } if (vp->v_usecount > 1) return (0); error1 = soclose(fip->fi_readsock); error2 = soclose(fip->fi_writesock); FREE(fip, M_VNODE); vp->v_fifoinfo = NULL; if 
(error1) return (error1); return (error2); } /* * Print out internal contents of a fifo vnode. */ int fifo_printinfo(vp) struct vnode *vp; { register struct fifoinfo *fip = vp->v_fifoinfo; printf(", fifo with %ld readers and %ld writers", fip->fi_readers, fip->fi_writers); return (0); } /* * Print out the contents of a fifo vnode. */ static int fifo_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_NON"); fifo_printinfo(ap->a_vp); printf("\n"); return (0); } /* * Return POSIX pathconf information applicable to fifo's. */ int fifo_pathconf(ap) struct vop_pathconf_args /* { struct vnode *a_vp; int a_name; int *a_retval; } */ *ap; { switch (ap->a_name) { case _PC_LINK_MAX: *ap->a_retval = LINK_MAX; return (0); case _PC_PIPE_BUF: *ap->a_retval = PIPE_BUF; return (0); case _PC_CHOWN_RESTRICTED: *ap->a_retval = 1; return (0); default: return (EINVAL); } /* NOTREACHED */ } /* * Fifo advisory byte-level locks. */ /* ARGSUSED */ static int fifo_advlock(ap) struct vop_advlock_args /* { struct vnode *a_vp; caddr_t a_id; int a_op; struct flock *a_fl; int a_flags; } */ *ap; { return (ap->a_flags & F_FLOCK ? EOPNOTSUPP : EINVAL); } /* * Fifo bad operation */ static int fifo_badop() { panic("fifo_badop called"); /* NOTREACHED */ } Index: head/sys/fs/portalfs/portal_vfsops.c =================================================================== --- head/sys/fs/portalfs/portal_vfsops.c (revision 89305) +++ head/sys/fs/portalfs/portal_vfsops.c (revision 89306) @@ -1,260 +1,262 @@ /* * Copyright (c) 1992, 1993, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)portal_vfsops.c 8.11 (Berkeley) 5/14/95 * * $FreeBSD$ */ /* * Portal Filesystem */ #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include static MALLOC_DEFINE(M_PORTALFSMNT, "PORTAL mount", "PORTAL mount structure"); static int portal_mount __P((struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct thread *td)); static int portal_unmount __P((struct mount *mp, int mntflags, struct thread *td)); static int portal_root __P((struct mount *mp, struct vnode **vpp)); static int portal_statfs __P((struct mount *mp, struct statfs *sbp, struct thread *td)); /* * Mount the per-process file descriptors (/dev/fd) */ static int portal_mount(mp, path, data, ndp, td) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct thread *td; { struct file *fp; struct portal_args args; struct portalmount *fmp; struct socket *so; struct vnode *rvp; struct portalnode *pn; u_int size; int error; /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) return (EOPNOTSUPP); error = copyin(data, (caddr_t) &args, sizeof(struct portal_args)); if (error) return (error); if ((error = fget(td, args.pa_socket, &fp)) != 0) return (error); if (fp->f_type != DTYPE_SOCKET) { fdrop(fp, td); return(ENOTSOCK); } so = (struct socket *) fp->f_data; /* XXX race against userland */ if (so->so_proto->pr_domain->dom_family != AF_UNIX) { fdrop(fp, td); return (ESOCKTNOSUPPORT); } MALLOC(pn, struct portalnode *, sizeof(struct portalnode), M_TEMP, M_WAITOK); MALLOC(fmp, struct portalmount *, sizeof(struct portalmount), M_PORTALFSMNT, M_WAITOK); /* XXX */ error = getnewvnode(VT_PORTAL, mp, portal_vnodeop_p, &rvp); /* XXX */ if (error) { FREE(fmp, M_PORTALFSMNT); FREE(pn, M_TEMP); fdrop(fp, td); return (error); } rvp->v_data = pn; rvp->v_type = VDIR; rvp->v_flag |= VROOT; VTOPORTAL(rvp)->pt_arg = 0; VTOPORTAL(rvp)->pt_size = 0; VTOPORTAL(rvp)->pt_fileid = 
PORTAL_ROOTFILEID; fmp->pm_root = rvp; - fmp->pm_server = fp; fp->f_count++; + fhold(fp); + fmp->pm_server = fp; mp->mnt_flag |= MNT_LOCAL; mp->mnt_data = (qaddr_t) fmp; vfs_getnewfsid(mp); (void)copyinstr(args.pa_config, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); #ifdef notdef bzero(mp->mnt_stat.f_mntfromname, MNAMELEN); bcopy("portal", mp->mnt_stat.f_mntfromname, sizeof("portal")); #endif (void)portal_statfs(mp, &mp->mnt_stat, td); fdrop(fp, td); return (0); } static int portal_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { int error, flags = 0; + struct socket *so; if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Clear out buffer cache. I don't think we * ever get anything cached at this level at the * moment, but who knows... */ #ifdef notyet mntflushbuf(mp, 0); if (mntinvalbuf(mp, 1)) return (EBUSY); #endif /* There is 1 extra root vnode reference (pm_root). */ error = vflush(mp, 1, flags); if (error) return (error); /* * Shutdown the socket. This will cause the select in the * daemon to wake up, and then the accept will get ECONNABORTED * which it interprets as a request to go and bury itself. */ soshutdown((struct socket *) VFSTOPORTAL(mp)->pm_server->f_data, 2); /* * Discard reference to underlying file. Must call closef because * this may be the last reference. */ closef(VFSTOPORTAL(mp)->pm_server, (struct thread *) 0); /* * Finally, throw away the portalmount structure */ free(mp->mnt_data, M_PORTALFSMNT); /* XXX */ mp->mnt_data = 0; return (0); } static int portal_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct thread *td = curthread; /* XXX */ struct vnode *vp; /* * Return locked reference to root. 
*/ vp = VFSTOPORTAL(mp)->pm_root; VREF(vp); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); *vpp = vp; return (0); } static int portal_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { sbp->f_flags = 0; sbp->f_bsize = DEV_BSIZE; sbp->f_iosize = DEV_BSIZE; sbp->f_blocks = 2; /* 1K to keep df happy */ sbp->f_bfree = 0; sbp->f_bavail = 0; sbp->f_files = 1; /* Allow for "." */ sbp->f_ffree = 0; /* See comments above */ if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return (0); } static struct vfsops portal_vfsops = { portal_mount, vfs_stdstart, portal_unmount, portal_root, vfs_stdquotactl, portal_statfs, vfs_stdsync, vfs_stdvget, vfs_stdfhtovp, vfs_stdcheckexp, vfs_stdvptofh, vfs_stdinit, vfs_stduninit, vfs_stdextattrctl, }; VFS_SET(portal_vfsops, portalfs, VFCF_SYNTHETIC); Index: head/sys/fs/portalfs/portal_vnops.c =================================================================== --- head/sys/fs/portalfs/portal_vnops.c (revision 89305) +++ head/sys/fs/portalfs/portal_vnops.c (revision 89306) @@ -1,579 +1,585 @@ /* * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)portal_vnops.c 8.14 (Berkeley) 5/21/95 * * $FreeBSD$ */ /* * Portal Filesystem */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int portal_fileid = PORTAL_ROOTFILEID+1; static void portal_closefd __P((struct thread *td, int fd)); static int portal_connect __P((struct socket *so, struct socket *so2)); static int portal_getattr __P((struct vop_getattr_args *ap)); static int portal_lookup __P((struct vop_lookup_args *ap)); static int portal_open __P((struct vop_open_args *ap)); static int portal_print __P((struct vop_print_args *ap)); static int portal_readdir __P((struct vop_readdir_args *ap)); static int portal_reclaim __P((struct vop_reclaim_args *ap)); static int portal_setattr __P((struct vop_setattr_args *ap)); static void portal_closefd(td, fd) struct thread *td; int fd; { int error; struct close_args ua; ua.fd = fd; error = close(td, &ua); /* * We should never get an error, and there isn't anything * we could do if we got one, so just print a message. */ if (error) printf("portal_closefd: error = %d\n", error); } /* * vp is the current namei directory * cnp is the name to locate in that directory... */ static int portal_lookup(ap) struct vop_lookup_args /* { struct vnode * a_dvp; struct vnode ** a_vpp; struct componentname * a_cnp; } */ *ap; { struct componentname *cnp = ap->a_cnp; struct vnode **vpp = ap->a_vpp; struct vnode *dvp = ap->a_dvp; char *pname = cnp->cn_nameptr; struct portalnode *pt; int error; struct vnode *fvp = 0; char *path; int size; *vpp = NULLVP; if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) return (EROFS); if (cnp->cn_namelen == 1 && *pname == '.') { *vpp = dvp; VREF(dvp); /*VOP_LOCK(dvp);*/ return (0); } /* * Do the MALLOC before the getnewvnode since doing so afterward * might cause a bogus v_data pointer to get dereferenced * elsewhere if MALLOC should block. 
*/ MALLOC(pt, struct portalnode *, sizeof(struct portalnode), M_TEMP, M_WAITOK); error = getnewvnode(VT_PORTAL, dvp->v_mount, portal_vnodeop_p, &fvp); if (error) { FREE(pt, M_TEMP); goto bad; } fvp->v_type = VREG; fvp->v_data = pt; /* * Save all of the remaining pathname and * advance the namei next pointer to the end * of the string. */ for (size = 0, path = pname; *path; path++) size++; cnp->cn_consume = size - cnp->cn_namelen; pt->pt_arg = malloc(size+1, M_TEMP, M_WAITOK); pt->pt_size = size+1; bcopy(pname, pt->pt_arg, pt->pt_size); pt->pt_fileid = portal_fileid++; *vpp = fvp; /*VOP_LOCK(fvp);*/ return (0); bad:; if (fvp) vrele(fvp); return (error); } static int portal_connect(so, so2) struct socket *so; struct socket *so2; { /* from unp_connect, bypassing the namei stuff... */ struct socket *so3; struct unpcb *unp2; struct unpcb *unp3; if (so2 == 0) return (ECONNREFUSED); if (so->so_type != so2->so_type) return (EPROTOTYPE); if ((so2->so_options & SO_ACCEPTCONN) == 0) return (ECONNREFUSED); if ((so3 = sonewconn(so2, 0)) == 0) return (ECONNREFUSED); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) dup_sockaddr((struct sockaddr *)unp2->unp_addr, 0); so2 = so3; return (unp_connect2(so, so2)); } static int portal_open(ap) struct vop_open_args /* { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct socket *so = 0; struct portalnode *pt; struct thread *td = ap->a_td; struct vnode *vp = ap->a_vp; int s; struct uio auio; struct iovec aiov[2]; int res; struct mbuf *cm = 0; struct cmsghdr *cmsg; int newfds; int *ip; int fd; int error; int len; struct portalmount *fmp; struct file *fp; struct portal_cred pcred; /* * Nothing to do when opening the root node. */ if (vp->v_flag & VROOT) return (0); /* * Can't be opened unless the caller is set up * to deal with the side effects. Check for this * by testing whether the p_dupfd has been set. 
*/ if (td->td_dupfd >= 0) return (ENODEV); pt = VTOPORTAL(vp); fmp = VFSTOPORTAL(vp->v_mount); /* * Create a new socket. */ error = socreate(AF_UNIX, &so, SOCK_STREAM, 0, ap->a_td->td_proc->p_ucred, ap->a_td); if (error) goto bad; /* * Reserve some buffer space */ res = pt->pt_size + sizeof(pcred) + 512; /* XXX */ error = soreserve(so, res, res); if (error) goto bad; /* * Kick off connection */ error = portal_connect(so, (struct socket *)fmp->pm_server->f_data); if (error) goto bad; /* * Wait for connection to complete */ /* * XXX: Since the mount point is holding a reference on the * underlying server socket, it is not easy to find out whether * the server process is still running. To handle this problem * we loop waiting for the new socket to be connected (something * which will only happen if the server is still running) or for * the reference count on the server socket to drop to 1, which * will happen if the server dies. Sleep for 5 second intervals * and keep polling the reference count. XXX. 
*/ s = splnet(); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { if (fmp->pm_server->f_count == 1) { error = ECONNREFUSED; splx(s); goto bad; } (void) tsleep((caddr_t) &so->so_timeo, PSOCK, "portalcon", 5 * hz); } splx(s); if (so->so_error) { error = so->so_error; goto bad; } /* * Set miscellaneous flags */ so->so_rcv.sb_timeo = 0; so->so_snd.sb_timeo = 0; so->so_rcv.sb_flags |= SB_NOINTR; so->so_snd.sb_flags |= SB_NOINTR; pcred.pcr_flag = ap->a_mode; pcred.pcr_uid = ap->a_cred->cr_uid; pcred.pcr_ngroups = ap->a_cred->cr_ngroups; bcopy(ap->a_cred->cr_groups, pcred.pcr_groups, NGROUPS * sizeof(gid_t)); aiov[0].iov_base = (caddr_t) &pcred; aiov[0].iov_len = sizeof(pcred); aiov[1].iov_base = pt->pt_arg; aiov[1].iov_len = pt->pt_size; auio.uio_iov = aiov; auio.uio_iovcnt = 2; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = aiov[0].iov_len + aiov[1].iov_len; error = sosend(so, (struct sockaddr *) 0, &auio, (struct mbuf *) 0, (struct mbuf *) 0, 0 , td); if (error) goto bad; len = auio.uio_resid = sizeof(int); do { struct mbuf *m = 0; int flags = MSG_WAITALL; error = soreceive(so, (struct sockaddr **) 0, &auio, &m, &cm, &flags); if (error) goto bad; /* * Grab an error code from the mbuf. */ if (m) { m = m_pullup(m, sizeof(int)); /* Needed? */ if (m) { error = *(mtod(m, int *)); m_freem(m); } else { error = EINVAL; } } else { if (cm == 0) { error = ECONNRESET; /* XXX */ #ifdef notdef break; #endif } } } while (cm == 0 && auio.uio_resid == len && !error); if (cm == 0) goto bad; if (auio.uio_resid) { error = 0; #ifdef notdef error = EMSGSIZE; goto bad; #endif } /* * XXX: Break apart the control message, and retrieve the * received file descriptor. Note that more than one descriptor * may have been received, or that the rights chain may have more * than a single mbuf in it. What to do? 
*/ cmsg = mtod(cm, struct cmsghdr *); newfds = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof (int); if (newfds == 0) { error = ECONNREFUSED; goto bad; } /* * At this point the rights message consists of a control message * header, followed by a data region containing a vector of * integer file descriptors. The fds were allocated by the action * of receiving the control message. */ ip = (int *) (cmsg + 1); fd = *ip++; if (newfds > 1) { /* * Close extra fds. */ int i; printf("portal_open: %d extra fds\n", newfds - 1); for (i = 1; i < newfds; i++) { portal_closefd(td, *ip); ip++; } } /* * Check that the mode the file is being opened for is a subset * of the mode of the existing descriptor. */ - fp = td->td_proc->p_fd->fd_ofiles[fd]; + fp = ffind_hold(td, fd); + if (fp == NULL) { + error = EBADF; + goto bad; + } if (((ap->a_mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) { + fdrop(fp, td); portal_closefd(td, fd); error = EACCES; goto bad; } + fdrop(fp, td); /* * Save the dup fd in the proc structure then return the * special error code (ENXIO) which causes magic things to * happen in vn_open. The whole concept is, well, hmmm. */ td->td_dupfd = fd; error = ENXIO; bad:; /* * And discard the control message. 
*/ if (cm) { m_freem(cm); } if (so) { soshutdown(so, 2); soclose(so); } return (error); } static int portal_getattr(ap) struct vop_getattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { struct vnode *vp = ap->a_vp; struct vattr *vap = ap->a_vap; bzero(vap, sizeof(*vap)); vattr_null(vap); vap->va_uid = 0; vap->va_gid = 0; vap->va_size = DEV_BSIZE; vap->va_blocksize = DEV_BSIZE; nanotime(&vap->va_atime); vap->va_mtime = vap->va_atime; vap->va_ctime = vap->va_mtime; vap->va_gen = 0; vap->va_flags = 0; vap->va_rdev = 0; /* vap->va_qbytes = 0; */ vap->va_bytes = 0; /* vap->va_qsize = 0; */ if (vp->v_flag & VROOT) { vap->va_type = VDIR; vap->va_mode = S_IRUSR|S_IWUSR|S_IXUSR| S_IRGRP|S_IWGRP|S_IXGRP| S_IROTH|S_IWOTH|S_IXOTH; vap->va_nlink = 2; vap->va_fileid = 2; } else { vap->va_type = VREG; vap->va_mode = S_IRUSR|S_IWUSR| S_IRGRP|S_IWGRP| S_IROTH|S_IWOTH; vap->va_nlink = 1; vap->va_fileid = VTOPORTAL(vp)->pt_fileid; } return (0); } static int portal_setattr(ap) struct vop_setattr_args /* { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; struct thread *a_td; } */ *ap; { /* * Can't mess with the root vnode */ if (ap->a_vp->v_flag & VROOT) return (EACCES); if (ap->a_vap->va_flags != VNOVAL) return (EOPNOTSUPP); return (0); } /* * Fake readdir, just return empty directory. * It is hard to deal with '.' and '..' so don't bother. */ static int portal_readdir(ap) struct vop_readdir_args /* { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; u_long *a_cookies; int a_ncookies; } */ *ap; { /* * We don't allow exporting portal mounts, and currently local * requests do not need cookies. 
*/ if (ap->a_ncookies) panic("portal_readdir: not hungry"); return (0); } static int portal_reclaim(ap) struct vop_reclaim_args /* { struct vnode *a_vp; } */ *ap; { struct portalnode *pt = VTOPORTAL(ap->a_vp); if (pt->pt_arg) { free((caddr_t) pt->pt_arg, M_TEMP); pt->pt_arg = 0; } FREE(ap->a_vp->v_data, M_TEMP); ap->a_vp->v_data = 0; return (0); } /* * Print out the contents of a Portal vnode. */ /* ARGSUSED */ static int portal_print(ap) struct vop_print_args /* { struct vnode *a_vp; } */ *ap; { printf("tag VT_PORTAL, portal vnode\n"); return (0); } vop_t **portal_vnodeop_p; static struct vnodeopv_entry_desc portal_vnodeop_entries[] = { { &vop_default_desc, (vop_t *) vop_defaultop }, { &vop_access_desc, (vop_t *) vop_null }, { &vop_getattr_desc, (vop_t *) portal_getattr }, { &vop_lookup_desc, (vop_t *) portal_lookup }, { &vop_open_desc, (vop_t *) portal_open }, { &vop_pathconf_desc, (vop_t *) vop_stdpathconf }, { &vop_print_desc, (vop_t *) portal_print }, { &vop_readdir_desc, (vop_t *) portal_readdir }, { &vop_reclaim_desc, (vop_t *) portal_reclaim }, { &vop_setattr_desc, (vop_t *) portal_setattr }, { NULL, NULL } }; static struct vnodeopv_desc portal_vnodeop_opv_desc = { &portal_vnodeop_p, portal_vnodeop_entries }; VNODEOP_SET(portal_vnodeop_opv_desc); Index: head/sys/fs/unionfs/union_subr.c =================================================================== --- head/sys/fs/unionfs/union_subr.c (revision 89305) +++ head/sys/fs/unionfs/union_subr.c (revision 89306) @@ -1,1360 +1,1365 @@ /* * Copyright (c) 1994 Jan-Simon Pendry * Copyright (c) 1994 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Jan-Simon Pendry. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)union_subr.c 8.20 (Berkeley) 5/20/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* for vnode_pager_setsize */ #include #include /* for vm cache coherency */ #include #include extern int union_init __P((void)); /* must be power of two, otherwise change UNION_HASH() */ #define NHASH 32 /* unsigned int ... 
*/ #define UNION_HASH(u, l) \ (((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1)) static LIST_HEAD(unhead, union_node) unhead[NHASH]; static int unvplock[NHASH]; static void union_dircache_r __P((struct vnode *vp, struct vnode ***vppp, int *cntp)); static int union_list_lock __P((int ix)); static void union_list_unlock __P((int ix)); static int union_relookup __P((struct union_mount *um, struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, struct componentname *cn, char *path, int pathlen)); static void union_updatevp __P((struct union_node *un, struct vnode *uppervp, struct vnode *lowervp)); static void union_newlower __P((struct union_node *, struct vnode *)); static void union_newupper __P((struct union_node *, struct vnode *)); static int union_copyfile __P((struct vnode *, struct vnode *, struct ucred *, struct thread *)); static int union_vn_create __P((struct vnode **, struct union_node *, struct thread *)); static int union_vn_close __P((struct vnode *, int, struct ucred *, struct thread *)); int union_init() { int i; for (i = 0; i < NHASH; i++) LIST_INIT(&unhead[i]); bzero((caddr_t)unvplock, sizeof(unvplock)); return (0); } static int union_list_lock(ix) int ix; { if (unvplock[ix] & UNVP_LOCKED) { unvplock[ix] |= UNVP_WANT; (void) tsleep((caddr_t) &unvplock[ix], PINOD, "unllck", 0); return (1); } unvplock[ix] |= UNVP_LOCKED; return (0); } static void union_list_unlock(ix) int ix; { unvplock[ix] &= ~UNVP_LOCKED; if (unvplock[ix] & UNVP_WANT) { unvplock[ix] &= ~UNVP_WANT; wakeup((caddr_t) &unvplock[ix]); } } /* * union_updatevp: * * The uppervp, if not NULL, must be referenced and not locked by us * The lowervp, if not NULL, must be referenced. * * if uppervp and lowervp match pointers already installed, nothing * happens. The passed vp's (when matching) are not adjusted. This * routine may only be called by union_newupper() and union_newlower(). 
*/ static void union_updatevp(un, uppervp, lowervp) struct union_node *un; struct vnode *uppervp; struct vnode *lowervp; { int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp); int nhash = UNION_HASH(uppervp, lowervp); int docache = (lowervp != NULLVP || uppervp != NULLVP); int lhash, uhash; /* * Ensure locking is ordered from lower to higher * to avoid deadlocks. */ if (nhash < ohash) { lhash = nhash; uhash = ohash; } else { lhash = ohash; uhash = nhash; } if (lhash != uhash) { while (union_list_lock(lhash)) continue; } while (union_list_lock(uhash)) continue; if (ohash != nhash || !docache) { if (un->un_flags & UN_CACHED) { un->un_flags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } } if (ohash != nhash) union_list_unlock(ohash); if (un->un_lowervp != lowervp) { if (un->un_lowervp) { vrele(un->un_lowervp); if (un->un_path) { free(un->un_path, M_TEMP); un->un_path = 0; } } un->un_lowervp = lowervp; un->un_lowersz = VNOVAL; } if (un->un_uppervp != uppervp) { if (un->un_uppervp) vrele(un->un_uppervp); un->un_uppervp = uppervp; un->un_uppersz = VNOVAL; } if (docache && (ohash != nhash)) { LIST_INSERT_HEAD(&unhead[nhash], un, un_cache); un->un_flags |= UN_CACHED; } union_list_unlock(nhash); } /* * Set a new lowervp. The passed lowervp must be referenced and will be * stored in the vp in a referenced state. */ static void union_newlower(un, lowervp) struct union_node *un; struct vnode *lowervp; { union_updatevp(un, un->un_uppervp, lowervp); } /* * Set a new uppervp. The passed uppervp must be locked and will be * stored in the vp in a locked state. The caller should not unlock * uppervp. */ static void union_newupper(un, uppervp) struct union_node *un; struct vnode *uppervp; { union_updatevp(un, uppervp, un->un_lowervp); } /* * Keep track of size changes in the underlying vnodes. * If the size changes, then callback to the vm layer * giving priority to the upper layer size. 
*/ void union_newsize(vp, uppersz, lowersz) struct vnode *vp; off_t uppersz, lowersz; { struct union_node *un; off_t sz; /* only interested in regular files */ if (vp->v_type != VREG) return; un = VTOUNION(vp); sz = VNOVAL; if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) { un->un_uppersz = uppersz; if (sz == VNOVAL) sz = un->un_uppersz; } if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) { un->un_lowersz = lowersz; if (sz == VNOVAL) sz = un->un_lowersz; } if (sz != VNOVAL) { UDEBUG(("union: %s size now %ld\n", (uppersz != VNOVAL ? "upper" : "lower"), (long)sz)); /* * There is no need to change size of non-existent object */ /* vnode_pager_setsize(vp, sz); */ } } /* * union_allocvp: allocate a union_node and associate it with a * parent union_node and one or two vnodes. * * vpp Holds the returned vnode locked and referenced if no * error occurs. * * mp Holds the mount point. mp may or may not be busied. * allocvp makes no changes to mp. * * dvp Holds the parent union_node to the one we wish to create. * XXX may only be used to traverse an uncopied lowervp-based * tree? XXX * * dvp may or may not be locked. allocvp makes no changes * to dvp. * * upperdvp Holds the parent vnode to uppervp, generally used along * with path component information to create a shadow of * lowervp when uppervp does not exist. * * upperdvp is referenced but unlocked on entry, and will be * dereferenced on return. * * uppervp Holds the new uppervp vnode to be stored in the * union_node we are allocating. uppervp is referenced but * not locked, and will be dereferenced on return. * * lowervp Holds the new lowervp vnode to be stored in the * union_node we are allocating. lowervp is referenced but * not locked, and will be dereferenced on return. * * cnp Holds path component information to be coupled with * lowervp and upperdvp to allow unionfs to create an uppervp * later on. Only used if lowervp is valid. The conents * of cnp is only valid for the duration of the call. 
* * docache Determine whether this node should be entered in the * cache or whether it should be destroyed as soon as possible. * * all union_nodes are maintained on a singly-linked * list. new nodes are only allocated when they cannot * be found on this list. entries on the list are * removed when the vfs reclaim entry is called. * * a single lock is kept for the entire list. this is * needed because the getnewvnode() function can block * waiting for a vnode to become free, in which case there * may be more than one process trying to get the same * vnode. this lock is only taken if we are going to * call getnewvnode, since the kernel itself is single-threaded. * * if an entry is found on the list, then call vget() to * take a reference. this is done because there may be * zero references to it and so it needs to removed from * the vnode free list. */ int union_allocvp(vpp, mp, dvp, upperdvp, cnp, uppervp, lowervp, docache) struct vnode **vpp; struct mount *mp; struct vnode *dvp; /* parent union vnode */ struct vnode *upperdvp; /* parent vnode of uppervp */ struct componentname *cnp; /* may be null */ struct vnode *uppervp; /* may be null */ struct vnode *lowervp; /* may be null */ int docache; { int error; struct union_node *un = 0; struct union_mount *um = MOUNTTOUNIONMOUNT(mp); struct thread *td = (cnp) ? 
cnp->cn_thread : curthread; int hash = 0; int vflag; int try; if (uppervp == NULLVP && lowervp == NULLVP) panic("union: unidentifiable allocation"); if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) { vrele(lowervp); lowervp = NULLVP; } /* detect the root vnode (and aliases) */ vflag = 0; if ((uppervp == um->um_uppervp) && ((lowervp == NULLVP) || lowervp == um->um_lowervp)) { if (lowervp == NULLVP) { lowervp = um->um_lowervp; if (lowervp != NULLVP) VREF(lowervp); } vflag = VROOT; } loop: if (!docache) { un = 0; } else for (try = 0; try < 3; try++) { switch (try) { case 0: if (lowervp == NULLVP) continue; hash = UNION_HASH(uppervp, lowervp); break; case 1: if (uppervp == NULLVP) continue; hash = UNION_HASH(uppervp, NULLVP); break; case 2: if (lowervp == NULLVP) continue; hash = UNION_HASH(NULLVP, lowervp); break; } while (union_list_lock(hash)) continue; LIST_FOREACH(un, &unhead[hash], un_cache) { if ((un->un_lowervp == lowervp || un->un_lowervp == NULLVP) && (un->un_uppervp == uppervp || un->un_uppervp == NULLVP) && (UNIONTOV(un)->v_mount == mp)) { if (vget(UNIONTOV(un), 0, cnp ? cnp->cn_thread : NULL)) { union_list_unlock(hash); goto loop; } break; } } union_list_unlock(hash); if (un) break; } if (un) { /* * Obtain a lock on the union_node. Everything is unlocked * except for dvp, so check that case. If they match, our * new un is already locked. Otherwise we have to lock our * new un. * * A potential deadlock situation occurs when we are holding * one lock while trying to get another. We must follow * strict ordering rules to avoid it. We try to locate dvp * by scanning up from un_vnode, since the most likely * scenario is un being under dvp. */ if (dvp && un->un_vnode != dvp) { struct vnode *scan = un->un_vnode; do { scan = VTOUNION(scan)->un_pvp; } while (scan && scan->v_tag == VT_UNION && scan != dvp); if (scan != dvp) { /* * our new un is above dvp (we never saw dvp * while moving up the tree). 
*/ VREF(dvp); VOP_UNLOCK(dvp, 0, td); error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td); vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td); vrele(dvp); } else { /* * our new un is under dvp */ error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td); } } else if (dvp == NULLVP) { /* * dvp is NULL, we need to lock un. */ error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td); } else { /* * dvp == un->un_vnode, we are already locked. */ error = 0; } if (error) goto loop; /* * At this point, the union_node is locked and referenced. * * uppervp is locked and referenced or NULL, lowervp is * referenced or NULL. */ UDEBUG(("Modify existing un %p vn %p upper %p(refs %d) -> %p(refs %d)\n", un, un->un_vnode, un->un_uppervp, (un->un_uppervp ? un->un_uppervp->v_usecount : -99), uppervp, (uppervp ? uppervp->v_usecount : -99) )); if (uppervp != un->un_uppervp) { KASSERT(uppervp == NULL || uppervp->v_usecount > 0, ("union_allocvp: too few refs %d (at least 1 required) on uppervp", uppervp->v_usecount)); union_newupper(un, uppervp); } else if (uppervp) { KASSERT(uppervp->v_usecount > 1, ("union_allocvp: too few refs %d (at least 2 required) on uppervp", uppervp->v_usecount)); vrele(uppervp); } /* * Save information about the lower layer. * This needs to keep track of pathname * and directory information which union_vn_create * might need. */ if (lowervp != un->un_lowervp) { union_newlower(un, lowervp); if (cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; } } else if (lowervp) { vrele(lowervp); } /* * and upperdvp */ if (upperdvp != un->un_dirvp) { if (un->un_dirvp) vrele(un->un_dirvp); un->un_dirvp = upperdvp; } else if (upperdvp) { vrele(upperdvp); } *vpp = UNIONTOV(un); return (0); } if (docache) { /* * otherwise lock the vp list while we call getnewvnode * since that can block. 
*/ hash = UNION_HASH(uppervp, lowervp); if (union_list_lock(hash)) goto loop; } /* * Create new node rather then replace old node */ error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp); if (error) { /* * If an error occurs clear out vnodes. */ if (lowervp) vrele(lowervp); if (uppervp) vrele(uppervp); if (upperdvp) vrele(upperdvp); *vpp = NULL; goto out; } MALLOC((*vpp)->v_data, void *, sizeof(struct union_node), M_TEMP, M_WAITOK); (*vpp)->v_flag |= vflag; if (uppervp) (*vpp)->v_type = uppervp->v_type; else (*vpp)->v_type = lowervp->v_type; un = VTOUNION(*vpp); bzero(un, sizeof(*un)); lockinit(&un->un_lock, PVFS, "unlock", VLKTIMEOUT, 0); vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td); un->un_vnode = *vpp; un->un_uppervp = uppervp; un->un_uppersz = VNOVAL; un->un_lowervp = lowervp; un->un_lowersz = VNOVAL; un->un_dirvp = upperdvp; un->un_pvp = dvp; /* only parent dir in new allocation */ if (dvp != NULLVP) VREF(dvp); un->un_dircache = 0; un->un_openl = 0; if (cnp && (lowervp != NULLVP)) { un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK); bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen); un->un_path[cnp->cn_namelen] = '\0'; } else { un->un_path = 0; un->un_dirvp = NULL; } if (docache) { LIST_INSERT_HEAD(&unhead[hash], un, un_cache); un->un_flags |= UN_CACHED; } out: if (docache) union_list_unlock(hash); return (error); } int union_freevp(vp) struct vnode *vp; { struct union_node *un = VTOUNION(vp); if (un->un_flags & UN_CACHED) { un->un_flags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } if (un->un_pvp != NULLVP) { vrele(un->un_pvp); un->un_pvp = NULL; } if (un->un_uppervp != NULLVP) { vrele(un->un_uppervp); un->un_uppervp = NULL; } if (un->un_lowervp != NULLVP) { vrele(un->un_lowervp); un->un_lowervp = NULL; } if (un->un_dirvp != NULLVP) { vrele(un->un_dirvp); un->un_dirvp = NULL; } if (un->un_path) { free(un->un_path, M_TEMP); un->un_path = NULL; } lockdestroy(&un->un_lock); FREE(vp->v_data, M_TEMP); vp->v_data = 0; return (0); } /* * copyfile. 
copy the vnode (fvp) to the vnode (tvp) * using a sequence of reads and writes. both (fvp) * and (tvp) are locked on entry and exit. * * fvp and tvp are both exclusive locked on call, but their refcount's * haven't been bumped at all. */ static int union_copyfile(fvp, tvp, cred, td) struct vnode *fvp; struct vnode *tvp; struct ucred *cred; struct thread *td; { char *buf; struct uio uio; struct iovec iov; int error = 0; /* * strategy: * allocate a buffer of size MAXBSIZE. * loop doing reads and writes, keeping track * of the current uio offset. * give up at the first sign of trouble. */ bzero(&uio, sizeof(uio)); uio.uio_td = td; uio.uio_segflg = UIO_SYSSPACE; uio.uio_offset = 0; VOP_LEASE(fvp, td, cred, LEASE_READ); VOP_LEASE(tvp, td, cred, LEASE_WRITE); buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK); /* ugly loop follows... */ do { off_t offset = uio.uio_offset; int count; int bufoffset; /* * Setup for big read */ uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf; iov.iov_len = MAXBSIZE; uio.uio_resid = iov.iov_len; uio.uio_rw = UIO_READ; if ((error = VOP_READ(fvp, &uio, 0, cred)) != 0) break; /* * Get bytes read, handle read eof case and setup for * write loop */ if ((count = MAXBSIZE - uio.uio_resid) == 0) break; bufoffset = 0; /* * Write until an error occurs or our buffer has been * exhausted, then update the offset for the next read. */ while (bufoffset < count) { uio.uio_iov = &iov; uio.uio_iovcnt = 1; iov.iov_base = buf + bufoffset; iov.iov_len = count - bufoffset; uio.uio_offset = offset + bufoffset; uio.uio_rw = UIO_WRITE; uio.uio_resid = iov.iov_len; if ((error = VOP_WRITE(tvp, &uio, 0, cred)) != 0) break; bufoffset += (count - bufoffset) - uio.uio_resid; } uio.uio_offset = offset + bufoffset; } while (error == 0); free(buf, M_TEMP); return (error); } /* * * un's vnode is assumed to be locked on entry and remains locked on exit. 
*/ int union_copyup(un, docopy, cred, td) struct union_node *un; int docopy; struct ucred *cred; struct thread *td; { int error; struct mount *mp; struct vnode *lvp, *uvp; /* * If the user does not have read permission, the vnode should not * be copied to upper layer. */ vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_ACCESS(un->un_lowervp, VREAD, cred, td); VOP_UNLOCK(un->un_lowervp, 0, td); if (error) return (error); if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0) return (error); if ((error = union_vn_create(&uvp, un, td)) != 0) { vn_finished_write(mp); return (error); } lvp = un->un_lowervp; KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); if (docopy) { /* * XX - should not ignore errors * from VOP_CLOSE */ vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_OPEN(lvp, FREAD, cred, td); if (error == 0 && vn_canvmio(lvp) == TRUE) error = vfs_object_create(lvp, td, cred); if (error == 0) { error = union_copyfile(lvp, uvp, cred, td); VOP_UNLOCK(lvp, 0, td); (void) VOP_CLOSE(lvp, FREAD, cred, td); } if (error == 0) UDEBUG(("union: copied up %s\n", un->un_path)); } VOP_UNLOCK(uvp, 0, td); vn_finished_write(mp); union_newupper(un, uvp); KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); union_vn_close(uvp, FWRITE, cred, td); KASSERT(uvp->v_usecount > 0, ("copy: uvp refcount 0: %d", uvp->v_usecount)); /* * Subsequent IOs will go to the top layer, so * call close on the lower vnode and open on the * upper vnode to ensure that the filesystem keeps * its references counts right. This doesn't do * the right thing with (cred) and (FREAD) though. * Ignoring error returns is not right, either. 
*/ if (error == 0) { int i; for (i = 0; i < un->un_openl; i++) { (void) VOP_CLOSE(lvp, FREAD, cred, td); (void) VOP_OPEN(uvp, FREAD, cred, td); } if (un->un_openl) { if (vn_canvmio(uvp) == TRUE) error = vfs_object_create(uvp, td, cred); } un->un_openl = 0; } return (error); } /* * union_relookup: * * dvp should be locked on entry and will be locked on return. No * net change in the ref count will occur. * * If an error is returned, *vpp will be invalid, otherwise it * will hold a locked, referenced vnode. If *vpp == dvp then * remember that only one exclusive lock is held. */ static int union_relookup(um, dvp, vpp, cnp, cn, path, pathlen) struct union_mount *um; struct vnode *dvp; struct vnode **vpp; struct componentname *cnp; struct componentname *cn; char *path; int pathlen; { int error; /* * A new componentname structure must be faked up because * there is no way to know where the upper level cnp came * from or what it is being used for. This must duplicate * some of the work done by NDINIT, some of the work done * by namei, some of the work done by lookup and some of * the work done by VOP_LOOKUP when given a CREATE flag. * Conclusion: Horrible. */ cn->cn_namelen = pathlen; cn->cn_pnbuf = zalloc(namei_zone); bcopy(path, cn->cn_pnbuf, cn->cn_namelen); cn->cn_pnbuf[cn->cn_namelen] = '\0'; cn->cn_nameiop = CREATE; cn->cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN); cn->cn_thread = cnp->cn_thread; if (um->um_op == UNMNT_ABOVE) cn->cn_cred = cnp->cn_cred; else cn->cn_cred = um->um_cred; cn->cn_nameptr = cn->cn_pnbuf; cn->cn_consume = cnp->cn_consume; VREF(dvp); VOP_UNLOCK(dvp, 0, cnp->cn_thread); /* * Pass dvp unlocked and referenced on call to relookup(). * * If an error occurs, dvp will be returned unlocked and dereferenced. 
*/ if ((error = relookup(dvp, vpp, cn)) != 0) { vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread); return(error); } /* * If no error occurs, dvp will be returned locked with the reference * left as before, and vpp will be returned referenced and locked. * * We want to return with dvp as it was passed to us, so we get * rid of our reference. */ vrele(dvp); return (0); } /* * Create a shadow directory in the upper layer. * The new vnode is returned locked. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the shadow directory, * it is locked (but not ref'd) on entry and return. * (cnp) is the componentname to be created. * (vpp) is the returned newly created shadow directory, which * is returned locked and ref'd */ int union_mkshadow(um, dvp, cnp, vpp) struct union_mount *um; struct vnode *dvp; struct componentname *cnp; struct vnode **vpp; { int error; struct vattr va; struct thread *td = cnp->cn_thread; struct componentname cn; struct mount *mp; if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) return (error); if ((error = union_relookup(um, dvp, vpp, cnp, &cn, cnp->cn_nameptr, cnp->cn_namelen)) != 0) { vn_finished_write(mp); return (error); } if (*vpp) { if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (dvp == *vpp) vrele(*vpp); else vput(*vpp); vn_finished_write(mp); *vpp = NULLVP; return (EEXIST); } /* * policy: when creating the shadow directory in the * upper layer, create it owned by the user who did * the mount, group from parent directory, and mode * 777 modified by umask (ie mostly identical to the * mkdir syscall). 
(jsp, kb) */ VATTR_NULL(&va); va.va_type = VDIR; va.va_mode = um->um_cmode; /* VOP_LEASE: dvp is locked */ VOP_LEASE(dvp, td, cn.cn_cred, LEASE_WRITE); error = VOP_MKDIR(dvp, vpp, &cn, &va); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } /*vput(dvp);*/ vn_finished_write(mp); return (error); } /* * Create a whiteout entry in the upper layer. * * (um) points to the union mount structure for access to the * the mounting process's credentials. * (dvp) is the directory in which to create the whiteout. * it is locked on entry and return. * (cnp) is the componentname to be created. */ int union_mkwhiteout(um, dvp, cnp, path) struct union_mount *um; struct vnode *dvp; struct componentname *cnp; char *path; { int error; struct thread *td = cnp->cn_thread; struct vnode *wvp; struct componentname cn; struct mount *mp; if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0) return (error); error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path)); if (error) { vn_finished_write(mp); return (error); } if (wvp) { if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (wvp == dvp) vrele(wvp); else vput(wvp); vn_finished_write(mp); return (EEXIST); } /* VOP_LEASE: dvp is locked */ VOP_LEASE(dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(dvp, &cn, CREATE); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } vn_finished_write(mp); return (error); } /* * union_vn_create: creates and opens a new shadow file * on the upper union layer. this function is similar * in spirit to calling vn_open but it avoids calling namei(). * the problem with calling namei is that a) it locks too many * things, and b) it doesn't start at the "right" directory, * whereas relookup is told where to start. * * On entry, the vnode associated with un is locked. It remains locked * on return. * * If no error occurs, *vpp contains a locked referenced vnode for your * use. 
If an error occurs *vpp iis undefined. */ static int union_vn_create(vpp, un, td) struct vnode **vpp; struct union_node *un; struct thread *td; { struct vnode *vp; struct ucred *cred = td->td_proc->p_ucred; struct vattr vat; struct vattr *vap = &vat; int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL); int error; - int cmode = UN_FILEMODE & ~td->td_proc->p_fd->fd_cmask; + int cmode; struct componentname cn; *vpp = NULLVP; + FILEDESC_LOCK(td->td_proc->p_fd); + cmode = UN_FILEMODE & ~td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); /* * Build a new componentname structure (for the same * reasons outlines in union_mkshadow). * The difference here is that the file is owned by * the current user, rather than by the person who * did the mount, since the current user needs to be * able to write the file (that's why it is being * copied in the first place). */ cn.cn_namelen = strlen(un->un_path); cn.cn_pnbuf = zalloc(namei_zone); bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1); cn.cn_nameiop = CREATE; cn.cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN); cn.cn_thread = td; cn.cn_cred = td->td_proc->p_ucred; cn.cn_nameptr = cn.cn_pnbuf; cn.cn_consume = 0; /* * Pass dvp unlocked and referenced on call to relookup(). * * If an error occurs, dvp will be returned unlocked and dereferenced. */ VREF(un->un_dirvp); error = relookup(un->un_dirvp, &vp, &cn); if (error) return (error); /* * If no error occurs, dvp will be returned locked with the reference * left as before, and vpp will be returned referenced and locked. */ if (vp) { vput(un->un_dirvp); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } if (vp == un->un_dirvp) vrele(vp); else vput(vp); return (EEXIST); } /* * Good - there was no race to create the file * so go ahead and create it. The permissions * on the file will be 0666 modified by the * current user's umask. Access to the file, while * it is unioned, will require access to the top *and* * bottom files. 
Access when not unioned will simply * require access to the top-level file. * TODO: confirm choice of access permissions. */ VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; VOP_LEASE(un->un_dirvp, td, cred, LEASE_WRITE); error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap); if (cn.cn_flags & HASBUF) { zfree(namei_zone, cn.cn_pnbuf); cn.cn_flags &= ~HASBUF; } vput(un->un_dirvp); if (error) return (error); error = VOP_OPEN(vp, fmode, cred, td); if (error == 0 && vn_canvmio(vp) == TRUE) error = vfs_object_create(vp, td, cred); if (error) { vput(vp); return (error); } vp->v_writecount++; *vpp = vp; return (0); } static int union_vn_close(vp, fmode, cred, td) struct vnode *vp; int fmode; struct ucred *cred; struct thread *td; { if (fmode & FWRITE) --vp->v_writecount; return (VOP_CLOSE(vp, fmode, cred, td)); } #if 0 /* * union_removed_upper: * * called with union_node unlocked. XXX */ void union_removed_upper(un) struct union_node *un; { struct thread *td = curthread; /* XXX */ struct vnode **vpp; /* * Do not set the uppervp to NULLVP. If lowervp is NULLVP, * union node will have neither uppervp nor lowervp. We remove * the union node from cache, so that it will not be referenced. */ union_newupper(un, NULLVP); if (un->un_dircache != 0) { for (vpp = un->un_dircache; *vpp != NULLVP; vpp++) vrele(*vpp); free(un->un_dircache, M_TEMP); un->un_dircache = 0; } if (un->un_flags & UN_CACHED) { un->un_flags &= ~UN_CACHED; LIST_REMOVE(un, un_cache); } } #endif /* * determine whether a whiteout is needed * during a remove/rmdir operation. 
*/ int union_dowhiteout(un, cred, td) struct union_node *un; struct ucred *cred; struct thread *td; { struct vattr va; if (un->un_lowervp != NULLVP) return (1); if (VOP_GETATTR(un->un_uppervp, &va, cred, td) == 0 && (va.va_flags & OPAQUE)) return (1); return (0); } static void union_dircache_r(vp, vppp, cntp) struct vnode *vp; struct vnode ***vppp; int *cntp; { struct union_node *un; if (vp->v_op != union_vnodeop_p) { if (vppp) { VREF(vp); *(*vppp)++ = vp; if (--(*cntp) == 0) panic("union: dircache table too small"); } else { (*cntp)++; } return; } un = VTOUNION(vp); if (un->un_uppervp != NULLVP) union_dircache_r(un->un_uppervp, vppp, cntp); if (un->un_lowervp != NULLVP) union_dircache_r(un->un_lowervp, vppp, cntp); } struct vnode * union_dircache(vp, td) struct vnode *vp; struct thread *td; { int cnt; struct vnode *nvp; struct vnode **vpp; struct vnode **dircache; struct union_node *un; int error; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); dircache = VTOUNION(vp)->un_dircache; nvp = NULLVP; if (dircache == NULL) { cnt = 0; union_dircache_r(vp, 0, &cnt); cnt++; dircache = malloc(cnt * sizeof(struct vnode *), M_TEMP, M_WAITOK); vpp = dircache; union_dircache_r(vp, &vpp, &cnt); *vpp = NULLVP; vpp = dircache + 1; } else { vpp = dircache; do { if (*vpp++ == VTOUNION(vp)->un_uppervp) break; } while (*vpp != NULLVP); } if (*vpp == NULLVP) goto out; /*vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);*/ UDEBUG(("ALLOCVP-3 %p ref %d\n", *vpp, (*vpp ? (*vpp)->v_usecount : -99))); VREF(*vpp); error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, NULL, *vpp, NULLVP, 0); UDEBUG(("ALLOCVP-3B %p ref %d\n", nvp, (*vpp ? 
(*vpp)->v_usecount : -99))); if (error) goto out; VTOUNION(vp)->un_dircache = 0; un = VTOUNION(nvp); un->un_dircache = dircache; out: VOP_UNLOCK(vp, 0, td); return (nvp); } /* * Module glue to remove #ifdef UNION from vfs_syscalls.c */ static int union_dircheck(struct thread *td, struct vnode **vp, struct file *fp) { int error = 0; if ((*vp)->v_op == union_vnodeop_p) { struct vnode *lvp; lvp = union_dircache(*vp, td); if (lvp != NULLVP) { struct vattr va; /* * If the directory is opaque, * then don't show lower entries */ error = VOP_GETATTR(*vp, &va, fp->f_cred, td); if (va.va_flags & OPAQUE) { vput(lvp); lvp = NULL; } } if (lvp != NULLVP) { error = VOP_OPEN(lvp, FREAD, fp->f_cred, td); if (error == 0 && vn_canvmio(lvp) == TRUE) error = vfs_object_create(lvp, td, fp->f_cred); if (error) { vput(lvp); return (error); } VOP_UNLOCK(lvp, 0, td); + FILE_LOCK(fp); fp->f_data = (caddr_t) lvp; fp->f_offset = 0; + FILE_UNLOCK(fp); error = vn_close(*vp, FREAD, fp->f_cred, td); if (error) return (error); *vp = lvp; return -1; /* goto unionread */ } } return error; } static int union_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: union_dircheckp = union_dircheck; break; case MOD_UNLOAD: union_dircheckp = NULL; break; default: break; } return 0; } static moduledata_t union_mod = { "union_dircheck", union_modevent, NULL }; DECLARE_MODULE(union_dircheck, union_mod, SI_SUB_VFS, SI_ORDER_ANY); Index: head/sys/fs/unionfs/union_vfsops.c =================================================================== --- head/sys/fs/unionfs/union_vfsops.c (revision 89305) +++ head/sys/fs/unionfs/union_vfsops.c (revision 89306) @@ -1,488 +1,490 @@ /* * Copyright (c) 1994, 1995 The Regents of the University of California. * Copyright (c) 1994, 1995 Jan-Simon Pendry. * All rights reserved. * * This code is derived from software donated to Berkeley by * Jan-Simon Pendry. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95 * $FreeBSD$ */ /* * Union Layer */ #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_UNIONFSMNT, "UNION mount", "UNION mount structure"); extern int union_init __P((struct vfsconf *)); static int union_mount __P((struct mount *mp, char *path, caddr_t data, struct nameidata *ndp, struct thread *td)); static int union_root __P((struct mount *mp, struct vnode **vpp)); static int union_statfs __P((struct mount *mp, struct statfs *sbp, struct thread *td)); static int union_unmount __P((struct mount *mp, int mntflags, struct thread *td)); /* * Mount union filesystem */ static int union_mount(mp, path, data, ndp, td) struct mount *mp; char *path; caddr_t data; struct nameidata *ndp; struct thread *td; { int error = 0; struct union_args args; struct vnode *lowerrootvp = NULLVP; struct vnode *upperrootvp = NULLVP; struct union_mount *um = 0; struct ucred *cred = 0; char *cp = 0; int len; u_int size; UDEBUG(("union_mount(mp = %p)\n", (void *)mp)); /* * Disable clustered write, otherwise system becomes unstable. */ mp->mnt_flag |= MNT_NOCLUSTERW; /* * Update is a no-op */ if (mp->mnt_flag & MNT_UPDATE) { /* * Need to provide. * 1. a way to convert between rdonly and rdwr mounts. * 2. support for nfs exports. */ error = EOPNOTSUPP; goto bad; } /* * Get argument */ error = copyin(data, (caddr_t)&args, sizeof(struct union_args)); if (error) goto bad; /* * Obtain lower vnode. Vnode is stored in mp->mnt_vnodecovered. * We need to reference it but not lock it. */ lowerrootvp = mp->mnt_vnodecovered; VREF(lowerrootvp); #if 0 /* * Unlock lower node to avoid deadlock. */ if (lowerrootvp->v_op == union_vnodeop_p) VOP_UNLOCK(lowerrootvp, 0, td); #endif /* * Obtain upper vnode by calling namei() on the path. The * upperrootvp will be turned referenced but not locked. 
*/ NDINIT(ndp, LOOKUP, FOLLOW|WANTPARENT, UIO_USERSPACE, args.target, td); error = namei(ndp); #if 0 if (lowerrootvp->v_op == union_vnodeop_p) vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY, td); #endif if (error) goto bad; NDFREE(ndp, NDF_ONLY_PNBUF); upperrootvp = ndp->ni_vp; vrele(ndp->ni_dvp); ndp->ni_dvp = NULL; UDEBUG(("mount_root UPPERVP %p locked = %d\n", upperrootvp, VOP_ISLOCKED(upperrootvp, NULL))); /* * Check multi union mount to avoid `lock myself again' panic. * Also require that it be a directory. */ if (upperrootvp == VTOUNION(lowerrootvp)->un_uppervp) { #ifdef DIAGNOSTIC printf("union_mount: multi union mount?\n"); #endif error = EDEADLK; goto bad; } if (upperrootvp->v_type != VDIR) { error = EINVAL; goto bad; } /* * Allocate our union_mount structure and populate the fields. * The vnode references are stored in the union_mount as held, * unlocked references. Depending on the _BELOW flag, the * filesystems are viewed in a different order. In effect this * is the same as providing a mount-under option to the mount * syscall. 
*/ um = (struct union_mount *) malloc(sizeof(struct union_mount), M_UNIONFSMNT, M_WAITOK | M_ZERO); um->um_op = args.mntflags & UNMNT_OPMASK; switch (um->um_op) { case UNMNT_ABOVE: um->um_lowervp = lowerrootvp; um->um_uppervp = upperrootvp; upperrootvp = NULL; lowerrootvp = NULL; break; case UNMNT_BELOW: um->um_lowervp = upperrootvp; um->um_uppervp = lowerrootvp; upperrootvp = NULL; lowerrootvp = NULL; break; case UNMNT_REPLACE: vrele(lowerrootvp); lowerrootvp = NULL; um->um_uppervp = upperrootvp; um->um_lowervp = lowerrootvp; upperrootvp = NULL; break; default: error = EINVAL; goto bad; } /* * Unless the mount is readonly, ensure that the top layer * supports whiteout operations */ if ((mp->mnt_flag & MNT_RDONLY) == 0) { error = VOP_WHITEOUT(um->um_uppervp, NULL, LOOKUP); if (error) goto bad; } um->um_cred = crhold(td->td_proc->p_ucred); + FILEDESC_LOCK(td->td_proc->p_fd); um->um_cmode = UN_DIRMODE &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); /* * Depending on what you think the MNT_LOCAL flag might mean, * you may want the && to be || on the conditional below. * At the moment it has been defined that the filesystem is * only local if it is all local, ie the MNT_LOCAL flag implies * that the entire namespace is local. If you think the MNT_LOCAL * flag implies that some of the files might be stored locally * then you will want to change the conditional. */ if (um->um_op == UNMNT_ABOVE) { if (((um->um_lowervp == NULLVP) || (um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) && (um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL)) mp->mnt_flag |= MNT_LOCAL; } /* * Copy in the upper layer's RDONLY flag. This is for the benefit * of lookup() which explicitly checks the flag, rather than asking * the filesystem for its own opinion. This means, that an update * mount of the underlying filesystem to go from rdonly to rdwr * will leave the unioned view as read-only. 
*/ mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY); mp->mnt_data = (qaddr_t) um; vfs_getnewfsid(mp); switch (um->um_op) { case UNMNT_ABOVE: cp = ":"; break; case UNMNT_BELOW: cp = ":"; break; case UNMNT_REPLACE: cp = ""; break; } len = strlen(cp); bcopy(cp, mp->mnt_stat.f_mntfromname, len); cp = mp->mnt_stat.f_mntfromname + len; len = MNAMELEN - len; (void) copyinstr(args.target, cp, len - 1, &size); bzero(cp + size, len - size); (void)union_statfs(mp, &mp->mnt_stat, td); UDEBUG(("union_mount: from %s, on %s\n", mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname)); return (0); bad: if (um) { if (um->um_uppervp) vrele(um->um_uppervp); if (um->um_lowervp) vrele(um->um_lowervp); /* XXX other fields */ free(um, M_UNIONFSMNT); } if (cred) crfree(cred); if (upperrootvp) vrele(upperrootvp); if (lowerrootvp) vrele(lowerrootvp); return (error); } /* * Free reference to union layer */ static int union_unmount(mp, mntflags, td) struct mount *mp; int mntflags; struct thread *td; { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int error; int freeing; int flags = 0; UDEBUG(("union_unmount(mp = %p)\n", (void *)mp)); if (mntflags & MNT_FORCE) flags |= FORCECLOSE; /* * Keep flushing vnodes from the mount list. * This is needed because of the un_pvp held * reference to the parent vnode. * If more vnodes have been freed on a given pass, * then try again. The loop will iterate at most * (d) times, where (d) is the maximum tree depth * in the filesystem. */ for (freeing = 0; (error = vflush(mp, 0, flags)) != 0;) { struct vnode *vp; int n; /* count #vnodes held on mount list */ mtx_lock(&mntvnode_mtx); n = 0; TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) n++; mtx_unlock(&mntvnode_mtx); /* if this is unchanged then stop */ if (n == freeing) break; /* otherwise try one more time */ freeing = n; } /* If the most recent vflush failed, the filesystem is still busy. */ if (error) return (error); /* * Discard references to upper and lower target vnodes. 
*/ if (um->um_lowervp) vrele(um->um_lowervp); vrele(um->um_uppervp); crfree(um->um_cred); /* * Finally, throw away the union_mount structure */ free(mp->mnt_data, M_UNIONFSMNT); /* XXX */ mp->mnt_data = 0; return (0); } static int union_root(mp, vpp) struct mount *mp; struct vnode **vpp; { struct union_mount *um = MOUNTTOUNIONMOUNT(mp); int error; /* * Supply an unlocked reference to um_uppervp and to um_lowervp. It * is possible for um_uppervp to be locked without the associated * root union_node being locked. We let union_allocvp() deal with * it. */ UDEBUG(("union_root UPPERVP %p locked = %d\n", um->um_uppervp, VOP_ISLOCKED(um->um_uppervp, NULL))); VREF(um->um_uppervp); if (um->um_lowervp) VREF(um->um_lowervp); error = union_allocvp(vpp, mp, NULLVP, NULLVP, NULL, um->um_uppervp, um->um_lowervp, 1); UDEBUG(("error %d\n", error)); UDEBUG(("union_root2 UPPERVP %p locked = %d\n", um->um_uppervp, VOP_ISLOCKED(um->um_uppervp, NULL))); return (error); } static int union_statfs(mp, sbp, td) struct mount *mp; struct statfs *sbp; struct thread *td; { int error; struct union_mount *um = MOUNTTOUNIONMOUNT(mp); struct statfs mstat; int lbsize; UDEBUG(("union_statfs(mp = %p, lvp = %p, uvp = %p)\n", (void *)mp, (void *)um->um_lowervp, (void *)um->um_uppervp)); bzero(&mstat, sizeof(mstat)); if (um->um_lowervp) { error = VFS_STATFS(um->um_lowervp->v_mount, &mstat, td); if (error) return (error); } /* now copy across the "interesting" information and fake the rest */ #if 0 sbp->f_type = mstat.f_type; sbp->f_flags = mstat.f_flags; sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; #endif lbsize = mstat.f_bsize; sbp->f_blocks = mstat.f_blocks; sbp->f_bfree = mstat.f_bfree; sbp->f_bavail = mstat.f_bavail; sbp->f_files = mstat.f_files; sbp->f_ffree = mstat.f_ffree; error = VFS_STATFS(um->um_uppervp->v_mount, &mstat, td); if (error) return (error); sbp->f_flags = mstat.f_flags; sbp->f_bsize = mstat.f_bsize; sbp->f_iosize = mstat.f_iosize; /* * if the lower and upper 
blocksizes differ, then frig the * block counts so that the sizes reported by df make some * kind of sense. none of this makes sense though. */ if (mstat.f_bsize != lbsize) sbp->f_blocks = ((off_t) sbp->f_blocks * lbsize) / mstat.f_bsize; /* * The "total" fields count total resources in all layers, * the "free" fields count only those resources which are * free in the upper layer (since only the upper layer * is writeable). */ sbp->f_blocks += mstat.f_blocks; sbp->f_bfree = mstat.f_bfree; sbp->f_bavail = mstat.f_bavail; sbp->f_files += mstat.f_files; sbp->f_ffree = mstat.f_ffree; if (sbp != &mp->mnt_stat) { sbp->f_type = mp->mnt_vfc->vfc_typenum; bcopy(&mp->mnt_stat.f_fsid, &sbp->f_fsid, sizeof(sbp->f_fsid)); bcopy(mp->mnt_stat.f_mntonname, sbp->f_mntonname, MNAMELEN); bcopy(mp->mnt_stat.f_mntfromname, sbp->f_mntfromname, MNAMELEN); } return (0); } static struct vfsops union_vfsops = { union_mount, vfs_stdstart, /* underlying start already done */ union_unmount, union_root, vfs_stdquotactl, union_statfs, vfs_stdsync, /* XXX assumes no cached data on union level */ vfs_stdvget, vfs_stdfhtovp, vfs_stdcheckexp, vfs_stdvptofh, union_init, vfs_stduninit, vfs_stdextattrctl, }; VFS_SET(union_vfsops, unionfs, VFCF_LOOPBACK); Index: head/sys/i386/ibcs2/ibcs2_fcntl.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_fcntl.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_fcntl.c (revision 89306) @@ -1,331 +1,335 @@ /* * Copyright (c) 1995 Scott Bartram * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #include "opt_spx_hack.h" #include #include #include #include #include #include #include #include #include #include #include #include #include static void cvt_iflock2flock __P((struct ibcs2_flock *, struct flock *)); static void cvt_flock2iflock __P((struct flock *, struct ibcs2_flock *)); static int cvt_o_flags __P((int)); static int oflags2ioflags __P((int)); static int ioflags2oflags __P((int)); static int cvt_o_flags(flags) int flags; { int r = 0; /* convert mode into NetBSD mode */ if (flags & IBCS2_O_WRONLY) r |= O_WRONLY; if (flags & IBCS2_O_RDWR) r |= O_RDWR; if (flags & (IBCS2_O_NDELAY | IBCS2_O_NONBLOCK)) r |= O_NONBLOCK; if (flags & IBCS2_O_APPEND) r |= O_APPEND; if (flags & IBCS2_O_SYNC) r |= O_FSYNC; if (flags & IBCS2_O_CREAT) r |= O_CREAT; if (flags & IBCS2_O_TRUNC) r |= O_TRUNC /* | O_CREAT ??? 
*/; if (flags & IBCS2_O_EXCL) r |= O_EXCL; if (flags & IBCS2_O_RDONLY) r |= O_RDONLY; if (flags & IBCS2_O_PRIV) r |= O_EXLOCK; if (flags & IBCS2_O_NOCTTY) r |= O_NOCTTY; return r; } static void cvt_flock2iflock(flp, iflp) struct flock *flp; struct ibcs2_flock *iflp; { switch (flp->l_type) { case F_RDLCK: iflp->l_type = IBCS2_F_RDLCK; break; case F_WRLCK: iflp->l_type = IBCS2_F_WRLCK; break; case F_UNLCK: iflp->l_type = IBCS2_F_UNLCK; break; } iflp->l_whence = (short)flp->l_whence; iflp->l_start = (ibcs2_off_t)flp->l_start; iflp->l_len = (ibcs2_off_t)flp->l_len; iflp->l_sysid = 0; iflp->l_pid = (ibcs2_pid_t)flp->l_pid; } #ifdef DEBUG_IBCS2 static void print_flock(struct flock *flp) { printf("flock: start=%x len=%x pid=%d type=%d whence=%d\n", (int)flp->l_start, (int)flp->l_len, (int)flp->l_pid, flp->l_type, flp->l_whence); } #endif static void cvt_iflock2flock(iflp, flp) struct ibcs2_flock *iflp; struct flock *flp; { flp->l_start = (off_t)iflp->l_start; flp->l_len = (off_t)iflp->l_len; flp->l_pid = (pid_t)iflp->l_pid; switch (iflp->l_type) { case IBCS2_F_RDLCK: flp->l_type = F_RDLCK; break; case IBCS2_F_WRLCK: flp->l_type = F_WRLCK; break; case IBCS2_F_UNLCK: flp->l_type = F_UNLCK; break; } flp->l_whence = iflp->l_whence; } /* convert iBCS2 mode into NetBSD mode */ static int ioflags2oflags(flags) int flags; { int r = 0; if (flags & IBCS2_O_RDONLY) r |= O_RDONLY; if (flags & IBCS2_O_WRONLY) r |= O_WRONLY; if (flags & IBCS2_O_RDWR) r |= O_RDWR; if (flags & IBCS2_O_NDELAY) r |= O_NONBLOCK; if (flags & IBCS2_O_APPEND) r |= O_APPEND; if (flags & IBCS2_O_SYNC) r |= O_FSYNC; if (flags & IBCS2_O_NONBLOCK) r |= O_NONBLOCK; if (flags & IBCS2_O_CREAT) r |= O_CREAT; if (flags & IBCS2_O_TRUNC) r |= O_TRUNC; if (flags & IBCS2_O_EXCL) r |= O_EXCL; if (flags & IBCS2_O_NOCTTY) r |= O_NOCTTY; return r; } /* convert NetBSD mode into iBCS2 mode */ static int oflags2ioflags(flags) int flags; { int r = 0; if (flags & O_RDONLY) r |= IBCS2_O_RDONLY; if (flags & O_WRONLY) r |= 
IBCS2_O_WRONLY; if (flags & O_RDWR) r |= IBCS2_O_RDWR; if (flags & O_NDELAY) r |= IBCS2_O_NONBLOCK; if (flags & O_APPEND) r |= IBCS2_O_APPEND; if (flags & O_FSYNC) r |= IBCS2_O_SYNC; if (flags & O_NONBLOCK) r |= IBCS2_O_NONBLOCK; if (flags & O_CREAT) r |= IBCS2_O_CREAT; if (flags & O_TRUNC) r |= IBCS2_O_TRUNC; if (flags & O_EXCL) r |= IBCS2_O_EXCL; if (flags & O_NOCTTY) r |= IBCS2_O_NOCTTY; return r; } int ibcs2_open(td, uap) struct thread *td; struct ibcs2_open_args *uap; { struct proc *p = td->td_proc; int noctty = SCARG(uap, flags) & IBCS2_O_NOCTTY; int ret; caddr_t sg = stackgap_init(); SCARG(uap, flags) = cvt_o_flags(SCARG(uap, flags)); if (SCARG(uap, flags) & O_CREAT) CHECKALTCREAT(td, &sg, SCARG(uap, path)); else CHECKALTEXIST(td, &sg, SCARG(uap, path)); ret = open(td, (struct open_args *)uap); #ifdef SPX_HACK if (ret == ENXIO) { if (!strcmp(SCARG(uap, path), "/compat/ibcs2/dev/spx")) ret = spx_open(td, uap); } else #endif /* SPX_HACK */ PROC_LOCK(p); if (!ret && !noctty && SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) { - struct filedesc *fdp = p->p_fd; - struct file *fp = fdp->fd_ofiles[td->td_retval[0]]; + struct file *fp; + fp = ffind_hold(td, td->td_retval[0]); PROC_UNLOCK(p); + if (fp == NULL) + return (EBADF); + /* ignore any error, just give it a try */ if (fp->f_type == DTYPE_VNODE) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td); + fdrop(fp, td); } else PROC_UNLOCK(p); return ret; } int ibcs2_creat(td, uap) struct thread *td; struct ibcs2_creat_args *uap; { struct open_args cup; caddr_t sg = stackgap_init(); CHECKALTCREAT(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, mode) = SCARG(uap, mode); SCARG(&cup, flags) = O_WRONLY | O_CREAT | O_TRUNC; return open(td, &cup); } int ibcs2_access(td, uap) struct thread *td; struct ibcs2_access_args *uap; { struct access_args cup; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, flags) = SCARG(uap, flags); return 
access(td, &cup); } int ibcs2_fcntl(td, uap) struct thread *td; struct ibcs2_fcntl_args *uap; { int error; struct fcntl_args fa; struct flock *flp; struct ibcs2_flock ifl; switch(SCARG(uap, cmd)) { case IBCS2_F_DUPFD: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_DUPFD; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); return fcntl(td, &fa); case IBCS2_F_GETFD: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_GETFD; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); return fcntl(td, &fa); case IBCS2_F_SETFD: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETFD; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); return fcntl(td, &fa); case IBCS2_F_GETFL: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_GETFL; SCARG(&fa, arg) = (/* XXX */ int)SCARG(uap, arg); error = fcntl(td, &fa); if (error) return error; td->td_retval[0] = oflags2ioflags(td->td_retval[0]); return error; case IBCS2_F_SETFL: SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETFL; SCARG(&fa, arg) = (/* XXX */ int) ioflags2oflags((int)SCARG(uap, arg)); return fcntl(td, &fa); case IBCS2_F_GETLK: { caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(*flp)); error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&ifl, ibcs2_flock_len); if (error) return error; cvt_iflock2flock(&ifl, flp); SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_GETLK; SCARG(&fa, arg) = (/* XXX */ int)flp; error = fcntl(td, &fa); if (error) return error; cvt_flock2iflock(flp, &ifl); return copyout((caddr_t)&ifl, (caddr_t)SCARG(uap, arg), ibcs2_flock_len); } case IBCS2_F_SETLK: { caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(*flp)); error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&ifl, ibcs2_flock_len); if (error) return error; cvt_iflock2flock(&ifl, flp); SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETLK; SCARG(&fa, arg) = (/* XXX */ int)flp; return fcntl(td, &fa); } case IBCS2_F_SETLKW: { caddr_t sg = stackgap_init(); flp = stackgap_alloc(&sg, sizeof(*flp)); error = 
copyin((caddr_t)SCARG(uap, arg), (caddr_t)&ifl, ibcs2_flock_len); if (error) return error; cvt_iflock2flock(&ifl, flp); SCARG(&fa, fd) = SCARG(uap, fd); SCARG(&fa, cmd) = F_SETLKW; SCARG(&fa, arg) = (/* XXX */ int)flp; return fcntl(td, &fa); } } return ENOSYS; } Index: head/sys/i386/ibcs2/ibcs2_ioctl.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 89306) @@ -1,646 +1,693 @@ /* $NetBSD: ibcs2_ioctl.c,v 1.6 1995/03/14 15:12:28 scottb Exp $ */ /* * Copyright (c) 1994, 1995 Scott Bartram * All rights reserved. * * based on compat/sunos/sun_ioctl.c * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void stios2btios __P((struct ibcs2_termios *, struct termios *)); static void btios2stios __P((struct termios *, struct ibcs2_termios *)); static void stios2stio __P((struct ibcs2_termios *, struct ibcs2_termio *)); static void stio2stios __P((struct ibcs2_termio *, struct ibcs2_termios *)); +static int ibcs2_do_ioctl __P((struct proc *, struct ibcs2_ioctl_args *, + struct file *)); int ibcs2_gtty(struct thread *td, struct ibcs2_gtty_args *args) { struct ioctl_args ioctl_arg; ioctl_arg.fd = args->fd; ioctl_arg.com = TIOCGETC; ioctl_arg.data = (caddr_t)args->buf; return ioctl(td, &ioctl_arg); } int ibcs2_stty(struct thread *td, struct ibcs2_stty_args *args) { struct ioctl_args ioctl_arg; ioctl_arg.fd = args->fd; ioctl_arg.com = TIOCSETC; ioctl_arg.data = (caddr_t)args->buf; return ioctl(td, &ioctl_arg); } /* * iBCS2 ioctl calls. 
*/ static struct speedtab sptab[] = { { 0, 0 }, { 50, 1 }, { 75, 2 }, { 110, 3 }, { 134, 4 }, { 135, 4 }, { 150, 5 }, { 200, 6 }, { 300, 7 }, { 600, 8 }, { 1200, 9 }, { 1800, 10 }, { 2400, 11 }, { 4800, 12 }, { 9600, 13 }, { 19200, 14 }, { 38400, 15 }, { -1, -1 } }; static u_long s2btab[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, 9600, 19200, 38400, }; static void stios2btios(st, bt) struct ibcs2_termios *st; struct termios *bt; { register u_long l, r; l = st->c_iflag; r = 0; if (l & IBCS2_IGNBRK) r |= IGNBRK; if (l & IBCS2_BRKINT) r |= BRKINT; if (l & IBCS2_IGNPAR) r |= IGNPAR; if (l & IBCS2_PARMRK) r |= PARMRK; if (l & IBCS2_INPCK) r |= INPCK; if (l & IBCS2_ISTRIP) r |= ISTRIP; if (l & IBCS2_INLCR) r |= INLCR; if (l & IBCS2_IGNCR) r |= IGNCR; if (l & IBCS2_ICRNL) r |= ICRNL; if (l & IBCS2_IXON) r |= IXON; if (l & IBCS2_IXANY) r |= IXANY; if (l & IBCS2_IXOFF) r |= IXOFF; if (l & IBCS2_IMAXBEL) r |= IMAXBEL; bt->c_iflag = r; l = st->c_oflag; r = 0; if (l & IBCS2_OPOST) r |= OPOST; if (l & IBCS2_ONLCR) r |= ONLCR; if (l & IBCS2_TAB3) r |= OXTABS; bt->c_oflag = r; l = st->c_cflag; r = 0; switch (l & IBCS2_CSIZE) { case IBCS2_CS5: r |= CS5; break; case IBCS2_CS6: r |= CS6; break; case IBCS2_CS7: r |= CS7; break; case IBCS2_CS8: r |= CS8; break; } if (l & IBCS2_CSTOPB) r |= CSTOPB; if (l & IBCS2_CREAD) r |= CREAD; if (l & IBCS2_PARENB) r |= PARENB; if (l & IBCS2_PARODD) r |= PARODD; if (l & IBCS2_HUPCL) r |= HUPCL; if (l & IBCS2_CLOCAL) r |= CLOCAL; bt->c_cflag = r; bt->c_ispeed = bt->c_ospeed = s2btab[l & 0x0000000f]; l = st->c_lflag; r = 0; if (l & IBCS2_ISIG) r |= ISIG; if (l & IBCS2_ICANON) r |= ICANON; if (l & IBCS2_ECHO) r |= ECHO; if (l & IBCS2_ECHOE) r |= ECHOE; if (l & IBCS2_ECHOK) r |= ECHOK; if (l & IBCS2_ECHONL) r |= ECHONL; if (l & IBCS2_NOFLSH) r |= NOFLSH; if (l & IBCS2_TOSTOP) r |= TOSTOP; bt->c_lflag = r; bt->c_cc[VINTR] = st->c_cc[IBCS2_VINTR] ? 
st->c_cc[IBCS2_VINTR] : _POSIX_VDISABLE; bt->c_cc[VQUIT] = st->c_cc[IBCS2_VQUIT] ? st->c_cc[IBCS2_VQUIT] : _POSIX_VDISABLE; bt->c_cc[VERASE] = st->c_cc[IBCS2_VERASE] ? st->c_cc[IBCS2_VERASE] : _POSIX_VDISABLE; bt->c_cc[VKILL] = st->c_cc[IBCS2_VKILL] ? st->c_cc[IBCS2_VKILL] : _POSIX_VDISABLE; if (bt->c_lflag & ICANON) { bt->c_cc[VEOF] = st->c_cc[IBCS2_VEOF] ? st->c_cc[IBCS2_VEOF] : _POSIX_VDISABLE; bt->c_cc[VEOL] = st->c_cc[IBCS2_VEOL] ? st->c_cc[IBCS2_VEOL] : _POSIX_VDISABLE; } else { bt->c_cc[VMIN] = st->c_cc[IBCS2_VMIN]; bt->c_cc[VTIME] = st->c_cc[IBCS2_VTIME]; } bt->c_cc[VEOL2] = st->c_cc[IBCS2_VEOL2] ? st->c_cc[IBCS2_VEOL2] : _POSIX_VDISABLE; #if 0 bt->c_cc[VSWTCH] = st->c_cc[IBCS2_VSWTCH] ? st->c_cc[IBCS2_VSWTCH] : _POSIX_VDISABLE; #endif bt->c_cc[VSTART] = st->c_cc[IBCS2_VSTART] ? st->c_cc[IBCS2_VSTART] : _POSIX_VDISABLE; bt->c_cc[VSTOP] = st->c_cc[IBCS2_VSTOP] ? st->c_cc[IBCS2_VSTOP] : _POSIX_VDISABLE; bt->c_cc[VSUSP] = st->c_cc[IBCS2_VSUSP] ? st->c_cc[IBCS2_VSUSP] : _POSIX_VDISABLE; bt->c_cc[VDSUSP] = _POSIX_VDISABLE; bt->c_cc[VREPRINT] = _POSIX_VDISABLE; bt->c_cc[VDISCARD] = _POSIX_VDISABLE; bt->c_cc[VWERASE] = _POSIX_VDISABLE; bt->c_cc[VLNEXT] = _POSIX_VDISABLE; bt->c_cc[VSTATUS] = _POSIX_VDISABLE; } static void btios2stios(bt, st) struct termios *bt; struct ibcs2_termios *st; { register u_long l, r; l = bt->c_iflag; r = 0; if (l & IGNBRK) r |= IBCS2_IGNBRK; if (l & BRKINT) r |= IBCS2_BRKINT; if (l & IGNPAR) r |= IBCS2_IGNPAR; if (l & PARMRK) r |= IBCS2_PARMRK; if (l & INPCK) r |= IBCS2_INPCK; if (l & ISTRIP) r |= IBCS2_ISTRIP; if (l & INLCR) r |= IBCS2_INLCR; if (l & IGNCR) r |= IBCS2_IGNCR; if (l & ICRNL) r |= IBCS2_ICRNL; if (l & IXON) r |= IBCS2_IXON; if (l & IXANY) r |= IBCS2_IXANY; if (l & IXOFF) r |= IBCS2_IXOFF; if (l & IMAXBEL) r |= IBCS2_IMAXBEL; st->c_iflag = r; l = bt->c_oflag; r = 0; if (l & OPOST) r |= IBCS2_OPOST; if (l & ONLCR) r |= IBCS2_ONLCR; if (l & OXTABS) r |= IBCS2_TAB3; st->c_oflag = r; l = bt->c_cflag; r = 0; switch (l & CSIZE) { 
case CS5: r |= IBCS2_CS5; break; case CS6: r |= IBCS2_CS6; break; case CS7: r |= IBCS2_CS7; break; case CS8: r |= IBCS2_CS8; break; } if (l & CSTOPB) r |= IBCS2_CSTOPB; if (l & CREAD) r |= IBCS2_CREAD; if (l & PARENB) r |= IBCS2_PARENB; if (l & PARODD) r |= IBCS2_PARODD; if (l & HUPCL) r |= IBCS2_HUPCL; if (l & CLOCAL) r |= IBCS2_CLOCAL; st->c_cflag = r; l = bt->c_lflag; r = 0; if (l & ISIG) r |= IBCS2_ISIG; if (l & ICANON) r |= IBCS2_ICANON; if (l & ECHO) r |= IBCS2_ECHO; if (l & ECHOE) r |= IBCS2_ECHOE; if (l & ECHOK) r |= IBCS2_ECHOK; if (l & ECHONL) r |= IBCS2_ECHONL; if (l & NOFLSH) r |= IBCS2_NOFLSH; if (l & TOSTOP) r |= IBCS2_TOSTOP; st->c_lflag = r; l = ttspeedtab(bt->c_ospeed, sptab); if ((int)l >= 0) st->c_cflag |= l; st->c_cc[IBCS2_VINTR] = bt->c_cc[VINTR] != _POSIX_VDISABLE ? bt->c_cc[VINTR] : 0; st->c_cc[IBCS2_VQUIT] = bt->c_cc[VQUIT] != _POSIX_VDISABLE ? bt->c_cc[VQUIT] : 0; st->c_cc[IBCS2_VERASE] = bt->c_cc[VERASE] != _POSIX_VDISABLE ? bt->c_cc[VERASE] : 0; st->c_cc[IBCS2_VKILL] = bt->c_cc[VKILL] != _POSIX_VDISABLE ? bt->c_cc[VKILL] : 0; if (bt->c_lflag & ICANON) { st->c_cc[IBCS2_VEOF] = bt->c_cc[VEOF] != _POSIX_VDISABLE ? bt->c_cc[VEOF] : 0; st->c_cc[IBCS2_VEOL] = bt->c_cc[VEOL] != _POSIX_VDISABLE ? bt->c_cc[VEOL] : 0; } else { st->c_cc[IBCS2_VMIN] = bt->c_cc[VMIN]; st->c_cc[IBCS2_VTIME] = bt->c_cc[VTIME]; } st->c_cc[IBCS2_VEOL2] = bt->c_cc[VEOL2] != _POSIX_VDISABLE ? bt->c_cc[VEOL2] : 0; st->c_cc[IBCS2_VSWTCH] = 0; st->c_cc[IBCS2_VSUSP] = bt->c_cc[VSUSP] != _POSIX_VDISABLE ? bt->c_cc[VSUSP] : 0; st->c_cc[IBCS2_VSTART] = bt->c_cc[VSTART] != _POSIX_VDISABLE ? bt->c_cc[VSTART] : 0; st->c_cc[IBCS2_VSTOP] = bt->c_cc[VSTOP] != _POSIX_VDISABLE ? 
bt->c_cc[VSTOP] : 0; st->c_line = 0; } static void stios2stio(ts, t) struct ibcs2_termios *ts; struct ibcs2_termio *t; { t->c_iflag = ts->c_iflag; t->c_oflag = ts->c_oflag; t->c_cflag = ts->c_cflag; t->c_lflag = ts->c_lflag; t->c_line = ts->c_line; bcopy(ts->c_cc, t->c_cc, IBCS2_NCC); } static void stio2stios(t, ts) struct ibcs2_termio *t; struct ibcs2_termios *ts; { ts->c_iflag = t->c_iflag; ts->c_oflag = t->c_oflag; ts->c_cflag = t->c_cflag; ts->c_lflag = t->c_lflag; ts->c_line = t->c_line; bcopy(t->c_cc, ts->c_cc, IBCS2_NCC); } int ibcs2_ioctl(td, uap) struct thread *td; struct ibcs2_ioctl_args *uap; { struct proc *p = td->td_proc; - struct filedesc *fdp = p->p_fd; struct file *fp; int error; - if (SCARG(uap, fd) < 0 || SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) { + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { DPRINTF(("ibcs2_ioctl(%d): bad fd %d ", p->p_pid, SCARG(uap, fd))); return EBADF; } if ((fp->f_flag & (FREAD|FWRITE)) == 0) { + fdrop(fp, td); DPRINTF(("ibcs2_ioctl(%d): bad fp flag ", p->p_pid)); return EBADF; } switch (SCARG(uap, cmd)) { case IBCS2_TCGETA: case IBCS2_XCGETA: case IBCS2_OXCGETA: { struct termios bts; struct ibcs2_termios sts; struct ibcs2_termio st; if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts, td)) != 0) - return error; + break; btios2stios (&bts, &sts); if (SCARG(uap, cmd) == IBCS2_TCGETA) { stios2stio (&sts, &st); error = copyout((caddr_t)&st, SCARG(uap, data), sizeof (st)); #ifdef DEBUG_IBCS2 if (error) DPRINTF(("ibcs2_ioctl(%d): copyout failed ", p->p_pid)); #endif - return error; - } else - return copyout((caddr_t)&sts, SCARG(uap, data), + break; + } else { + error = copyout((caddr_t)&sts, SCARG(uap, data), sizeof (sts)); + break; + } /*NOTREACHED*/ } case IBCS2_TCSETA: case IBCS2_TCSETAW: case IBCS2_TCSETAF: { struct termios bts; struct ibcs2_termios sts; struct ibcs2_termio st; if ((error = copyin(SCARG(uap, data), (caddr_t)&st, sizeof(st))) != 0) { DPRINTF(("ibcs2_ioctl(%d): 
TCSET copyin failed ", p->p_pid)); - return error; + break; } /* get full BSD termios so we don't lose information */ if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts, td)) != 0) { DPRINTF(("ibcs2_ioctl(%d): TCSET ctl failed fd %d ", p->p_pid, SCARG(uap, fd))); - return error; + break; } /* * convert to iBCS2 termios, copy in information from * termio, and convert back, then set new values. */ btios2stios(&bts, &sts); stio2stios(&st, &sts); stios2btios(&sts, &bts); - return fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_TCSETA + TIOCSETA, + error = fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_TCSETA + TIOCSETA, (caddr_t)&bts, td); + break; } case IBCS2_XCSETA: case IBCS2_XCSETAW: case IBCS2_XCSETAF: { struct termios bts; struct ibcs2_termios sts; if ((error = copyin(SCARG(uap, data), (caddr_t)&sts, - sizeof (sts))) != 0) { - return error; - } + sizeof (sts))) != 0) + break; stios2btios (&sts, &bts); - return fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_XCSETA + TIOCSETA, + error = fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_XCSETA + TIOCSETA, (caddr_t)&bts, td); + break; } case IBCS2_OXCSETA: case IBCS2_OXCSETAW: case IBCS2_OXCSETAF: { struct termios bts; struct ibcs2_termios sts; if ((error = copyin(SCARG(uap, data), (caddr_t)&sts, - sizeof (sts))) != 0) { - return error; - } + sizeof (sts))) != 0) + break; stios2btios (&sts, &bts); - return fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_OXCSETA + TIOCSETA, + error = fo_ioctl(fp, SCARG(uap, cmd) - IBCS2_OXCSETA + TIOCSETA, (caddr_t)&bts, td); + break; } case IBCS2_TCSBRK: DPRINTF(("ibcs2_ioctl(%d): TCSBRK ", p->p_pid)); - return ENOSYS; + error = ENOSYS; + break; case IBCS2_TCXONC: { switch ((int)SCARG(uap, data)) { case 0: case 1: DPRINTF(("ibcs2_ioctl(%d): TCXONC ", p->p_pid)); - return ENOSYS; + error = ENOSYS; + break; case 2: - return fo_ioctl(fp, TIOCSTOP, (caddr_t)0, td); + error = fo_ioctl(fp, TIOCSTOP, (caddr_t)0, td); + break; case 3: - return fo_ioctl(fp, TIOCSTART, (caddr_t)1, td); + error = fo_ioctl(fp, TIOCSTART, (caddr_t)1, td); + break; 
default: - return EINVAL; + error = EINVAL; + break; } + break; } case IBCS2_TCFLSH: { int arg; switch ((int)SCARG(uap, data)) { case 0: arg = FREAD; break; case 1: arg = FWRITE; break; case 2: arg = FREAD | FWRITE; break; default: + fdrop(fp, td); return EINVAL; } - return fo_ioctl(fp, TIOCFLUSH, (caddr_t)&arg, td); + error = fo_ioctl(fp, TIOCFLUSH, (caddr_t)&arg, td); + break; } case IBCS2_TIOCGWINSZ: SCARG(uap, cmd) = TIOCGWINSZ; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_TIOCSWINSZ: SCARG(uap, cmd) = TIOCSWINSZ; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_TIOCGPGRP: + { + pid_t pg_id; + PROC_LOCK(p); - error = copyout((caddr_t)&p->p_pgrp->pg_id, SCARG(uap, data), - sizeof(p->p_pgrp->pg_id)); + pg_id = p->p_pgrp->pg_id; PROC_UNLOCK(p); - return error; + error = copyout((caddr_t)&pg_id, SCARG(uap, data), + sizeof(pg_id)); + break; + } case IBCS2_TIOCSPGRP: /* XXX - is uap->data a pointer to pgid? 
*/ { struct setpgid_args sa; SCARG(&sa, pid) = 0; SCARG(&sa, pgid) = (int)SCARG(uap, data); - if ((error = setpgid(td, &sa)) != 0) - return error; - return 0; + error = setpgid(td, &sa); + break; } case IBCS2_TCGETSC: /* SCO console - get scancode flags */ - return EINTR; /* ENOSYS; */ + error = EINTR; /* ENOSYS; */ + break; case IBCS2_TCSETSC: /* SCO console - set scancode flags */ - return 0; /* ENOSYS; */ + error = 0; /* ENOSYS; */ + break; case IBCS2_JWINSIZE: /* Unix to Jerq I/O control */ { struct ibcs2_jwinsize { char bytex, bytey; short bitx, bity; } ibcs2_jwinsize; PROC_LOCK(p); ibcs2_jwinsize.bytex = 80; /* p->p_session->s_ttyp->t_winsize.ws_col; XXX */ ibcs2_jwinsize.bytey = 25; /* p->p_session->s_ttyp->t_winsize.ws_row; XXX */ ibcs2_jwinsize.bitx = p->p_session->s_ttyp->t_winsize.ws_xpixel; ibcs2_jwinsize.bity = p->p_session->s_ttyp->t_winsize.ws_ypixel; PROC_UNLOCK(p); - return copyout((caddr_t)&ibcs2_jwinsize, SCARG(uap, data), + error = copyout((caddr_t)&ibcs2_jwinsize, SCARG(uap, data), sizeof(ibcs2_jwinsize)); + break; } /* keyboard and display ioctl's -- type 'K' */ case IBCS2_KDGKBMODE: /* get keyboard translation mode */ SCARG(uap, cmd) = KDGKBMODE; /* printf("ioctl KDGKBMODE = %x\n", SCARG(uap, cmd));*/ - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSKBMODE: /* set keyboard translation mode */ SCARG(uap, cmd) = KDSKBMODE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDMKTONE: /* sound tone */ SCARG(uap, cmd) = KDMKTONE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGETMODE: /* get text/graphics mode */ SCARG(uap, cmd) = KDGETMODE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSETMODE: /* set text/graphics mode */ SCARG(uap, cmd) = KDSETMODE; - return ioctl(td, (struct 
ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSBORDER: /* set ega color border */ SCARG(uap, cmd) = KDSBORDER; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGKBSTATE: SCARG(uap, cmd) = KDGKBSTATE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSETRAD: SCARG(uap, cmd) = KDSETRAD; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDENABIO: /* enable direct I/O to ports */ SCARG(uap, cmd) = KDENABIO; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDDISABIO: /* disable direct I/O to ports */ SCARG(uap, cmd) = KDDISABIO; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KIOCSOUND: /* start sound generation */ SCARG(uap, cmd) = KIOCSOUND; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGKBTYPE: /* get keyboard type */ SCARG(uap, cmd) = KDGKBTYPE; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDGETLED: /* get keyboard LED status */ SCARG(uap, cmd) = KDGETLED; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_KDSETLED: /* set keyboard LED status */ SCARG(uap, cmd) = KDSETLED; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; /* Xenix keyboard and display ioctl's from sys/kd.h -- type 'k' */ case IBCS2_GETFKEY: /* Get function key */ SCARG(uap, cmd) = GETFKEY; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_SETFKEY: /* Set function key */ SCARG(uap, cmd) = SETFKEY; - return ioctl(td, (struct ioctl_args 
*)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_GIO_SCRNMAP: /* Get screen output map table */ SCARG(uap, cmd) = GIO_SCRNMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_PIO_SCRNMAP: /* Set screen output map table */ SCARG(uap, cmd) = PIO_SCRNMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_GIO_KEYMAP: /* Get keyboard map table */ SCARG(uap, cmd) = GIO_KEYMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; case IBCS2_PIO_KEYMAP: /* Set keyboard map table */ SCARG(uap, cmd) = PIO_KEYMAP; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; /* socksys */ case IBCS2_SIOCSOCKSYS: - return ibcs2_socksys(td, (struct ibcs2_socksys_args *)uap); + error = ibcs2_socksys(td, (struct ibcs2_socksys_args *)uap); + break; case IBCS2_FIONREAD: case IBCS2_I_NREAD: /* STREAMS */ SCARG(uap, cmd) = FIONREAD; - return ioctl(td, (struct ioctl_args *)uap); + error = ioctl(td, (struct ioctl_args *)uap); + break; default: DPRINTF(("ibcs2_ioctl(%d): unknown cmd 0x%lx ", td->proc->p_pid, SCARG(uap, cmd))); - return ENOSYS; + error = ENOSYS; + break; } - return ENOSYS; + + fdrop(fp, td); + return error; } Index: head/sys/i386/ibcs2/ibcs2_misc.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_misc.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_misc.c (revision 89306) @@ -1,1184 +1,1197 @@ /* * Copyright (c) 1995 Steven Wallace * Copyright (c) 1994, 1995 Scott Bartram * Copyright (c) 1992, 1993 * The Regents of the University of California. All rights reserved. * * This software was developed by the Computer Systems Engineering group * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and * contributed to Berkeley. 
* * All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Lawrence Berkeley Laboratory. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp * * @(#)sun_misc.c 8.1 (Berkeley) 6/18/93 * * $FreeBSD$ */ /* * IBCS2 compatibility module. * * IBCS2 system calls that are implemented differently in BSD are * handled here. */ #include #include #include #include #include #include #include #include #include /* Must come after sys/malloc.h */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int ibcs2_ulimit(td, uap) struct thread *td; struct ibcs2_ulimit_args *uap; { #ifdef notyet int error; struct rlimit rl; struct setrlimit_args { int resource; struct rlimit *rlp; } sra; #endif #define IBCS2_GETFSIZE 1 #define IBCS2_SETFSIZE 2 #define IBCS2_GETPSIZE 3 #define IBCS2_GETDTABLESIZE 4 switch (SCARG(uap, cmd)) { case IBCS2_GETFSIZE: td->td_retval[0] = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur; if (td->td_retval[0] == -1) td->td_retval[0] = 0x7fffffff; return 0; case IBCS2_SETFSIZE: /* XXX - fix this */ #ifdef notyet rl.rlim_cur = SCARG(uap, newlimit); sra.resource = RLIMIT_FSIZE; sra.rlp = &rl; error = setrlimit(td, &sra); if (!error) td->td_retval[0] = td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur; else DPRINTF(("failed ")); return error; #else td->td_retval[0] = SCARG(uap, newlimit); return 0; #endif case IBCS2_GETPSIZE: mtx_assert(&Giant, MA_OWNED); td->td_retval[0] = td->td_proc->p_rlimit[RLIMIT_RSS].rlim_cur; /* XXX */ return 0; case IBCS2_GETDTABLESIZE: uap->cmd = IBCS2_SC_OPEN_MAX; return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap); default: return ENOSYS; } } #define IBCS2_WSTOPPED 0177 #define IBCS2_STOPCODE(sig) ((sig) << 8 | IBCS2_WSTOPPED) int ibcs2_wait(td, uap) struct thread *td; struct ibcs2_wait_args *uap; { int error, status; struct wait_args w4; struct trapframe *tf = td->td_frame; SCARG(&w4, rusage) = NULL; if ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V)) == (PSL_Z|PSL_PF|PSL_N|PSL_V)) { /* waitpid */ SCARG(&w4, 
pid) = SCARG(uap, a1); SCARG(&w4, status) = (int *)SCARG(uap, a2); SCARG(&w4, options) = SCARG(uap, a3); } else { /* wait */ SCARG(&w4, pid) = WAIT_ANY; SCARG(&w4, status) = (int *)SCARG(uap, a1); SCARG(&w4, options) = 0; } if ((error = wait4(td, &w4)) != 0) return error; if (SCARG(&w4, status)) { /* this is real iBCS brain-damage */ error = copyin((caddr_t)SCARG(&w4, status), (caddr_t)&status, sizeof(SCARG(&w4, status))); if(error) return error; /* convert status/signal result */ if(WIFSTOPPED(status)) status = IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(WSTOPSIG(status))]); else if(WIFSIGNALED(status)) status = bsd_to_ibcs2_sig[_SIG_IDX(WTERMSIG(status))]; /* else exit status -- identical */ /* record result/status */ td->td_retval[1] = status; return copyout((caddr_t)&status, (caddr_t)SCARG(&w4, status), sizeof(SCARG(&w4, status))); } return 0; } int ibcs2_execv(td, uap) struct thread *td; struct ibcs2_execv_args *uap; { struct execve_args ea; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&ea, fname) = SCARG(uap, path); SCARG(&ea, argv) = SCARG(uap, argp); SCARG(&ea, envv) = NULL; return execve(td, &ea); } int ibcs2_execve(td, uap) struct thread *td; struct ibcs2_execve_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return execve(td, (struct execve_args *)uap); } int ibcs2_umount(td, uap) struct thread *td; struct ibcs2_umount_args *uap; { struct unmount_args um; SCARG(&um, path) = SCARG(uap, name); SCARG(&um, flags) = 0; return unmount(td, &um); } int ibcs2_mount(td, uap) struct thread *td; struct ibcs2_mount_args *uap; { #ifdef notyet int oflags = SCARG(uap, flags), nflags, error; char fsname[MFSNAMELEN]; if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5)) return (EINVAL); if ((oflags & IBCS2_MS_NEWTYPE) == 0) return (EINVAL); nflags = 0; if (oflags & IBCS2_MS_RDONLY) nflags |= MNT_RDONLY; if (oflags & IBCS2_MS_NOSUID) nflags |= MNT_NOSUID; if (oflags & IBCS2_MS_REMOUNT) nflags |= MNT_UPDATE; 
SCARG(uap, flags) = nflags; if (error = copyinstr((caddr_t)SCARG(uap, type), fsname, sizeof fsname, (u_int *)0)) return (error); if (strcmp(fsname, "4.2") == 0) { SCARG(uap, type) = (caddr_t)STACK_ALLOC(); if (error = copyout("ufs", SCARG(uap, type), sizeof("ufs"))) return (error); } else if (strcmp(fsname, "nfs") == 0) { struct ibcs2_nfs_args sna; struct sockaddr_in sain; struct nfs_args na; struct sockaddr sa; if (error = copyin(SCARG(uap, data), &sna, sizeof sna)) return (error); if (error = copyin(sna.addr, &sain, sizeof sain)) return (error); bcopy(&sain, &sa, sizeof sa); sa.sa_len = sizeof(sain); SCARG(uap, data) = (caddr_t)STACK_ALLOC(); na.addr = (struct sockaddr *)((int)SCARG(uap, data) + sizeof na); na.sotype = SOCK_DGRAM; na.proto = IPPROTO_UDP; na.fh = (nfsv2fh_t *)sna.fh; na.flags = sna.flags; na.wsize = sna.wsize; na.rsize = sna.rsize; na.timeo = sna.timeo; na.retrans = sna.retrans; na.hostname = sna.hostname; if (error = copyout(&sa, na.addr, sizeof sa)) return (error); if (error = copyout(&na, SCARG(uap, data), sizeof na)) return (error); } return (mount(td, uap)); #else return EINVAL; #endif } /* * Read iBCS2-style directory entries. We suck them into kernel space so * that they can be massaged before being copied out to user code. Like * SunOS, we squish out `empty' entries. * * This is quite ugly, but what do you expect from compatibility code? 
*/ int ibcs2_getdents(td, uap) struct thread *td; register struct ibcs2_getdents_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_dirent idb; off_t off; /* true file offset */ int buflen, error, eofflag; u_long *cookies = NULL, *cookiep; int ncookies; #define BSD_DIRENT(cp) ((struct dirent *)(cp)) #define IBCS2_RECLEN(reclen) (reclen + sizeof(u_short)) if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; - if (vp->v_type != VDIR) /* XXX vnode readdir op should do this */ + if (vp->v_type != VDIR) { /* XXX vnode readdir op should do this */ + fdrop(fp, td); return (EINVAL); + } off = fp->f_offset; #define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */ buflen = max(DIRBLKSIZ, SCARG(uap, nbytes)); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. 
*/ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) goto out; inp = buf; outp = SCARG(uap, buf); resid = SCARG(uap, nbytes); if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { len -= BSD_DIRENT(inp)->d_reclen; inp += BSD_DIRENT(inp)->d_reclen; cookiep++; ncookies--; } } for (; len > 0; len -= reclen) { if (cookiep && ncookies == 0) break; reclen = BSD_DIRENT(inp)->d_reclen; if (reclen & 3) { printf("ibcs2_getdents: reclen=%d\n", reclen); error = EFAULT; goto out; } if (BSD_DIRENT(inp)->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; continue; } if (reclen > len || resid < IBCS2_RECLEN(reclen)) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a iBCS2-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). 
*/ idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno; idb.d_off = (ibcs2_off_t)off; idb.d_reclen = (u_short)IBCS2_RECLEN(reclen); if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 || (error = copyout(BSD_DIRENT(inp)->d_name, outp + 10, BSD_DIRENT(inp)->d_namlen + 1)) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += IBCS2_RECLEN(reclen); resid -= IBCS2_RECLEN(reclen); } /* if we squished out the whole block, try again */ if (outp == SCARG(uap, buf)) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = SCARG(uap, nbytes) - resid; out: + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookies) free(cookies, M_TEMP); - VOP_UNLOCK(vp, 0, td); free(buf, M_TEMP); return (error); } int ibcs2_read(td, uap) struct thread *td; struct ibcs2_read_args *uap; { register struct vnode *vp; register caddr_t inp, buf; /* BSD-format */ register int len, reclen; /* BSD-format */ register caddr_t outp; /* iBCS2-format */ register int resid; /* iBCS2-format */ struct file *fp; struct uio auio; struct iovec aiov; struct ibcs2_direct { ibcs2_ino_t ino; char name[14]; } idb; off_t off; /* true file offset */ int buflen, error, eofflag, size; u_long *cookies = NULL, *cookiep; int ncookies; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) { if (error == EINVAL) return read(td, (struct read_args *)uap); else return error; } - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; + if (vp->v_type != VDIR) { + fdrop(fp, td); + return read(td, (struct read_args *)uap); + } + + off = fp->f_offset; if (vp->v_type != VDIR) return read(td, (struct read_args *)uap); DPRINTF(("ibcs2_read: read directory\n")); - off = fp->f_offset; buflen = max(DIRBLKSIZ, SCARG(uap, nbytes)); buflen = min(buflen, MAXBSIZE); buf = malloc(buflen, M_TEMP, 
M_WAITOK); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); again: aiov.iov_base = buf; aiov.iov_len = buflen; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_resid = buflen; auio.uio_offset = off; if (cookies) { free(cookies, M_TEMP); cookies = NULL; } /* * First we read into the malloc'ed buffer, then * we massage it into user space, one record at a time. */ if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) { DPRINTF(("VOP_READDIR failed: %d\n", error)); goto out; } inp = buf; outp = SCARG(uap, buf); resid = SCARG(uap, nbytes); if ((len = buflen - auio.uio_resid) <= 0) goto eof; cookiep = cookies; if (cookies) { /* * When using cookies, the vfs has the option of reading from * a different offset than that supplied (UFS truncates the * offset to a block boundary to make sure that it never reads * partway through a directory entry, even if the directory * has been compacted). */ while (len > 0 && ncookies > 0 && *cookiep <= off) { len -= BSD_DIRENT(inp)->d_reclen; inp += BSD_DIRENT(inp)->d_reclen; cookiep++; ncookies--; } } for (; len > 0 && resid > 0; len -= reclen) { if (cookiep && ncookies == 0) break; reclen = BSD_DIRENT(inp)->d_reclen; if (reclen & 3) { printf("ibcs2_read: reclen=%d\n", reclen); error = EFAULT; goto out; } if (BSD_DIRENT(inp)->d_fileno == 0) { inp += reclen; /* it is a hole; squish it out */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; continue; } if (reclen > len || resid < sizeof(struct ibcs2_direct)) { /* entry too big for buffer, so just stop */ outp++; break; } /* * Massage in place to make a iBCS2-shaped dirent (otherwise * we have to worry about touching user memory outside of * the copyout() call). * * TODO: if length(filename) > 14, then break filename into * multiple entries and set inode = 0xffff except last */ idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ? 
0xfffe : BSD_DIRENT(inp)->d_fileno; (void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size); bzero(idb.name + size, 14 - size); if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0) goto out; /* advance past this real entry */ if (cookiep) { off = *cookiep++; ncookies--; } else off += reclen; inp += reclen; /* advance output past iBCS2-shaped entry */ outp += sizeof(struct ibcs2_direct); resid -= sizeof(struct ibcs2_direct); } /* if we squished out the whole block, try again */ if (outp == SCARG(uap, buf)) goto again; fp->f_offset = off; /* update the vnode offset */ eof: td->td_retval[0] = SCARG(uap, nbytes) - resid; out: + VOP_UNLOCK(vp, 0, td); + fdrop(fp, td); if (cookies) free(cookies, M_TEMP); - VOP_UNLOCK(vp, 0, td); free(buf, M_TEMP); return (error); } int ibcs2_mknod(td, uap) struct thread *td; struct ibcs2_mknod_args *uap; { caddr_t sg = stackgap_init(); CHECKALTCREAT(td, &sg, SCARG(uap, path)); if (S_ISFIFO(SCARG(uap, mode))) { struct mkfifo_args ap; SCARG(&ap, path) = SCARG(uap, path); SCARG(&ap, mode) = SCARG(uap, mode); return mkfifo(td, &ap); } else { struct mknod_args ap; SCARG(&ap, path) = SCARG(uap, path); SCARG(&ap, mode) = SCARG(uap, mode); SCARG(&ap, dev) = SCARG(uap, dev); return mknod(td, &ap); } } int ibcs2_getgroups(td, uap) struct thread *td; struct ibcs2_getgroups_args *uap; { int error, i; ibcs2_gid_t *iset = NULL; struct getgroups_args sa; gid_t *gp; caddr_t sg = stackgap_init(); SCARG(&sa, gidsetsize) = SCARG(uap, gidsetsize); if (SCARG(uap, gidsetsize)) { SCARG(&sa, gidset) = stackgap_alloc(&sg, NGROUPS_MAX * sizeof(gid_t *)); iset = stackgap_alloc(&sg, SCARG(uap, gidsetsize) * sizeof(ibcs2_gid_t)); } if ((error = getgroups(td, &sa)) != 0) return error; if (SCARG(uap, gidsetsize) == 0) return 0; for (i = 0, gp = SCARG(&sa, gidset); i < td->td_retval[0]; i++) iset[i] = (ibcs2_gid_t)*gp++; if (td->td_retval[0] && (error = copyout((caddr_t)iset, (caddr_t)SCARG(uap, gidset), sizeof(ibcs2_gid_t) * td->td_retval[0]))) 
return error; return 0; } int ibcs2_setgroups(td, uap) struct thread *td; struct ibcs2_setgroups_args *uap; { int error, i; ibcs2_gid_t *iset; struct setgroups_args sa; gid_t *gp; caddr_t sg = stackgap_init(); SCARG(&sa, gidsetsize) = SCARG(uap, gidsetsize); SCARG(&sa, gidset) = stackgap_alloc(&sg, SCARG(&sa, gidsetsize) * sizeof(gid_t *)); iset = stackgap_alloc(&sg, SCARG(&sa, gidsetsize) * sizeof(ibcs2_gid_t *)); if (SCARG(&sa, gidsetsize)) { if ((error = copyin((caddr_t)SCARG(uap, gidset), (caddr_t)iset, sizeof(ibcs2_gid_t *) * SCARG(uap, gidsetsize))) != 0) return error; } for (i = 0, gp = SCARG(&sa, gidset); i < SCARG(&sa, gidsetsize); i++) *gp++ = (gid_t)iset[i]; return setgroups(td, &sa); } int ibcs2_setuid(td, uap) struct thread *td; struct ibcs2_setuid_args *uap; { struct setuid_args sa; SCARG(&sa, uid) = (uid_t)SCARG(uap, uid); return setuid(td, &sa); } int ibcs2_setgid(td, uap) struct thread *td; struct ibcs2_setgid_args *uap; { struct setgid_args sa; SCARG(&sa, gid) = (gid_t)SCARG(uap, gid); return setgid(td, &sa); } int ibcs2_time(td, uap) struct thread *td; struct ibcs2_time_args *uap; { struct timeval tv; microtime(&tv); td->td_retval[0] = tv.tv_sec; if (SCARG(uap, tp)) return copyout((caddr_t)&tv.tv_sec, (caddr_t)SCARG(uap, tp), sizeof(ibcs2_time_t)); else return 0; } int ibcs2_pathconf(td, uap) struct thread *td; struct ibcs2_pathconf_args *uap; { SCARG(uap, name)++; /* iBCS2 _PC_* defines are offset by one */ return pathconf(td, (struct pathconf_args *)uap); } int ibcs2_fpathconf(td, uap) struct thread *td; struct ibcs2_fpathconf_args *uap; { SCARG(uap, name)++; /* iBCS2 _PC_* defines are offset by one */ return fpathconf(td, (struct fpathconf_args *)uap); } int ibcs2_sysconf(td, uap) struct thread *td; struct ibcs2_sysconf_args *uap; { int mib[2], value, len, error; struct sysctl_args sa; struct __getrlimit_args ga; switch(SCARG(uap, name)) { case IBCS2_SC_ARG_MAX: mib[1] = KERN_ARGMAX; break; case IBCS2_SC_CHILD_MAX: { caddr_t sg = 
stackgap_init(); SCARG(&ga, which) = RLIMIT_NPROC; SCARG(&ga, rlp) = stackgap_alloc(&sg, sizeof(struct rlimit *)); if ((error = getrlimit(td, &ga)) != 0) return error; td->td_retval[0] = SCARG(&ga, rlp)->rlim_cur; return 0; } case IBCS2_SC_CLK_TCK: td->td_retval[0] = hz; return 0; case IBCS2_SC_NGROUPS_MAX: mib[1] = KERN_NGROUPS; break; case IBCS2_SC_OPEN_MAX: { caddr_t sg = stackgap_init(); SCARG(&ga, which) = RLIMIT_NOFILE; SCARG(&ga, rlp) = stackgap_alloc(&sg, sizeof(struct rlimit *)); if ((error = getrlimit(td, &ga)) != 0) return error; td->td_retval[0] = SCARG(&ga, rlp)->rlim_cur; return 0; } case IBCS2_SC_JOB_CONTROL: mib[1] = KERN_JOB_CONTROL; break; case IBCS2_SC_SAVED_IDS: mib[1] = KERN_SAVED_IDS; break; case IBCS2_SC_VERSION: mib[1] = KERN_POSIX1; break; case IBCS2_SC_PASS_MAX: td->td_retval[0] = 128; /* XXX - should we create PASS_MAX ? */ return 0; case IBCS2_SC_XOPEN_VERSION: td->td_retval[0] = 2; /* XXX: What should that be? */ return 0; default: return EINVAL; } mib[0] = CTL_KERN; len = sizeof(value); SCARG(&sa, name) = mib; SCARG(&sa, namelen) = 2; SCARG(&sa, old) = &value; SCARG(&sa, oldlenp) = &len; SCARG(&sa, new) = NULL; SCARG(&sa, newlen) = 0; if ((error = __sysctl(td, &sa)) != 0) return error; td->td_retval[0] = value; return 0; } int ibcs2_alarm(td, uap) struct thread *td; struct ibcs2_alarm_args *uap; { int error; struct itimerval *itp, *oitp; struct setitimer_args sa; caddr_t sg = stackgap_init(); itp = stackgap_alloc(&sg, sizeof(*itp)); oitp = stackgap_alloc(&sg, sizeof(*oitp)); timevalclear(&itp->it_interval); itp->it_value.tv_sec = SCARG(uap, sec); itp->it_value.tv_usec = 0; SCARG(&sa, which) = ITIMER_REAL; SCARG(&sa, itv) = itp; SCARG(&sa, oitv) = oitp; error = setitimer(td, &sa); if (error) return error; if (oitp->it_value.tv_usec) oitp->it_value.tv_sec++; td->td_retval[0] = oitp->it_value.tv_sec; return 0; } int ibcs2_times(td, uap) struct thread *td; struct ibcs2_times_args *uap; { int error; struct getrusage_args ga; struct tms tms; 
struct timeval t; caddr_t sg = stackgap_init(); struct rusage *ru = stackgap_alloc(&sg, sizeof(*ru)); #define CONVTCK(r) (r.tv_sec * hz + r.tv_usec / (1000000 / hz)) SCARG(&ga, who) = RUSAGE_SELF; SCARG(&ga, rusage) = ru; error = getrusage(td, &ga); if (error) return error; tms.tms_utime = CONVTCK(ru->ru_utime); tms.tms_stime = CONVTCK(ru->ru_stime); SCARG(&ga, who) = RUSAGE_CHILDREN; error = getrusage(td, &ga); if (error) return error; tms.tms_cutime = CONVTCK(ru->ru_utime); tms.tms_cstime = CONVTCK(ru->ru_stime); microtime(&t); td->td_retval[0] = CONVTCK(t); return copyout((caddr_t)&tms, (caddr_t)SCARG(uap, tp), sizeof(struct tms)); } int ibcs2_stime(td, uap) struct thread *td; struct ibcs2_stime_args *uap; { int error; struct settimeofday_args sa; caddr_t sg = stackgap_init(); SCARG(&sa, tv) = stackgap_alloc(&sg, sizeof(*SCARG(&sa, tv))); SCARG(&sa, tzp) = NULL; if ((error = copyin((caddr_t)SCARG(uap, timep), &(SCARG(&sa, tv)->tv_sec), sizeof(long))) != 0) return error; SCARG(&sa, tv)->tv_usec = 0; if ((error = settimeofday(td, &sa)) != 0) return EPERM; return 0; } int ibcs2_utime(td, uap) struct thread *td; struct ibcs2_utime_args *uap; { int error; struct utimes_args sa; struct timeval *tp; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&sa, path) = SCARG(uap, path); if (SCARG(uap, buf)) { struct ibcs2_utimbuf ubuf; if ((error = copyin((caddr_t)SCARG(uap, buf), (caddr_t)&ubuf, sizeof(ubuf))) != 0) return error; SCARG(&sa, tptr) = stackgap_alloc(&sg, 2 * sizeof(struct timeval *)); tp = (struct timeval *)SCARG(&sa, tptr); tp->tv_sec = ubuf.actime; tp->tv_usec = 0; tp++; tp->tv_sec = ubuf.modtime; tp->tv_usec = 0; } else SCARG(&sa, tptr) = NULL; return utimes(td, &sa); } int ibcs2_nice(td, uap) struct thread *td; struct ibcs2_nice_args *uap; { int error; struct setpriority_args sa; SCARG(&sa, which) = PRIO_PROCESS; SCARG(&sa, who) = 0; SCARG(&sa, prio) = td->td_ksegrp->kg_nice + SCARG(uap, incr); if ((error = setpriority(td, &sa)) 
!= 0) return EPERM; td->td_retval[0] = td->td_ksegrp->kg_nice; return 0; } /* * iBCS2 getpgrp, setpgrp, setsid, and setpgid */ int ibcs2_pgrpsys(td, uap) struct thread *td; struct ibcs2_pgrpsys_args *uap; { struct proc *p = td->td_proc; switch (SCARG(uap, type)) { case 0: /* getpgrp */ PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; case 1: /* setpgrp */ { struct setpgid_args sa; SCARG(&sa, pid) = 0; SCARG(&sa, pgid) = 0; setpgid(td, &sa); PROC_LOCK(p); td->td_retval[0] = p->p_pgrp->pg_id; PROC_UNLOCK(p); return 0; } case 2: /* setpgid */ { struct setpgid_args sa; SCARG(&sa, pid) = SCARG(uap, pid); SCARG(&sa, pgid) = SCARG(uap, pgid); return setpgid(td, &sa); } case 3: /* setsid */ return setsid(td, NULL); default: return EINVAL; } } /* * XXX - need to check for nested calls */ int ibcs2_plock(td, uap) struct thread *td; struct ibcs2_plock_args *uap; { int error; #define IBCS2_UNLOCK 0 #define IBCS2_PROCLOCK 1 #define IBCS2_TEXTLOCK 2 #define IBCS2_DATALOCK 4 if ((error = suser_td(td)) != 0) return EPERM; switch(SCARG(uap, cmd)) { case IBCS2_UNLOCK: case IBCS2_PROCLOCK: case IBCS2_TEXTLOCK: case IBCS2_DATALOCK: return 0; /* XXX - TODO */ } return EINVAL; } int ibcs2_uadmin(td, uap) struct thread *td; struct ibcs2_uadmin_args *uap; { #define SCO_A_REBOOT 1 #define SCO_A_SHUTDOWN 2 #define SCO_A_REMOUNT 4 #define SCO_A_CLOCK 8 #define SCO_A_SETCONFIG 128 #define SCO_A_GETDEV 130 #define SCO_AD_HALT 0 #define SCO_AD_BOOT 1 #define SCO_AD_IBOOT 2 #define SCO_AD_PWRDOWN 3 #define SCO_AD_PWRNAP 4 #define SCO_AD_PANICBOOT 1 #define SCO_AD_GETBMAJ 0 #define SCO_AD_GETCMAJ 1 if (suser_td(td)) return EPERM; switch(SCARG(uap, cmd)) { case SCO_A_REBOOT: case SCO_A_SHUTDOWN: switch(SCARG(uap, func)) { struct reboot_args r; case SCO_AD_HALT: case SCO_AD_PWRDOWN: case SCO_AD_PWRNAP: r.opt = RB_HALT; reboot(td, &r); case SCO_AD_BOOT: case SCO_AD_IBOOT: r.opt = RB_AUTOBOOT; reboot(td, &r); } return EINVAL; case SCO_A_REMOUNT: case SCO_A_CLOCK: case 
SCO_A_SETCONFIG: return 0; case SCO_A_GETDEV: return EINVAL; /* XXX - TODO */ } return EINVAL; } int ibcs2_sysfs(td, uap) struct thread *td; struct ibcs2_sysfs_args *uap; { #define IBCS2_GETFSIND 1 #define IBCS2_GETFSTYP 2 #define IBCS2_GETNFSTYP 3 switch(SCARG(uap, cmd)) { case IBCS2_GETFSIND: case IBCS2_GETFSTYP: case IBCS2_GETNFSTYP: break; } return EINVAL; /* XXX - TODO */ } int ibcs2_unlink(td, uap) struct thread *td; struct ibcs2_unlink_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return unlink(td, (struct unlink_args *)uap); } int ibcs2_chdir(td, uap) struct thread *td; struct ibcs2_chdir_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return chdir(td, (struct chdir_args *)uap); } int ibcs2_chmod(td, uap) struct thread *td; struct ibcs2_chmod_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return chmod(td, (struct chmod_args *)uap); } int ibcs2_chown(td, uap) struct thread *td; struct ibcs2_chown_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return chown(td, (struct chown_args *)uap); } int ibcs2_rmdir(td, uap) struct thread *td; struct ibcs2_rmdir_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return rmdir(td, (struct rmdir_args *)uap); } int ibcs2_mkdir(td, uap) struct thread *td; struct ibcs2_mkdir_args *uap; { caddr_t sg = stackgap_init(); CHECKALTCREAT(td, &sg, SCARG(uap, path)); return mkdir(td, (struct mkdir_args *)uap); } int ibcs2_symlink(td, uap) struct thread *td; struct ibcs2_symlink_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); CHECKALTCREAT(td, &sg, SCARG(uap, link)); return symlink(td, (struct symlink_args *)uap); } int ibcs2_rename(td, uap) struct thread *td; struct ibcs2_rename_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, from)); CHECKALTCREAT(td, &sg, SCARG(uap, to)); return 
rename(td, (struct rename_args *)uap); } int ibcs2_readlink(td, uap) struct thread *td; struct ibcs2_readlink_args *uap; { caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); return readlink(td, (struct readlink_args *) uap); } Index: head/sys/i386/ibcs2/ibcs2_stat.c =================================================================== --- head/sys/i386/ibcs2/ibcs2_stat.c (revision 89305) +++ head/sys/i386/ibcs2/ibcs2_stat.c (revision 89306) @@ -1,257 +1,259 @@ /* * Copyright (c) 1995 Scott Bartram * Copyright (c) 1995 Steven Wallace * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void bsd_stat2ibcs_stat __P((struct stat *, struct ibcs2_stat *)); static int cvt_statfs __P((struct statfs *, caddr_t, int)); static void bsd_stat2ibcs_stat(st, st4) struct stat *st; struct ibcs2_stat *st4; { bzero(st4, sizeof(*st4)); st4->st_dev = (ibcs2_dev_t)st->st_dev; st4->st_ino = (ibcs2_ino_t)st->st_ino; st4->st_mode = (ibcs2_mode_t)st->st_mode; st4->st_nlink= (ibcs2_nlink_t)st->st_nlink; st4->st_uid = (ibcs2_uid_t)st->st_uid; st4->st_gid = (ibcs2_gid_t)st->st_gid; st4->st_rdev = (ibcs2_dev_t)st->st_rdev; if (st->st_size < (quad_t)1 << 32) st4->st_size = (ibcs2_off_t)st->st_size; else st4->st_size = -2; st4->st_atim = (ibcs2_time_t)st->st_atime; st4->st_mtim = (ibcs2_time_t)st->st_mtime; st4->st_ctim = (ibcs2_time_t)st->st_ctime; } static int cvt_statfs(sp, buf, len) struct statfs *sp; caddr_t buf; int len; { struct ibcs2_statfs ssfs; bzero(&ssfs, sizeof ssfs); ssfs.f_fstyp = 0; ssfs.f_bsize = sp->f_bsize; ssfs.f_frsize = 0; ssfs.f_blocks = sp->f_blocks; ssfs.f_bfree = sp->f_bfree; ssfs.f_files = sp->f_files; ssfs.f_ffree = sp->f_ffree; ssfs.f_fname[0] = 0; ssfs.f_fpack[0] = 0; return copyout((caddr_t)&ssfs, buf, len); } int ibcs2_statfs(td, uap) struct thread *td; struct ibcs2_statfs_args *uap; { register struct mount *mp; register struct statfs *sp; int error; struct nameidata nd; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; vrele(nd.ni_vp); if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return cvt_statfs(sp, (caddr_t)SCARG(uap, buf), SCARG(uap, len)); } int ibcs2_fstatfs(td, uap) struct thread *td; struct 
ibcs2_fstatfs_args *uap; { struct file *fp; struct mount *mp; register struct statfs *sp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; sp = &mp->mnt_stat; - if ((error = VFS_STATFS(mp, sp, td)) != 0) + error = VFS_STATFS(mp, sp, td); + fdrop(fp, td); + if (error != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; return cvt_statfs(sp, (caddr_t)SCARG(uap, buf), SCARG(uap, len)); } int ibcs2_stat(td, uap) struct thread *td; struct ibcs2_stat_args *uap; { struct stat st; struct ibcs2_stat ibcs2_st; struct stat_args cup; int error; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, ub) = stackgap_alloc(&sg, sizeof(st)); if ((error = stat(td, &cup)) != 0) return error; if ((error = copyin(SCARG(&cup, ub), &st, sizeof(st))) != 0) return error; bsd_stat2ibcs_stat(&st, &ibcs2_st); return copyout((caddr_t)&ibcs2_st, (caddr_t)SCARG(uap, st), ibcs2_stat_len); } int ibcs2_lstat(td, uap) struct thread *td; struct ibcs2_lstat_args *uap; { struct stat st; struct ibcs2_stat ibcs2_st; struct lstat_args cup; int error; caddr_t sg = stackgap_init(); CHECKALTEXIST(td, &sg, SCARG(uap, path)); SCARG(&cup, path) = SCARG(uap, path); SCARG(&cup, ub) = stackgap_alloc(&sg, sizeof(st)); if ((error = lstat(td, &cup)) != 0) return error; if ((error = copyin(SCARG(&cup, ub), &st, sizeof(st))) != 0) return error; bsd_stat2ibcs_stat(&st, &ibcs2_st); return copyout((caddr_t)&ibcs2_st, (caddr_t)SCARG(uap, st), ibcs2_stat_len); } int ibcs2_fstat(td, uap) struct thread *td; struct ibcs2_fstat_args *uap; { struct stat st; struct ibcs2_stat ibcs2_st; struct fstat_args cup; int error; caddr_t sg = stackgap_init(); SCARG(&cup, fd) = SCARG(uap, fd); SCARG(&cup, sb) = stackgap_alloc(&sg, sizeof(st)); if ((error = fstat(td, &cup)) != 0) return error; if ((error = copyin(SCARG(&cup, sb), &st, sizeof(st))) != 0) return error; 
bsd_stat2ibcs_stat(&st, &ibcs2_st); return copyout((caddr_t)&ibcs2_st, (caddr_t)SCARG(uap, st), ibcs2_stat_len); } int ibcs2_utssys(td, uap) struct thread *td; struct ibcs2_utssys_args *uap; { switch (SCARG(uap, flag)) { case 0: /* uname(2) */ { char machine_name[9], *p; struct ibcs2_utsname sut; bzero(&sut, ibcs2_utsname_len); strncpy(sut.sysname, IBCS2_UNAME_SYSNAME, sizeof(sut.sysname) - 1); strncpy(sut.release, IBCS2_UNAME_RELEASE, sizeof(sut.release) - 1); strncpy(sut.version, IBCS2_UNAME_VERSION, sizeof(sut.version) - 1); strncpy(machine_name, hostname, sizeof(machine_name) - 1); machine_name[sizeof(machine_name) - 1] = 0; p = index(machine_name, '.'); if ( p ) *p = '\0'; strncpy(sut.nodename, machine_name, sizeof(sut.nodename) - 1); strncpy(sut.machine, machine, sizeof(sut.machine) - 1); DPRINTF(("IBCS2 uname: sys=%s rel=%s ver=%s node=%s mach=%s\n", sut.sysname, sut.release, sut.version, sut.nodename, sut.machine)); return copyout((caddr_t)&sut, (caddr_t)SCARG(uap, a1), ibcs2_utsname_len); } case 2: /* ustat(2) */ { return ENOSYS; /* XXX - TODO */ } default: return ENOSYS; } } Index: head/sys/kern/init_main.c =================================================================== --- head/sys/kern/init_main.c (revision 89305) +++ head/sys/kern/init_main.c (revision 89306) @@ -1,649 +1,652 @@ /* * Copyright (c) 1995 Terrence R. Lambert * All rights reserved. * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)init_main.c 8.9 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_init_path.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void mi_startup(void); /* Should be elsewhere */ /* Components of the first process -- never freed. 
*/ static struct session session0; static struct pgrp pgrp0; struct proc proc0; struct thread *thread0; static struct procsig procsig0; static struct filedesc0 filedesc0; static struct plimit limit0; static struct vmspace vmspace0; struct proc *initproc; int cmask = CMASK; extern int fallback_elf_brand; struct vnode *rootvp; int boothowto = 0; /* initialized so that it can be patched */ SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0, ""); int bootverbose; SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0, ""); /* * This ensures that there is at least one entry so that the sysinit_set * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never * executed. */ SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL) /* * The sysinit table itself. Items are checked off as the are run. * If we want to register new sysinit types, add them to newsysinit. */ SET_DECLARE(sysinit_set, struct sysinit); struct sysinit **sysinit, **sysinit_end; struct sysinit **newsysinit, **newsysinit_end; /* * Merge a new sysinit set into the current set, reallocating it if * necessary. This can only be called after malloc is running. 
*/ void sysinit_add(struct sysinit **set, struct sysinit **set_end) { struct sysinit **newset; struct sysinit **sipp; struct sysinit **xipp; int count; count = set_end - set; if (newsysinit) count += newsysinit_end - newsysinit; else count += sysinit_end - sysinit; newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); if (newset == NULL) panic("cannot malloc for sysinit"); xipp = newset; if (newsysinit) for (sipp = newsysinit; sipp < newsysinit_end; sipp++) *xipp++ = *sipp; else for (sipp = sysinit; sipp < sysinit_end; sipp++) *xipp++ = *sipp; for (sipp = set; sipp < set_end; sipp++) *xipp++ = *sipp; if (newsysinit) free(newsysinit, M_TEMP); newsysinit = newset; newsysinit_end = newset + count; } /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. * * This allows simple addition of new kernel subsystems that require * boot time initialization. It also allows substitution of subsystem * (for instance, a scheduler, kernel profiler, or VM system) by object * module. Finally, it allows for optional "kernel threads". */ void mi_startup(void) { register struct sysinit **sipp; /* system initialization*/ register struct sysinit **xipp; /* interior loop of sort*/ register struct sysinit *save; /* bubble*/ if (sysinit == NULL) { sysinit = SET_BEGIN(sysinit_set); sysinit_end = SET_LIMIT(sysinit_set); } restart: /* * Perform a bubble sort of the system initialization objects by * their subsystem (primary key) and order (secondary key). 
*/ for (sipp = sysinit; sipp < sysinit_end; sipp++) { for (xipp = sipp + 1; xipp < sysinit_end; xipp++) { if ((*sipp)->subsystem < (*xipp)->subsystem || ((*sipp)->subsystem == (*xipp)->subsystem && (*sipp)->order <= (*xipp)->order)) continue; /* skip*/ save = *sipp; *sipp = *xipp; *xipp = save; } } /* * Traverse the (now) ordered list of system initialization tasks. * Perform each task, and continue on to the next task. * * The last item on the list is expected to be the scheduler, * which will not return. */ for (sipp = sysinit; sipp < sysinit_end; sipp++) { if ((*sipp)->subsystem == SI_SUB_DUMMY) continue; /* skip dummy task(s)*/ if ((*sipp)->subsystem == SI_SUB_DONE) continue; /* Call function */ (*((*sipp)->func))((*sipp)->udata); /* Check off the one we're just done */ (*sipp)->subsystem = SI_SUB_DONE; /* Check if we've installed more sysinit items via KLD */ if (newsysinit != NULL) { if (sysinit != SET_BEGIN(sysinit_set)) free(sysinit, M_TEMP); sysinit = newsysinit; sysinit_end = newsysinit_end; newsysinit = NULL; newsysinit_end = NULL; goto restart; } } panic("Shouldn't get here!"); /* NOTREACHED*/ } /* *************************************************************************** **** **** The following SYSINIT's belong elsewhere, but have not yet **** been moved. **** *************************************************************************** */ static void print_caddr_t(void *data __unused) { printf("%s", (char *)data); } SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version) static void set_boot_verbose(void *data __unused) { if (boothowto & RB_VERBOSE) bootverbose++; } SYSINIT(boot_verbose, SI_SUB_TUNABLES, SI_ORDER_ANY, set_boot_verbose, NULL) /* *************************************************************************** **** **** The two following SYSINT's are proc0 specific glue code. 
I am not **** convinced that they can not be safely combined, but their order of **** operation has been maintained as the same as the original init_main.c **** for right now. **** **** These probably belong in init_proc.c or kern_proc.c, since they **** deal with proc0 (the fork template process). **** *************************************************************************** */ /* ARGSUSED*/ static void proc0_init(void *dummy __unused) { register struct proc *p; register struct filedesc0 *fdp; register unsigned i; struct thread *td; GIANT_REQUIRED; /* * This assumes the proc0 struct has already been linked * using proc_linkup() in the machine specific initialisation * e.g. i386_init() */ p = &proc0; td = thread0; /* * Initialize magic number. */ p->p_magic = P_MAGIC; /* * Initialize process and pgrp structures. */ procinit(); /* * Initialize sleep queue hash table */ sleepinit(); /* * additional VM structures */ vm_init2(); /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); LIST_INSERT_HEAD(PIDHASH(0), p, p_hash); p->p_pgrp = &pgrp0; LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); LIST_INIT(&pgrp0.pg_members); LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); pgrp0.pg_session = &session0; session0.s_count = 1; session0.s_leader = p; #ifdef __ELF__ p->p_sysent = &elf_freebsd_sysvec; #else p->p_sysent = &aout_sysvec; #endif p->p_flag = P_SYSTEM; p->p_sflag = PS_INMEM; p->p_stat = SRUN; p->p_ksegrp.kg_nice = NZERO; p->p_ksegrp.kg_pri.pri_class = PRI_TIMESHARE; p->p_ksegrp.kg_pri.pri_level = PVM; p->p_ksegrp.kg_pri.pri_native = PUSER; p->p_ksegrp.kg_pri.pri_user = PUSER; p->p_peers = 0; p->p_leader = p; bcopy("swapper", p->p_comm, sizeof ("swapper")); callout_init(&p->p_itcallout, 0); callout_init(&td->td_slpcallout, 1); /* Create credentials. */ p->p_ucred = crget(); p->p_ucred->cr_ngroups = 1; /* group 0 */ p->p_ucred->cr_uidinfo = uifind(0); p->p_ucred->cr_ruidinfo = uifind(0); p->p_ucred->cr_prison = NULL; /* Don't jail it. 
*/ td->td_ucred = crhold(p->p_ucred); /* Create procsig. */ p->p_procsig = &procsig0; p->p_procsig->ps_refcnt = 1; /* Initialize signal state for process 0. */ siginit(&proc0); /* Create the file descriptor table. */ fdp = &filedesc0; p->p_fd = &fdp->fd_fd; + mtx_init(&fdp->fd_fd.fd_mtx, "struct filedesc", MTX_DEF); fdp->fd_fd.fd_refcnt = 1; fdp->fd_fd.fd_cmask = cmask; fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; fdp->fd_fd.fd_nfiles = NDFILE; /* Create the limits structures. */ p->p_limit = &limit0; for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; limit0.p_cpulimit = RLIM_INFINITY; limit0.p_refcnt = 1; /* Allocate a prototype map so we have something to fork. */ pmap_pinit0(vmspace_pmap(&vmspace0)); p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), trunc_page(VM_MAXUSER_ADDRESS)); vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0); /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. */ p->p_stats = &p->p_uarea->u_stats; p->p_sigacts = &p->p_uarea->u_sigacts; /* * Charge root for one process. */ (void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0); } SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) /* ARGSUSED*/ static void proc0_post(void *dummy __unused) { struct timespec ts; struct proc *p; /* * Now we can look at the time, having had a chance to verify the * time from the file system. Pretend that proc0 started now. 
*/ sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { microtime(&p->p_stats->p_start); p->p_runtime = 0; } sx_sunlock(&allproc_lock); microuptime(PCPU_PTR(switchtime)); PCPU_SET(switchticks, ticks); /* * Give the ``random'' number generator a thump. */ nanotime(&ts); srandom(ts.tv_sec ^ ts.tv_nsec); } SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) /* *************************************************************************** **** **** The following SYSINIT's and glue code should be moved to the **** respective files on a per subsystem basis. **** *************************************************************************** */ /* *************************************************************************** **** **** The following code probably belongs in another file, like **** kern/init_init.c. **** *************************************************************************** */ /* * List of paths to try when searching for "init". */ static char init_path[MAXPATHLEN] = #ifdef INIT_PATH __XSTRING(INIT_PATH); #else "/sbin/init:/sbin/oinit:/sbin/init.bak:/stand/sysinstall"; #endif SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0, "Path used to search the init process"); /* * Start the initial user process; try exec'ing each pathname in init_path. * The program is invoked with one argument containing the boot flags. */ static void start_init(void *dummy) { vm_offset_t addr; struct execve_args args; int options, error; char *var, *path, *next, *s; char *ucp, **uap, *arg0, *arg1; struct thread *td; struct proc *p; int init_does_devfs = 0; mtx_lock(&Giant); GIANT_REQUIRED; td = curthread; p = td->td_proc; /* Get the vnode for '/'. Set p->p_fd->fd_cdir to reference it. 
*/ if (VFS_ROOT(TAILQ_FIRST(&mountlist), &rootvnode)) panic("cannot find root vnode"); + FILEDESC_LOCK(p->p_fd); p->p_fd->fd_cdir = rootvnode; VREF(p->p_fd->fd_cdir); p->p_fd->fd_rdir = rootvnode; VREF(p->p_fd->fd_rdir); + FILEDESC_UNLOCK(p->p_fd); VOP_UNLOCK(rootvnode, 0, td); if (devfs_present) { /* * For disk based systems, we probably cannot do this yet * since the fs will be read-only. But a NFS root * might be ok. It is worth a shot. */ error = vn_mkdir("/dev", 0700, UIO_SYSSPACE, td); if (error == EEXIST) error = 0; if (error == 0) error = vfs_mount(td, "devfs", "/dev", 0, 0); if (error != 0) init_does_devfs = 1; } /* * Need just enough stack to hold the faked-up "execve()" arguments. */ addr = trunc_page(USRSTACK - PAGE_SIZE); if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; p->p_vmspace->vm_ssize = 1; if ((var = getenv("init_path")) != NULL) { strncpy(init_path, var, sizeof init_path); init_path[sizeof init_path - 1] = 0; } if ((var = getenv("kern.fallback_elf_brand")) != NULL) fallback_elf_brand = strtol(var, NULL, 0); for (path = init_path; *path != '\0'; path = next) { while (*path == ':') path++; if (*path == '\0') break; for (next = path; *next != '\0' && *next != ':'; next++) /* nothing */ ; if (bootverbose) printf("start_init: trying %.*s\n", (int)(next - path), path); /* * Move out the boot flag argument. 
*/ options = 0; ucp = (char *)USRSTACK; (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { (void)subyte(--ucp, 'f'); options = 1; } #endif #ifdef BOOTCDROM (void)subyte(--ucp, 'C'); options = 1; #endif if (init_does_devfs) { (void)subyte(--ucp, 'd'); options = 1; } if (options == 0) (void)subyte(--ucp, '-'); (void)subyte(--ucp, '-'); /* leading hyphen */ arg1 = ucp; /* * Move out the file name (also arg 0). */ (void)subyte(--ucp, 0); for (s = next - 1; s >= path; s--) (void)subyte(--ucp, *s); arg0 = ucp; /* * Move out the arg pointers. */ uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); (void)suword((caddr_t)--uap, (long)0); /* terminator */ (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); /* * Point at the arguments. */ args.fname = arg0; args.argv = uap; args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. * * Otherwise, return via fork_trampoline() all the way * to user mode as init! */ if ((error = execve(td, &args)) == 0) { mtx_unlock(&Giant); return; } if (error != ENOENT) printf("exec %.*s: error %d\n", (int)(next - path), path, error); } printf("init: not found in path %s\n", init_path); panic("no init"); } /* * Like kthread_create(), but runs in it's own address space. * We do this early to reserve pid 1. * * Note special case - do not make it runnable yet. Other work * in progress will change this more. 
*/ static void create_init(const void *udata __unused) { int error; error = fork1(thread0, RFFDG | RFPROC | RFSTOPPED, &initproc); if (error) panic("cannot fork init: %d\n", error); PROC_LOCK(initproc); initproc->p_flag |= P_SYSTEM; PROC_UNLOCK(initproc); mtx_lock_spin(&sched_lock); initproc->p_sflag |= PS_INMEM; mtx_unlock_spin(&sched_lock); cpu_set_fork_handler(&initproc->p_thread, start_init, NULL); } SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL) /* * Make it runnable now. */ static void kick_init(const void *udata __unused) { mtx_lock_spin(&sched_lock); initproc->p_stat = SRUN; setrunqueue(&initproc->p_thread); /* XXXKSE */ mtx_unlock_spin(&sched_lock); } SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL) Index: head/sys/kern/kern_acl.c =================================================================== --- head/sys/kern/kern_acl.c (revision 89305) +++ head/sys/kern/kern_acl.c (revision 89306) @@ -1,817 +1,821 @@ /*- * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include "opt_cap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. 
*/ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC entry that matches but has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK * and ACL_OTHER entries, preventing some future iterations. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * An ACL_OTHER entry should always exist in a valid access * ACL. If it doesn't, then generate a serious failure. 
For now, * this means a debugging message and EPERM, but in the future * should probably be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. */ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * Iterate through user ACL entries. Do checks twice, first * without privilege, and then if a match is found but failed, * a second time with privilege. */ /* * Check ACL_USER ACL entries. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we should try again with any * available privilege, or if we should move on to ACL_OTHER. 
*/ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); default: } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. 
*/ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, type, &inkernacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETACL(vp, type, &inkernelacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_proc->p_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. 
*/ /* * Given a file path, get an ACL for it * * MPSAFE */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, set an ACL for it * * MPSAFE */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, get an ACL for it * * MPSAFE */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_get_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, set an ACL for it * * MPSAFE */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_set_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. 
* * MPSAFE */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. * * MPSAFE */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_delete(td, (struct vnode *)fp->f_data, SCARG(uap, type)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, check an ACL for it * * MPSAFE */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, check an ACL for it * * MPSAFE */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_aclcheck(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } Index: head/sys/kern/kern_descrip.c =================================================================== --- head/sys/kern/kern_descrip.c (revision 89305) +++ head/sys/kern/kern_descrip.c (revision 89306) @@ -1,1844 +1,2087 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 * $FreeBSD$ */ #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); MALLOC_DEFINE(M_FILE, "file", "Open file structure"); static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); static d_open_t fdopen; #define NUMFDESC 64 #define CDEV_MAJOR 22 static struct cdevsw fildesc_cdevsw = { /* open */ fdopen, /* close */ noclose, /* read */ noread, /* write */ nowrite, /* ioctl */ noioctl, /* poll */ nopoll, /* mmap */ nommap, /* strategy */ nostrategy, /* name */ "FD", /* maj */ CDEV_MAJOR, /* dump */ nodump, /* psize */ nopsize, /* flags */ 0, }; static int do_dup __P((struct filedesc *fdp, int old, int new, register_t *retval, struct thread *td)); static int badfo_readwrite __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int badfo_ioctl __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); static int badfo_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static int badfo_kqfilter __P((struct file *fp, struct knote *kn)); static int badfo_stat __P((struct file *fp, struct stat *sb, struct thread *td)); static int badfo_close __P((struct file *fp, struct thread *td)); /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ int nfiles; /* actual number of open files */ extern int cmask; +struct sx filelist_lock; /* sx to protect filelist */ /* * System calls on descriptors. 
*/ #ifndef _SYS_SYSPROTO_H_ struct getdtablesize_args { int dummy; }; #endif /* * MPSAFE */ /* ARGSUSED */ int getdtablesize(td, uap) struct thread *td; struct getdtablesize_args *uap; { struct proc *p = td->td_proc; mtx_lock(&Giant); td->td_retval[0] = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); mtx_unlock(&Giant); return (0); } /* * Duplicate a file descriptor to a particular value. * * note: keep in mind that a potential race condition exists when closing * descriptors from a shared descriptor table (via rfork). */ #ifndef _SYS_SYSPROTO_H_ struct dup2_args { u_int from; u_int to; }; #endif /* * MPSAFE */ /* ARGSUSED */ int dup2(td, uap) struct thread *td; struct dup2_args *uap; { struct proc *p = td->td_proc; register struct filedesc *fdp = td->td_proc->p_fd; register u_int old = uap->from, new = uap->to; int i, error; mtx_lock(&Giant); + FILEDESC_LOCK(fdp); retry: if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || new >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } if (old == new) { td->td_retval[0] = new; + FILEDESC_UNLOCK(fdp); error = 0; goto done2; } if (new >= fdp->fd_nfiles) { - if ((error = fdalloc(td, new, &i))) + if ((error = fdalloc(td, new, &i))) { + FILEDESC_UNLOCK(fdp); goto done2; + } if (new != i) panic("dup2: fdalloc"); /* * fdalloc() may block, retest everything. */ goto retry; } error = do_dup(fdp, (int)old, (int)new, td->td_retval, td); done2: mtx_unlock(&Giant); return(error); } /* * Duplicate a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct dup_args { u_int fd; }; #endif /* * MPSAFE */ /* ARGSUSED */ int dup(td, uap) struct thread *td; struct dup_args *uap; { register struct filedesc *fdp; u_int old; int new, error; mtx_lock(&Giant); old = uap->fd; fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } - if ((error = fdalloc(td, 0, &new))) + if ((error = fdalloc(td, 0, &new))) { + FILEDESC_UNLOCK(fdp); goto done2; + } error = do_dup(fdp, (int)old, new, td->td_retval, td); done2: mtx_unlock(&Giant); return (error); } /* * The file control system call. */ #ifndef _SYS_SYSPROTO_H_ struct fcntl_args { int fd; int cmd; long arg; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fcntl(td, uap) struct thread *td; register struct fcntl_args *uap; { register struct proc *p = td->td_proc; register struct filedesc *fdp; register struct file *fp; register char *pop; struct vnode *vp; int i, tmp, error = 0, flg = F_POSIX; struct flock fl; u_int newmin; + struct proc *leaderp; mtx_lock(&Giant); fdp = p->p_fd; + FILEDESC_LOCK(fdp); if ((unsigned)uap->fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[uap->fd]) == NULL) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } pop = &fdp->fd_ofileflags[uap->fd]; switch (uap->cmd) { case F_DUPFD: newmin = uap->arg; if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || newmin >= maxfilesperproc) { + FILEDESC_UNLOCK(fdp); error = EINVAL; break; } - if ((error = fdalloc(td, newmin, &i))) + if ((error = fdalloc(td, newmin, &i))) { + FILEDESC_UNLOCK(fdp); break; + } error = do_dup(fdp, uap->fd, i, td->td_retval, td); break; case F_GETFD: td->td_retval[0] = *pop & 1; + FILEDESC_UNLOCK(fdp); break; case F_SETFD: *pop = (*pop &~ 1) | (uap->arg & 1); + FILEDESC_UNLOCK(fdp); break; case F_GETFL: + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); td->td_retval[0] = OFLAGS(fp->f_flag); + FILE_UNLOCK(fp); break; case F_SETFL: fhold(fp); + FILEDESC_UNLOCK(fdp); 
fp->f_flag &= ~FCNTLFLAGS; fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS; tmp = fp->f_flag & FNONBLOCK; error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); if (error) { fdrop(fp, td); break; } tmp = fp->f_flag & FASYNC; error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); if (!error) { fdrop(fp, td); break; } fp->f_flag &= ~FNONBLOCK; tmp = 0; (void)fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); fdrop(fp, td); break; case F_GETOWN: fhold(fp); + FILEDESC_UNLOCK(fdp); error = fo_ioctl(fp, FIOGETOWN, (caddr_t)td->td_retval, td); fdrop(fp, td); break; case F_SETOWN: fhold(fp); + FILEDESC_UNLOCK(fdp); error = fo_ioctl(fp, FIOSETOWN, (caddr_t)&uap->arg, td); fdrop(fp, td); break; case F_SETLKW: flg |= F_WAIT; /* Fall into F_SETLK */ case F_SETLK: if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); error = EBADF; break; } vp = (struct vnode *)fp->f_data; - /* * copyin/lockop may block */ fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, sizeof(fl)); if (error) { fdrop(fp, td); break; } if (fl.l_whence == SEEK_CUR) { if (fp->f_offset < 0 || (fl.l_start > 0 && fp->f_offset > OFF_MAX - fl.l_start)) { fdrop(fp, td); error = EOVERFLOW; break; } fl.l_start += fp->f_offset; } switch (fl.l_type) { case F_RDLCK: if ((fp->f_flag & FREAD) == 0) { error = EBADF; break; } + PROC_LOCK(p); p->p_flag |= P_ADVLOCK; - error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, &fl, flg); break; case F_WRLCK: if ((fp->f_flag & FWRITE) == 0) { error = EBADF; break; } + PROC_LOCK(p); p->p_flag |= P_ADVLOCK; - error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK, + leaderp = p->p_leader; + PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_SETLK, &fl, flg); break; case F_UNLCK: - error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK, + PROC_LOCK(p); + leaderp = p->p_leader; 
+ PROC_UNLOCK(p); + error = VOP_ADVLOCK(vp, (caddr_t)leaderp, F_UNLCK, &fl, F_POSIX); break; default: error = EINVAL; break; } fdrop(fp, td); break; case F_GETLK: if (fp->f_type != DTYPE_VNODE) { + FILEDESC_UNLOCK(fdp); error = EBADF; break; } vp = (struct vnode *)fp->f_data; /* * copyin/lockop may block */ fhold(fp); + FILEDESC_UNLOCK(fdp); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, sizeof(fl)); if (error) { fdrop(fp, td); break; } if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && fl.l_type != F_UNLCK) { fdrop(fp, td); error = EINVAL; break; } if (fl.l_whence == SEEK_CUR) { if ((fl.l_start > 0 && fp->f_offset > OFF_MAX - fl.l_start) || (fl.l_start < 0 && fp->f_offset < OFF_MIN - fl.l_start)) { fdrop(fp, td); error = EOVERFLOW; break; } fl.l_start += fp->f_offset; } error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, &fl, F_POSIX); fdrop(fp, td); if (error == 0) { error = copyout((caddr_t)&fl, (caddr_t)(intptr_t)uap->arg, sizeof(fl)); } break; default: + FILEDESC_UNLOCK(fdp); error = EINVAL; break; } done2: mtx_unlock(&Giant); return (error); } /* * Common code for dup, dup2, and fcntl(F_DUPFD). + * filedesc must be locked, but will be unlocked as a side effect. */ static int do_dup(fdp, old, new, retval, td) register struct filedesc *fdp; register int old, new; register_t *retval; struct thread *td; { struct file *fp; struct file *delfp; + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + /* * Save info on the descriptor being overwritten. We have * to do the unmap now, but we cannot close it without * introducing an ownership race for the slot. 
*/ delfp = fdp->fd_ofiles[new]; #if 0 if (delfp && (fdp->fd_ofileflags[new] & UF_MAPPED)) (void) munmapfd(td, new); #endif /* * Duplicate the source descriptor, update lastfile */ fp = fdp->fd_ofiles[old]; fdp->fd_ofiles[new] = fp; fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; fhold(fp); if (new > fdp->fd_lastfile) fdp->fd_lastfile = new; *retval = new; + FILEDESC_UNLOCK(fdp); + /* * If we dup'd over a valid file, we now own the reference to it * and must dispose of it using closef() semantics (as if a * close() were performed on it). */ if (delfp) (void) closef(delfp, td); return (0); } /* * If sigio is on the list associated with a process or process group, * disable signalling from the device, remove sigio from the list and * free sigio. */ void funsetown(sigio) struct sigio *sigio; { int s; if (sigio == NULL) return; s = splhigh(); *(sigio->sio_myref) = NULL; splx(s); if (sigio->sio_pgid < 0) { SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, sigio, sio_pgsigio); } else /* if ((*sigiop)->sio_pgid > 0) */ { SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, sigio, sio_pgsigio); } crfree(sigio->sio_ucred); FREE(sigio, M_SIGIO); } /* Free a list of sigio structures. */ void funsetownlst(sigiolst) struct sigiolst *sigiolst; { struct sigio *sigio; while ((sigio = SLIST_FIRST(sigiolst)) != NULL) funsetown(sigio); } /* * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). * * After permission checking, add a sigio structure to the sigio list for * the process or process group. */ int fsetown(pgid, sigiop) pid_t pgid; struct sigio **sigiop; { struct proc *proc; struct pgrp *pgrp; struct sigio *sigio; int s; if (pgid == 0) { funsetown(*sigiop); return (0); } if (pgid > 0) { proc = pfind(pgid); if (proc == NULL) return (ESRCH); /* * Policy - Don't allow a process to FSETOWN a process * in another session. 
* * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (proc->p_session != curthread->td_proc->p_session) { PROC_UNLOCK(proc); return (EPERM); } PROC_UNLOCK(proc); pgrp = NULL; } else /* if (pgid < 0) */ { pgrp = pgfind(-pgid); if (pgrp == NULL) return (ESRCH); /* * Policy - Don't allow a process to FSETOWN a process * in another session. * * Remove this test to allow maximum flexibility or * restrict FSETOWN to the current process or process * group for maximum safety. */ if (pgrp->pg_session != curthread->td_proc->p_session) return (EPERM); proc = NULL; } funsetown(*sigiop); MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK); if (pgid > 0) { SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); sigio->sio_proc = proc; } else { SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); sigio->sio_pgrp = pgrp; } sigio->sio_pgid = pgid; sigio->sio_ucred = crhold(curthread->td_proc->p_ucred); sigio->sio_myref = sigiop; s = splhigh(); *sigiop = sigio; splx(s); return (0); } /* * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). */ pid_t fgetown(sigio) struct sigio *sigio; { return (sigio != NULL ? sigio->sio_pgid : 0); } /* * Close a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct close_args { int fd; }; #endif /* * MPSAFE */ /* ARGSUSED */ int close(td, uap) struct thread *td; struct close_args *uap; { register struct filedesc *fdp; register struct file *fp; register int fd = uap->fd; int error = 0; mtx_lock(&Giant); fdp = td->td_proc->p_fd; + FILEDESC_LOCK(fdp); if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) { + FILEDESC_UNLOCK(fdp); error = EBADF; goto done2; } #if 0 if (fdp->fd_ofileflags[fd] & UF_MAPPED) (void) munmapfd(td, fd); #endif fdp->fd_ofiles[fd] = NULL; fdp->fd_ofileflags[fd] = 0; /* * we now hold the fp reference that used to be owned by the descriptor * array. 
*/ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) fdp->fd_lastfile--; if (fd < fdp->fd_freefile) fdp->fd_freefile = fd; - if (fd < fdp->fd_knlistsize) + if (fd < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); knote_fdclose(td, fd); + } else + FILEDESC_UNLOCK(fdp); + error = closef(fp, td); done2: mtx_unlock(&Giant); return(error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct ofstat_args { int fd; struct ostat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int ofstat(td, uap) struct thread *td; register struct ofstat_args *uap; { struct file *fp; struct stat ub; struct ostat oub; int error; mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; error = fo_stat(fp, &ub, td); if (error == 0) { cvtstat(&ub, &oub); error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); } fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Return status information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fstat_args { int fd; struct stat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fstat(td, uap) struct thread *td; struct fstat_args *uap; { struct file *fp; struct stat ub; int error; mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; error = fo_stat(fp, &ub, td); if (error == 0) error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } /* * Return status information about a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct nfstat_args { int fd; struct nstat *sb; }; #endif /* * MPSAFE */ /* ARGSUSED */ int nfstat(td, uap) struct thread *td; register struct nfstat_args *uap; { struct file *fp; struct stat ub; struct nstat nub; int error; - mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; error = fo_stat(fp, &ub, td); if (error == 0) { cvtnstat(&ub, &nub); error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub)); } fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } /* * Return pathconf information about a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fpathconf_args { int fd; int name; }; #endif /* * MPSAFE */ /* ARGSUSED */ int fpathconf(td, uap) struct thread *td; register struct fpathconf_args *uap; { struct file *fp; struct vnode *vp; int error; + fp = ffind_hold(td, uap->fd); + if (fp == NULL) + return (EBADF); mtx_lock(&Giant); if ((error = fget(td, uap->fd, &fp)) != 0) goto done2; switch (fp->f_type) { case DTYPE_PIPE: case DTYPE_SOCKET: if (uap->name != _PC_PIPE_BUF) { + fdrop(fp, td); error = EINVAL; goto done2; } td->td_retval[0] = PIPE_BUF; error = 0; break; case DTYPE_FIFO: case DTYPE_VNODE: vp = (struct vnode *)fp->f_data; error = VOP_PATHCONF(vp, uap->name, td->td_retval); break; default: error = EOPNOTSUPP; break; } fdrop(fp, td); done2: mtx_unlock(&Giant); return(error); } /* * Allocate a file descriptor for the process. */ static int fdexpand; SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); int fdalloc(td, want, result) struct thread *td; int want; int *result; { struct proc *p = td->td_proc; register struct filedesc *fdp = td->td_proc->p_fd; register int i; int lim, last, nfiles; - struct file **newofile; + struct file **newofile, **oldofile; char *newofileflags; + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + /* * Search for a free descriptor starting at the higher * of want or fd_freefile. If that fails, consider * expanding the ofile array. 
*/ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); for (;;) { last = min(fdp->fd_nfiles, lim); if ((i = want) < fdp->fd_freefile) i = fdp->fd_freefile; for (; i < last; i++) { if (fdp->fd_ofiles[i] == NULL) { fdp->fd_ofileflags[i] = 0; if (i > fdp->fd_lastfile) fdp->fd_lastfile = i; if (want <= fdp->fd_freefile) fdp->fd_freefile = i; *result = i; return (0); } } /* * No space in current array. Expand? */ if (fdp->fd_nfiles >= lim) return (EMFILE); if (fdp->fd_nfiles < NDEXTENT) nfiles = NDEXTENT; else nfiles = 2 * fdp->fd_nfiles; + FILEDESC_UNLOCK(fdp); MALLOC(newofile, struct file **, nfiles * OFILESIZE, M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); /* * deal with file-table extend race that might have occured * when malloc was blocked. */ if (fdp->fd_nfiles >= nfiles) { + FILEDESC_UNLOCK(fdp); FREE(newofile, M_FILEDESC); + FILEDESC_LOCK(fdp); continue; } newofileflags = (char *) &newofile[nfiles]; /* * Copy the existing ofile and ofileflags arrays * and zero the new portion of each array. */ bcopy(fdp->fd_ofiles, newofile, (i = sizeof(struct file *) * fdp->fd_nfiles)); bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i); bcopy(fdp->fd_ofileflags, newofileflags, (i = sizeof(char) * fdp->fd_nfiles)); bzero(newofileflags + i, nfiles * sizeof(char) - i); if (fdp->fd_nfiles > NDFILE) - FREE(fdp->fd_ofiles, M_FILEDESC); + oldofile = fdp->fd_ofiles; + else + oldofile = NULL; fdp->fd_ofiles = newofile; fdp->fd_ofileflags = newofileflags; fdp->fd_nfiles = nfiles; fdexpand++; + if (oldofile != NULL) + FREE(oldofile, M_FILEDESC); } return (0); } /* * Check to see whether n user file descriptors * are available to the process p. 
*/ int fdavail(td, n) struct thread *td; register int n; { struct proc *p = td->td_proc; register struct filedesc *fdp = td->td_proc->p_fd; register struct file **fpp; register int i, lim, last; + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; for (i = last - fdp->fd_freefile; --i >= 0; fpp++) { if (*fpp == NULL && --n <= 0) return (1); } return (0); } /* * Create a new open file structure and allocate * a file decriptor for the process that refers to it. */ int falloc(td, resultfp, resultfd) register struct thread *td; struct file **resultfp; int *resultfd; { struct proc *p = td->td_proc; register struct file *fp, *fq; int error, i; + sx_xlock(&filelist_lock); if (nfiles >= maxfiles) { + sx_xunlock(&filelist_lock); tablefull("file"); return (ENFILE); } + nfiles++; + sx_xunlock(&filelist_lock); /* * Allocate a new file descriptor. * If the process has file descriptor zero open, add to the list * of open files at that point, otherwise put it at the front of * the list of open files. */ - nfiles++; MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK | M_ZERO); /* * wait until after malloc (which may have blocked) returns before * allocating the slot, else a race might have shrunk it if we had * allocated it before the malloc. 
*/ + FILEDESC_LOCK(p->p_fd); if ((error = fdalloc(td, 0, &i))) { + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); nfiles--; + sx_xunlock(&filelist_lock); FREE(fp, M_FILE); return (error); } + mtx_init(&fp->f_mtx, "file structure", MTX_DEF); + fp->f_gcflag = 0; fp->f_count = 1; fp->f_cred = crhold(p->p_ucred); fp->f_ops = &badfileops; fp->f_seqcount = 1; + FILEDESC_UNLOCK(p->p_fd); + sx_xlock(&filelist_lock); + FILEDESC_LOCK(p->p_fd); if ((fq = p->p_fd->fd_ofiles[0])) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); } p->p_fd->fd_ofiles[i] = fp; + FILEDESC_UNLOCK(p->p_fd); + sx_xunlock(&filelist_lock); if (resultfp) *resultfp = fp; if (resultfd) *resultfd = i; return (0); } /* * Free a file descriptor. */ void ffree(fp) register struct file *fp; { + KASSERT((fp->f_count == 0), ("ffree: fp_fcount not 0!")); + sx_xlock(&filelist_lock); LIST_REMOVE(fp, f_list); - crfree(fp->f_cred); nfiles--; + sx_xunlock(&filelist_lock); + crfree(fp->f_cred); + mtx_destroy(&fp->f_mtx); FREE(fp, M_FILE); } /* * Build a new filedesc structure. */ struct filedesc * fdinit(td) struct thread *td; { register struct filedesc0 *newfdp; register struct filedesc *fdp = td->td_proc->p_fd; MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), M_FILEDESC, M_WAITOK | M_ZERO); + mtx_init(&newfdp->fd_fd.fd_mtx, "filedesc structure", MTX_DEF); + FILEDESC_LOCK(&newfdp->fd_fd); newfdp->fd_fd.fd_cdir = fdp->fd_cdir; if (newfdp->fd_fd.fd_cdir) VREF(newfdp->fd_fd.fd_cdir); newfdp->fd_fd.fd_rdir = fdp->fd_rdir; if (newfdp->fd_fd.fd_rdir) VREF(newfdp->fd_fd.fd_rdir); newfdp->fd_fd.fd_jdir = fdp->fd_jdir; if (newfdp->fd_fd.fd_jdir) VREF(newfdp->fd_fd.fd_jdir); /* Create the file descriptor table. 
*/ newfdp->fd_fd.fd_refcnt = 1; newfdp->fd_fd.fd_cmask = cmask; newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; newfdp->fd_fd.fd_nfiles = NDFILE; newfdp->fd_fd.fd_knlistsize = -1; + FILEDESC_UNLOCK(&newfdp->fd_fd); return (&newfdp->fd_fd); } /* * Share a filedesc structure. */ struct filedesc * fdshare(p) struct proc *p; { + FILEDESC_LOCK(p->p_fd); p->p_fd->fd_refcnt++; + FILEDESC_UNLOCK(p->p_fd); return (p->p_fd); } /* * Copy a filedesc structure. */ struct filedesc * fdcopy(td) struct thread *td; { register struct filedesc *newfdp, *fdp = td->td_proc->p_fd; register struct file **fpp; - register int i; + register int i, j; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return (NULL); + FILEDESC_LOCK_ASSERT(fdp, MA_OWNED); + + FILEDESC_UNLOCK(fdp); MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); bcopy(fdp, newfdp, sizeof(struct filedesc)); + FILEDESC_UNLOCK(fdp); + bzero(&newfdp->fd_mtx, sizeof(newfdp->fd_mtx)); + mtx_init(&newfdp->fd_mtx, "filedesc structure", MTX_DEF); if (newfdp->fd_cdir) VREF(newfdp->fd_cdir); if (newfdp->fd_rdir) VREF(newfdp->fd_rdir); if (newfdp->fd_jdir) VREF(newfdp->fd_jdir); newfdp->fd_refcnt = 1; /* * If the number of open files fits in the internal arrays * of the open file structure, use them, otherwise allocate * additional memory for the number of descriptors currently * in use. */ + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; if (newfdp->fd_lastfile < NDFILE) { newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; newfdp->fd_ofileflags = ((struct filedesc0 *) newfdp)->fd_dfileflags; i = NDFILE; } else { /* * Compute the smallest multiple of NDEXTENT needed * for the file descriptors currently in use, * allowing the table to shrink. 
*/ +retry: i = newfdp->fd_nfiles; while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) i /= 2; + FILEDESC_UNLOCK(fdp); MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, M_FILEDESC, M_WAITOK); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + j = newfdp->fd_nfiles; + while (j > 2 * NDEXTENT && j > newfdp->fd_lastfile * 2) + j /= 2; + if (i != j) { + /* + * The size of the original table has changed. + * Go over once again. + */ + FILEDESC_UNLOCK(fdp); + FREE(newfdp->fd_ofiles, M_FILEDESC); + FILEDESC_LOCK(fdp); + newfdp->fd_lastfile = fdp->fd_lastfile; + newfdp->fd_nfiles = fdp->fd_nfiles; + goto retry; + } newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; } newfdp->fd_nfiles = i; bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); /* * kq descriptors cannot be copied. */ if (newfdp->fd_knlistsize != -1) { fpp = &newfdp->fd_ofiles[newfdp->fd_lastfile]; for (i = newfdp->fd_lastfile; i >= 0; i--, fpp--) { if (*fpp != NULL && (*fpp)->f_type == DTYPE_KQUEUE) { *fpp = NULL; if (i < newfdp->fd_freefile) newfdp->fd_freefile = i; } if (*fpp == NULL && i == newfdp->fd_lastfile && i > 0) newfdp->fd_lastfile--; } newfdp->fd_knlist = NULL; newfdp->fd_knlistsize = -1; newfdp->fd_knhash = NULL; newfdp->fd_knhashmask = 0; } fpp = newfdp->fd_ofiles; for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) { - if (*fpp != NULL) + if (*fpp != NULL) { fhold(*fpp); + } } return (newfdp); } /* * Release a filedesc structure. */ void fdfree(td) struct thread *td; { register struct filedesc *fdp = td->td_proc->p_fd; struct file **fpp; register int i; /* Certain daemons might not have file descriptors. 
*/ if (fdp == NULL) return; - if (--fdp->fd_refcnt > 0) + FILEDESC_LOCK(fdp); + if (--fdp->fd_refcnt > 0) { + FILEDESC_UNLOCK(fdp); return; + } /* * we are the last reference to the structure, we can * safely assume it will not change out from under us. */ + FILEDESC_UNLOCK(fdp); fpp = fdp->fd_ofiles; for (i = fdp->fd_lastfile; i-- >= 0; fpp++) { if (*fpp) (void) closef(*fpp, td); } if (fdp->fd_nfiles > NDFILE) FREE(fdp->fd_ofiles, M_FILEDESC); if (fdp->fd_cdir) vrele(fdp->fd_cdir); if (fdp->fd_rdir) vrele(fdp->fd_rdir); if (fdp->fd_jdir) vrele(fdp->fd_jdir); if (fdp->fd_knlist) FREE(fdp->fd_knlist, M_KQUEUE); if (fdp->fd_knhash) FREE(fdp->fd_knhash, M_KQUEUE); + mtx_destroy(&fdp->fd_mtx); FREE(fdp, M_FILEDESC); } /* * For setugid programs, we don't want to people to use that setugidness * to generate error messages which write to a file which otherwise would * otherwise be off-limits to the process. * * This is a gross hack to plug the hole. A better solution would involve * a special vop or other form of generalized access control mechanism. We * go ahead and just reject all procfs file systems accesses as dangerous. * * Since setugidsafety calls this only for fd 0, 1 and 2, this check is * sufficient. We also don't for check setugidness since we know we are. */ static int is_unsafe(struct file *fp) { if (fp->f_type == DTYPE_VNODE && ((struct vnode *)(fp->f_data))->v_tag == VT_PROCFS) return (1); return (0); } /* * Make this setguid thing safe, if at all possible. */ void setugidsafety(td) struct thread *td; { struct filedesc *fdp = td->td_proc->p_fd; register int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return; /* * note: fdp->fd_ofiles may be reallocated out from under us while * we are blocked in a close. Be careful! 
*/ + FILEDESC_LOCK(fdp); for (i = 0; i <= fdp->fd_lastfile; i++) { if (i > 2) break; if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) { struct file *fp; #if 0 if ((fdp->fd_ofileflags[i] & UF_MAPPED) != 0) (void) munmapfd(td, i); #endif - if (i < fdp->fd_knlistsize) + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; if (i < fdp->fd_freefile) fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); (void) closef(fp, td); + FILEDESC_LOCK(fdp); } } while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); } /* * Close any files on exec? */ void fdcloseexec(td) struct thread *td; { struct filedesc *fdp = td->td_proc->p_fd; register int i; /* Certain daemons might not have file descriptors. */ if (fdp == NULL) return; + FILEDESC_LOCK(fdp); + /* * We cannot cache fd_ofiles or fd_ofileflags since operations * may block and rip them out from under us. */ for (i = 0; i <= fdp->fd_lastfile; i++) { if (fdp->fd_ofiles[i] != NULL && (fdp->fd_ofileflags[i] & UF_EXCLOSE)) { struct file *fp; #if 0 if (fdp->fd_ofileflags[i] & UF_MAPPED) (void) munmapfd(td, i); #endif - if (i < fdp->fd_knlistsize) + if (i < fdp->fd_knlistsize) { + FILEDESC_UNLOCK(fdp); knote_fdclose(td, i); + FILEDESC_LOCK(fdp); + } /* * NULL-out descriptor prior to close to avoid * a race while close blocks. */ fp = fdp->fd_ofiles[i]; fdp->fd_ofiles[i] = NULL; fdp->fd_ofileflags[i] = 0; if (i < fdp->fd_freefile) fdp->fd_freefile = i; + FILEDESC_UNLOCK(fdp); (void) closef(fp, td); + FILEDESC_LOCK(fdp); } } while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) fdp->fd_lastfile--; + FILEDESC_UNLOCK(fdp); } /* * Internal form of close. * Decrement reference count on file structure. 
* Note: td may be NULL when closing a file * that was being passed in a message. */ int closef(fp, td) register struct file *fp; register struct thread *td; { struct vnode *vp; struct flock lf; if (fp == NULL) return (0); /* * POSIX record locking dictates that any close releases ALL * locks owned by this process. This is handled by setting * a flag in the unlock to free ONLY locks obeying POSIX * semantics, and not to free BSD-style file locks. * If the descriptor was in a message, POSIX-style locks * aren't passed with the descriptor. */ if (td && (td->td_proc->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = (struct vnode *)fp->f_data; (void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader, F_UNLCK, &lf, F_POSIX); } return (fdrop(fp, td)); } /* + * Find the struct file 'fd' in process 'p' and bump it's refcount + * struct file is not locked on return. + */ +struct file * +ffind_hold(td, fd) + struct thread *td; + int fd; +{ + struct filedesc *fdp; + struct file *fp; + + if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) + return (NULL); + FILEDESC_LOCK(fdp); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + fp->f_ops == &badfileops) + fp = NULL; + else + fhold(fp); + FILEDESC_UNLOCK(fdp); + return (fp); +} + +/* + * Find the struct file 'fd' in process 'p' and bump it's refcount, + * struct file is locked on return. 
+ */ +struct file * +ffind_lock(td, fd) + struct thread *td; + int fd; +{ + struct filedesc *fdp; + struct file *fp; + + if (td == NULL || (fdp = td->td_proc->p_fd) == NULL) + return (NULL); + FILEDESC_LOCK(fdp); + if (fd < 0 || fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL || + fp->f_ops == &badfileops) { + fp = NULL; + } else { + FILE_LOCK(fp); + fhold_locked(fp); + } + FILEDESC_UNLOCK(fdp); + return (fp); +} + +int +fdrop(fp, td) + struct file *fp; + struct thread *td; +{ + + FILE_LOCK(fp); + return (fdrop_locked(fp, td)); +} + +/* * Extract the file pointer associated with the specified descriptor for * the current user process. If no error occured 0 is returned, *fpp * will be set to the file pointer, and the file pointer's ref count * will be bumped. Use fdrop() to drop it. If an error occured the * non-zero error is returned and *fpp is set to NULL. * * This routine requires Giant for the moment. Once enough of the * system is converted over to this and other encapsulated APIs we * will be able to mutex it and call it without Giant. */ static __inline int _fget(struct thread *td, int fd, struct file **fpp, int flags) { struct filedesc *fdp; struct file *fp; GIANT_REQUIRED; fdp = td->td_proc->p_fd; *fpp = NULL; if ((u_int)fd >= fdp->fd_nfiles) return(EBADF); if ((fp = fdp->fd_ofiles[fd]) == NULL) return(EBADF); /* * Note: FREAD failures returns EBADF to maintain backwards * compatibility with what routines returned before. * * Only one flag, or 0, may be specified. 
*/ if (flags == FREAD && (fp->f_flag & FREAD) == 0) return(EBADF); if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) return(EINVAL); ++fp->f_count; *fpp = fp; return(0); } int fget(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, 0)); } int fget_read(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FREAD)); } int fget_write(struct thread *td, int fd, struct file **fpp) { return(_fget(td, fd, fpp, FWRITE)); } /* * Like fget() but loads the underlying vnode, or returns an error if * the descriptor does not represent a vnode. Note that pipes use vnodes * but never have VM objects (so VOP_GETVOBJECT() calls will return an * error). The returned vnode will be vref()d. */ static __inline int _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags) { struct filedesc *fdp; struct file *fp; GIANT_REQUIRED; fdp = td->td_proc->p_fd; *vpp = NULL; if ((u_int)fd >= fdp->fd_nfiles) return(EBADF); if ((fp = fdp->fd_ofiles[fd]) == NULL) return(EBADF); if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) return(EINVAL); if (fp->f_data == NULL) return(EINVAL); /* * Note: FREAD failures returns EBADF to maintain backwards * compatibility with what routines returned before. * * Only one flag, or 0, may be specified. */ if (flags == FREAD && (fp->f_flag & FREAD) == 0) return(EBADF); if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) return(EINVAL); *vpp = (struct vnode *)fp->f_data; vref(*vpp); return(0); } int fgetvp(struct thread *td, int fd, struct vnode **vpp) { return(_fgetvp(td, fd, vpp, 0)); } int fgetvp_read(struct thread *td, int fd, struct vnode **vpp) { return(_fgetvp(td, fd, vpp, FREAD)); } int fgetvp_write(struct thread *td, int fd, struct vnode **vpp) { return(_fgetvp(td, fd, vpp, FWRITE)); } /* * Like fget() but loads the underlying socket, or returns an error if * the descriptor does not represent a socket. * * We bump the ref count on the returned socket. XXX Also obtain the SX lock in * the future. 
*/ int fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp) { struct filedesc *fdp; struct file *fp; struct socket *so; GIANT_REQUIRED; fdp = td->td_proc->p_fd; *spp = NULL; if (fflagp) *fflagp = 0; if ((u_int)fd >= fdp->fd_nfiles) return(EBADF); if ((fp = fdp->fd_ofiles[fd]) == NULL) return(EBADF); if (fp->f_type != DTYPE_SOCKET) return(ENOTSOCK); if (fp->f_data == NULL) return(EINVAL); so = (struct socket *)fp->f_data; if (fflagp) *fflagp = fp->f_flag; soref(so); *spp = so; return(0); } /* * Drop the reference count on the the socket and XXX release the SX lock in * the future. The last reference closes the socket. */ void fputsock(struct socket *so) { sorele(so); } int -fdrop(fp, td) +fdrop_locked(fp, td) struct file *fp; struct thread *td; { struct flock lf; struct vnode *vp; int error; - if (--fp->f_count > 0) + FILE_LOCK_ASSERT(fp, MA_OWNED); + + if (--fp->f_count > 0) { + FILE_UNLOCK(fp); return (0); + } if (fp->f_count < 0) panic("fdrop: count < 0"); if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; lf.l_type = F_UNLCK; vp = (struct vnode *)fp->f_data; + FILE_UNLOCK(fp); (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); - } + } else + FILE_UNLOCK(fp); if (fp->f_ops != &badfileops) error = fo_close(fp, td); else error = 0; ffree(fp); return (error); } /* * Apply an advisory lock on a file descriptor. * * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). 
*/ #ifndef _SYS_SYSPROTO_H_ struct flock_args { int fd; int how; }; #endif /* * MPSAFE */ /* ARGSUSED */ int flock(td, uap) struct thread *td; register struct flock_args *uap; { - register struct filedesc *fdp = td->td_proc->p_fd; register struct file *fp; struct vnode *vp; struct flock lf; int error; - mtx_lock(&Giant); - - if ((unsigned)uap->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL) { - error = EBADF; - goto done2; - } + fp = ffind_hold(td, uap->fd); + if (fp == NULL) + return (EBADF); if (fp->f_type != DTYPE_VNODE) { - error = EOPNOTSUPP; - goto done2; + fdrop(fp, td); + return (EOPNOTSUPP); } + + mtx_lock(&Giant); vp = (struct vnode *)fp->f_data; lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; + FILE_LOCK(fp); fp->f_flag &= ~FHASLOCK; + FILE_UNLOCK(fp); error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); goto done2; } if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else { error = EBADF; goto done2; } + FILE_LOCK(fp); fp->f_flag |= FHASLOCK; - if (uap->how & LOCK_NB) - error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK); - else - error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT); + FILE_UNLOCK(fp); + error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, + (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT); done2: + fdrop(fp, td); mtx_unlock(&Giant); return (error); } /* * File Descriptor pseudo-device driver (/dev/fd/). * * Opening minor device N dup()s the file (if any) connected to file * descriptor N belonging to the calling process. Note that this driver * consists of only the ``open()'' routine, because all subsequent * references to this file will be direct to the other driver. */ /* ARGSUSED */ static int fdopen(dev, mode, type, td) dev_t dev; int mode, type; struct thread *td; { /* * XXX Kludge: set curthread->td_dupfd to contain the value of the * the file descriptor being sought for duplication. 
The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN * will simply report the error. */ td->td_dupfd = dev2unit(dev); return (ENODEV); } /* * Duplicate the specified descriptor to a free descriptor. */ int dupfdopen(td, fdp, indx, dfd, mode, error) struct thread *td; struct filedesc *fdp; int indx, dfd; int mode; int error; { register struct file *wfp; struct file *fp; /* * If the to-be-dup'd fd number is greater than the allowed number * of file descriptors, or the fd to be dup'd has already been * closed, then reject. */ + FILEDESC_LOCK(fdp); if ((u_int)dfd >= fdp->fd_nfiles || (wfp = fdp->fd_ofiles[dfd]) == NULL) { + FILEDESC_UNLOCK(fdp); return (EBADF); } /* * There are two cases of interest here. * * For ENODEV simply dup (dfd) to file descriptor * (indx) and return. * * For ENXIO steal away the file structure from (dfd) and * store it in (indx). (dfd) is effectively closed by * this operation. * * Any other error code is just returned. */ switch (error) { case ENODEV: /* * Check that the mode the file is being opened for is a * subset of the mode of the existing descriptor. */ - if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) + FILE_LOCK(wfp); + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) { + FILE_UNLOCK(wfp); + FILEDESC_UNLOCK(fdp); return (EACCES); + } fp = fdp->fd_ofiles[indx]; #if 0 if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) (void) munmapfd(td, indx); #endif fdp->fd_ofiles[indx] = wfp; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; - fhold(wfp); + fhold_locked(wfp); + FILE_UNLOCK(wfp); if (indx > fdp->fd_lastfile) fdp->fd_lastfile = indx; + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); /* * we now own the reference to fp that the ofiles[] array * used to own. Release it. 
*/ - if (fp) - fdrop(fp, td); + if (fp != NULL) + fdrop_locked(fp, td); return (0); case ENXIO: /* * Steal away the file pointer from dfd, and stuff it into indx. */ fp = fdp->fd_ofiles[indx]; #if 0 if (fp && fdp->fd_ofileflags[indx] & UF_MAPPED) (void) munmapfd(td, indx); #endif fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; fdp->fd_ofiles[dfd] = NULL; fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; fdp->fd_ofileflags[dfd] = 0; /* - * we now own the reference to fp that the ofiles[] array - * used to own. Release it. - */ - if (fp) - fdrop(fp, td); - /* * Complete the clean up of the filedesc structure by * recomputing the various hints. */ if (indx > fdp->fd_lastfile) { fdp->fd_lastfile = indx; } else { while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) { fdp->fd_lastfile--; } if (dfd < fdp->fd_freefile) fdp->fd_freefile = dfd; } + if (fp != NULL) + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + + /* + * we now own the reference to fp that the ofiles[] array + * used to own. Release it. + */ + if (fp != NULL) + fdrop_locked(fp, td); return (0); default: + FILEDESC_UNLOCK(fdp); return (error); } /* NOTREACHED */ } /* * Get file structures. 
*/ static int sysctl_kern_file(SYSCTL_HANDLER_ARGS) { int error; struct file *fp; + sx_slock(&filelist_lock); if (!req->oldptr) { /* * overestimate by 10 files */ - return (SYSCTL_OUT(req, 0, sizeof(filehead) + - (nfiles + 10) * sizeof(struct file))); + error = SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file)); + sx_sunlock(&filelist_lock); + return (error); } error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); - if (error) + if (error) { + sx_sunlock(&filelist_lock); return (error); + } /* * followed by an array of file structures */ LIST_FOREACH(fp, &filehead, f_list) { error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); - if (error) + if (error) { + sx_sunlock(&filelist_lock); return (error); + } } + sx_sunlock(&filelist_lock); return (0); } SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, 0, 0, sysctl_kern_file, "S,file", "Entire file table"); SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW, &maxfilesperproc, 0, "Maximum files allowed open per process"); SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "Maximum number of files"); SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD, &nfiles, 0, "System-wide number of open files"); static void fildesc_drvinit(void *unused) { dev_t dev; dev = make_dev(&fildesc_cdevsw, 0, UID_BIN, GID_BIN, 0666, "fd/0"); make_dev_alias(dev, "stdin"); dev = make_dev(&fildesc_cdevsw, 1, UID_BIN, GID_BIN, 0666, "fd/1"); make_dev_alias(dev, "stdout"); dev = make_dev(&fildesc_cdevsw, 2, UID_BIN, GID_BIN, 0666, "fd/2"); make_dev_alias(dev, "stderr"); if (!devfs_present) { int fd; for (fd = 3; fd < NUMFDESC; fd++) make_dev(&fildesc_cdevsw, fd, UID_BIN, GID_BIN, 0666, "fd/%d", fd); } } struct fileops badfileops = { badfo_readwrite, badfo_readwrite, badfo_ioctl, badfo_poll, badfo_kqfilter, badfo_stat, badfo_close }; static int badfo_readwrite(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int 
flags; { return (EBADF); } static int badfo_ioctl(fp, com, data, td) struct file *fp; u_long com; caddr_t data; struct thread *td; { return (EBADF); } static int badfo_poll(fp, events, cred, td) struct file *fp; int events; struct ucred *cred; struct thread *td; { return (0); } static int badfo_kqfilter(fp, kn) struct file *fp; struct knote *kn; { return (0); } static int badfo_stat(fp, sb, td) struct file *fp; struct stat *sb; struct thread *td; { return (EBADF); } static int badfo_close(fp, td) struct file *fp; struct thread *td; { return (EBADF); } SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, fildesc_drvinit,NULL) + +static void filelistinit __P((void *)); +SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL) + +/* ARGSUSED*/ +static void +filelistinit(dummy) + void *dummy; +{ + sx_init(&filelist_lock, "filelist lock"); +} Index: head/sys/kern/kern_event.c =================================================================== --- head/sys/kern/kern_event.c (revision 89305) +++ head/sys/kern/kern_event.c (revision 89306) @@ -1,1029 +1,1079 @@ /*- * Copyright (c) 1999,2000,2001 Jonathan Lemon * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system"); static int kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, const struct timespec *timeout, struct thread *td); static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td); static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td); static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td); static int kqueue_poll(struct file *fp, int events, struct ucred *cred, struct thread *td); static int kqueue_kqfilter(struct file *fp, struct knote *kn); static int kqueue_stat(struct file *fp, struct stat *st, struct thread *td); static int kqueue_close(struct file *fp, struct thread *td); static void kqueue_wakeup(struct kqueue *kq); static struct fileops kqueueops = { kqueue_read, kqueue_write, kqueue_ioctl, kqueue_poll, kqueue_kqfilter, kqueue_stat, kqueue_close }; static void knote_attach(struct knote *kn, struct filedesc *fdp); static void knote_drop(struct knote *kn, struct thread *td); static void knote_enqueue(struct knote *kn); static void knote_dequeue(struct knote *kn); static void knote_init(void); static struct knote 
*knote_alloc(void); static void knote_free(struct knote *kn); static void filt_kqdetach(struct knote *kn); static int filt_kqueue(struct knote *kn, long hint); static int filt_procattach(struct knote *kn); static void filt_procdetach(struct knote *kn); static int filt_proc(struct knote *kn, long hint); static int filt_fileattach(struct knote *kn); static void filt_timerexpire(void *knx); static int filt_timerattach(struct knote *kn); static void filt_timerdetach(struct knote *kn); static int filt_timer(struct knote *kn, long hint); static struct filterops file_filtops = { 1, filt_fileattach, NULL, NULL }; static struct filterops kqread_filtops = { 1, NULL, filt_kqdetach, filt_kqueue }; static struct filterops proc_filtops = { 0, filt_procattach, filt_procdetach, filt_proc }; static struct filterops timer_filtops = { 0, filt_timerattach, filt_timerdetach, filt_timer }; static vm_zone_t knote_zone; static int kq_ncallouts = 0; static int kq_calloutmax = (4 * 1024); SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW, &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue"); #define KNOTE_ACTIVATE(kn) do { \ kn->kn_status |= KN_ACTIVE; \ if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \ knote_enqueue(kn); \ } while(0) #define KN_HASHSIZE 64 /* XXX should be tunable */ #define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask)) static int filt_nullattach(struct knote *kn) { return (ENXIO); }; struct filterops null_filtops = { 0, filt_nullattach, NULL, NULL }; extern struct filterops sig_filtops; /* * Table for for all system-defined filters. 
*/ static struct filterops *sysfilt_ops[] = { &file_filtops, /* EVFILT_READ */ &file_filtops, /* EVFILT_WRITE */ &null_filtops, /* EVFILT_AIO */ &file_filtops, /* EVFILT_VNODE */ &proc_filtops, /* EVFILT_PROC */ &sig_filtops, /* EVFILT_SIGNAL */ &timer_filtops, /* EVFILT_TIMER */ }; static int filt_fileattach(struct knote *kn) { return (fo_kqfilter(kn->kn_fp, kn)); } /*ARGSUSED*/ static int kqueue_kqfilter(struct file *fp, struct knote *kn) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; if (kn->kn_filter != EVFILT_READ) return (1); kn->kn_fop = &kqread_filtops; SLIST_INSERT_HEAD(&kq->kq_sel.si_note, kn, kn_selnext); return (0); } static void filt_kqdetach(struct knote *kn) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; SLIST_REMOVE(&kq->kq_sel.si_note, kn, knote, kn_selnext); } /*ARGSUSED*/ static int filt_kqueue(struct knote *kn, long hint) { struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data; kn->kn_data = kq->kq_count; return (kn->kn_data > 0); } static int filt_procattach(struct knote *kn) { struct proc *p; int error; p = pfind(kn->kn_id); if (p == NULL) return (ESRCH); if ((error = p_cansee(curproc, p))) { PROC_UNLOCK(p); return (error); } kn->kn_ptr.p_proc = p; kn->kn_flags |= EV_CLEAR; /* automatically set */ /* * internal flag indicating registration done by kernel */ if (kn->kn_flags & EV_FLAG1) { kn->kn_data = kn->kn_sdata; /* ppid */ kn->kn_fflags = NOTE_CHILD; kn->kn_flags &= ~EV_FLAG1; } SLIST_INSERT_HEAD(&p->p_klist, kn, kn_selnext); PROC_UNLOCK(p); return (0); } /* * The knote may be attached to a different process, which may exit, * leaving nothing for the knote to be attached to. So when the process * exits, the knote is marked as DETACHED and also flagged as ONESHOT so * it will be deleted when read out. However, as part of the knote deletion, * this routine is called, so a check is needed to avoid actually performing * a detach, because the original process does not exist any more. 
*/ static void filt_procdetach(struct knote *kn) { struct proc *p = kn->kn_ptr.p_proc; if (kn->kn_status & KN_DETACHED) return; PROC_LOCK(p); SLIST_REMOVE(&p->p_klist, kn, knote, kn_selnext); PROC_UNLOCK(p); } static int filt_proc(struct knote *kn, long hint) { u_int event; /* * mask off extra data */ event = (u_int)hint & NOTE_PCTRLMASK; /* * if the user is interested in this event, record it. */ if (kn->kn_sfflags & event) kn->kn_fflags |= event; /* * process is gone, so flag the event as finished. */ if (event == NOTE_EXIT) { kn->kn_status |= KN_DETACHED; kn->kn_flags |= (EV_EOF | EV_ONESHOT); return (1); } /* * process forked, and user wants to track the new process, * so attach a new knote to it, and immediately report an * event with the parent's pid. */ if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) { struct kevent kev; int error; /* * register knote with new process. */ kev.ident = hint & NOTE_PDATAMASK; /* pid */ kev.filter = kn->kn_filter; kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1; kev.fflags = kn->kn_sfflags; kev.data = kn->kn_id; /* parent */ kev.udata = kn->kn_kevent.udata; /* preserve udata */ error = kqueue_register(kn->kn_kq, &kev, NULL); if (error) kn->kn_fflags |= NOTE_TRACKERR; } return (kn->kn_fflags != 0); } static void filt_timerexpire(void *knx) { struct knote *kn = knx; struct callout *calloutp; struct timeval tv; int tticks; kn->kn_data++; KNOTE_ACTIVATE(kn); if ((kn->kn_flags & EV_ONESHOT) == 0) { tv.tv_sec = kn->kn_sdata / 1000; tv.tv_usec = (kn->kn_sdata % 1000) * 1000; tticks = tvtohz(&tv); calloutp = (struct callout *)kn->kn_hook; callout_reset(calloutp, tticks, filt_timerexpire, kn); } } /* * data contains amount of time to sleep, in milliseconds */ static int filt_timerattach(struct knote *kn) { struct callout *calloutp; struct timeval tv; int tticks; if (kq_ncallouts >= kq_calloutmax) return (ENOMEM); kq_ncallouts++; tv.tv_sec = kn->kn_sdata / 1000; tv.tv_usec = (kn->kn_sdata % 1000) * 1000; tticks = 
tvtohz(&tv); kn->kn_flags |= EV_CLEAR; /* automatically set */ MALLOC(calloutp, struct callout *, sizeof(*calloutp), M_KQUEUE, M_WAITOK); callout_init(calloutp, 0); callout_reset(calloutp, tticks, filt_timerexpire, kn); kn->kn_hook = (caddr_t)calloutp; return (0); } static void filt_timerdetach(struct knote *kn) { struct callout *calloutp; calloutp = (struct callout *)kn->kn_hook; callout_stop(calloutp); FREE(calloutp, M_KQUEUE); kq_ncallouts--; } static int filt_timer(struct knote *kn, long hint) { return (kn->kn_data != 0); } /* * MPSAFE */ int kqueue(struct thread *td, struct kqueue_args *uap) { struct filedesc *fdp; struct kqueue *kq; struct file *fp; int fd, error; mtx_lock(&Giant); fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; + kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); + TAILQ_INIT(&kq->kq_head); + FILE_LOCK(fp); fp->f_flag = FREAD | FWRITE; fp->f_type = DTYPE_KQUEUE; fp->f_ops = &kqueueops; - kq = malloc(sizeof(struct kqueue), M_KQUEUE, M_WAITOK | M_ZERO); TAILQ_INIT(&kq->kq_head); fp->f_data = (caddr_t)kq; + FILE_UNLOCK(fp); + FILEDESC_LOCK(fdp); td->td_retval[0] = fd; if (fdp->fd_knlistsize < 0) fdp->fd_knlistsize = 0; /* this process has a kq */ + FILEDESC_UNLOCK(fdp); kq->kq_fdp = fdp; done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct kevent_args { int fd; const struct kevent *changelist; int nchanges; struct kevent *eventlist; int nevents; const struct timespec *timeout; }; #endif /* * MPSAFE */ int kevent(struct thread *td, struct kevent_args *uap) { struct kevent *kevp; struct kqueue *kq; struct file *fp; struct timespec ts; int i, n, nerrors, error; - mtx_lock(&Giant); - if ((error = fget(td, uap->fd, &fp)) != 0) - goto done; - if (fp->f_type != DTYPE_KQUEUE) { - error = EBADF; - goto done; + fp = ffind_hold(td, uap->fd); + if (fp == NULL || fp->f_type != DTYPE_KQUEUE) { + if (fp != NULL) + fdrop(fp, td); + return (EBADF); } if (uap->timeout != NULL) { error = 
copyin(uap->timeout, &ts, sizeof(ts)); if (error) - goto done; + goto done_nogiant; uap->timeout = &ts; } + mtx_lock(&Giant); kq = (struct kqueue *)fp->f_data; nerrors = 0; while (uap->nchanges > 0) { n = uap->nchanges > KQ_NEVENTS ? KQ_NEVENTS : uap->nchanges; error = copyin(uap->changelist, kq->kq_kev, n * sizeof(struct kevent)); if (error) goto done; for (i = 0; i < n; i++) { kevp = &kq->kq_kev[i]; kevp->flags &= ~EV_SYSFLAGS; error = kqueue_register(kq, kevp, td); if (error) { if (uap->nevents != 0) { kevp->flags = EV_ERROR; kevp->data = error; (void) copyout((caddr_t)kevp, (caddr_t)uap->eventlist, sizeof(*kevp)); uap->eventlist++; uap->nevents--; nerrors++; } else { goto done; } } } uap->nchanges -= n; uap->changelist += n; } if (nerrors) { td->td_retval[0] = nerrors; error = 0; goto done; } error = kqueue_scan(fp, uap->nevents, uap->eventlist, uap->timeout, td); done: + mtx_unlock(&Giant); +done_nogiant: if (fp != NULL) fdrop(fp, td); - mtx_unlock(&Giant); return (error); } int kqueue_add_filteropts(int filt, struct filterops *filtops) { if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); if (sysfilt_ops[~filt] != &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = filtops; return (0); } int kqueue_del_filteropts(int filt) { if (filt > 0) panic("filt(%d) > 0", filt); if (filt + EVFILT_SYSCOUNT < 0) panic("filt(%d) + EVFILT_SYSCOUNT(%d) == %d < 0", filt, EVFILT_SYSCOUNT, filt + EVFILT_SYSCOUNT); if (sysfilt_ops[~filt] == &null_filtops) panic("sysfilt_ops[~filt(%d)] != &null_filtops", filt); sysfilt_ops[~filt] = &null_filtops; return (0); } int kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td) { struct filedesc *fdp = kq->kq_fdp; struct filterops *fops; struct file *fp = NULL; struct knote *kn = NULL; int s, error = 0; if (kev->filter < 0) { if (kev->filter + EVFILT_SYSCOUNT < 0) 
return (EINVAL); fops = sysfilt_ops[~kev->filter]; /* to 0-base index */ } else { /* * XXX * filter attach routine is responsible for insuring that * the identifier can be attached to it. */ printf("unknown filter: %d\n", kev->filter); return (EINVAL); } + FILEDESC_LOCK(fdp); if (fops->f_isfd) { /* validate descriptor */ if ((u_int)kev->ident >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[kev->ident]) == NULL) + (fp = fdp->fd_ofiles[kev->ident]) == NULL) { + FILEDESC_UNLOCK(fdp); return (EBADF); + } fhold(fp); if (kev->ident < fdp->fd_knlistsize) { SLIST_FOREACH(kn, &fdp->fd_knlist[kev->ident], kn_link) if (kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } else { if (fdp->fd_knhashmask != 0) { struct klist *list; list = &fdp->fd_knhash[ KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)]; SLIST_FOREACH(kn, list, kn_link) if (kev->ident == kn->kn_id && kq == kn->kn_kq && kev->filter == kn->kn_filter) break; } } + FILEDESC_UNLOCK(fdp); if (kn == NULL && ((kev->flags & EV_ADD) == 0)) { error = ENOENT; goto done; } /* * kn now contains the matching knote, or NULL if no match */ if (kev->flags & EV_ADD) { if (kn == NULL) { kn = knote_alloc(); if (kn == NULL) { error = ENOMEM; goto done; } kn->kn_fp = fp; kn->kn_kq = kq; kn->kn_fop = fops; /* * apply reference count to knote structure, and * do not release it at the end of this routine. */ fp = NULL; kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kev->fflags = 0; kev->data = 0; kn->kn_kevent = *kev; knote_attach(kn, fdp); if ((error = fops->f_attach(kn)) != 0) { knote_drop(kn, td); goto done; } } else { /* * The user may change some filter values after the * initial EV_ADD, but doing so will not reset any * filter which have already been triggered. 
*/ kn->kn_sfflags = kev->fflags; kn->kn_sdata = kev->data; kn->kn_kevent.udata = kev->udata; } s = splhigh(); if (kn->kn_fop->f_event(kn, 0)) KNOTE_ACTIVATE(kn); splx(s); } else if (kev->flags & EV_DELETE) { kn->kn_fop->f_detach(kn); knote_drop(kn, td); goto done; } if ((kev->flags & EV_DISABLE) && ((kn->kn_status & KN_DISABLED) == 0)) { s = splhigh(); kn->kn_status |= KN_DISABLED; splx(s); } if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) { s = splhigh(); kn->kn_status &= ~KN_DISABLED; if ((kn->kn_status & KN_ACTIVE) && ((kn->kn_status & KN_QUEUED) == 0)) knote_enqueue(kn); splx(s); } done: if (fp != NULL) fdrop(fp, td); return (error); } static int kqueue_scan(struct file *fp, int maxevents, struct kevent *ulistp, const struct timespec *tsp, struct thread *td) { - struct kqueue *kq = (struct kqueue *)fp->f_data; + struct kqueue *kq; struct kevent *kevp; struct timeval atv, rtv, ttv; struct knote *kn, marker; int s, count, timeout, nkev = 0, error = 0; + FILE_LOCK_ASSERT(fp, MA_NOTOWNED); + + kq = (struct kqueue *)fp->f_data; count = maxevents; if (count == 0) goto done; if (tsp != NULL) { TIMESPEC_TO_TIMEVAL(&atv, tsp); if (itimerfix(&atv)) { error = EINVAL; goto done; } if (tsp->tv_sec == 0 && tsp->tv_nsec == 0) timeout = -1; else timeout = atv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&atv); getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; timeout = 0; } goto start; retry: if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) goto done; ttv = atv; timevalsub(&ttv, &rtv); timeout = ttv.tv_sec > 24 * 60 * 60 ? 24 * 60 * 60 * hz : tvtohz(&ttv); } start: kevp = kq->kq_kev; s = splhigh(); if (kq->kq_count == 0) { if (timeout < 0) { error = EWOULDBLOCK; } else { kq->kq_state |= KQ_SLEEP; error = tsleep(kq, PSOCK | PCATCH, "kqread", timeout); } splx(s); if (error == 0) goto retry; /* don't restart after signals... 
*/ if (error == ERESTART) error = EINTR; else if (error == EWOULDBLOCK) error = 0; goto done; } TAILQ_INSERT_TAIL(&kq->kq_head, &marker, kn_tqe); while (count) { kn = TAILQ_FIRST(&kq->kq_head); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); if (kn == &marker) { splx(s); if (count == maxevents) goto retry; goto done; } if (kn->kn_status & KN_DISABLED) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; continue; } if ((kn->kn_flags & EV_ONESHOT) == 0 && kn->kn_fop->f_event(kn, 0) == 0) { kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; continue; } *kevp = kn->kn_kevent; kevp++; nkev++; if (kn->kn_flags & EV_ONESHOT) { kn->kn_status &= ~KN_QUEUED; kq->kq_count--; splx(s); kn->kn_fop->f_detach(kn); knote_drop(kn, td); s = splhigh(); } else if (kn->kn_flags & EV_CLEAR) { kn->kn_data = 0; kn->kn_fflags = 0; kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE); kq->kq_count--; } else { TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); } count--; if (nkev == KQ_NEVENTS) { splx(s); error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp, sizeof(struct kevent) * nkev); ulistp += nkev; nkev = 0; kevp = kq->kq_kev; s = splhigh(); if (error) break; } } TAILQ_REMOVE(&kq->kq_head, &marker, kn_tqe); splx(s); done: if (nkev != 0) error = copyout((caddr_t)&kq->kq_kev, (caddr_t)ulistp, sizeof(struct kevent) * nkev); td->td_retval[0] = maxevents - count; return (error); } /* * XXX * This could be expanded to call kqueue_scan, if desired. 
*/ /*ARGSUSED*/ static int kqueue_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_write(struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td) { return (ENXIO); } /*ARGSUSED*/ static int kqueue_ioctl(struct file *fp, u_long com, caddr_t data, struct thread *td) { return (ENOTTY); } /*ARGSUSED*/ static int kqueue_poll(struct file *fp, int events, struct ucred *cred, struct thread *td) { - struct kqueue *kq = (struct kqueue *)fp->f_data; + struct kqueue *kq; int revents = 0; int s = splnet(); + kq = (struct kqueue *)fp->f_data; if (events & (POLLIN | POLLRDNORM)) { if (kq->kq_count) { revents |= events & (POLLIN | POLLRDNORM); } else { selrecord(td, &kq->kq_sel); kq->kq_state |= KQ_SEL; } } splx(s); return (revents); } /*ARGSUSED*/ static int kqueue_stat(struct file *fp, struct stat *st, struct thread *td) { - struct kqueue *kq = (struct kqueue *)fp->f_data; + struct kqueue *kq; + kq = (struct kqueue *)fp->f_data; bzero((void *)st, sizeof(*st)); st->st_size = kq->kq_count; st->st_blksize = sizeof(struct kevent); st->st_mode = S_IFIFO; return (0); } /*ARGSUSED*/ static int kqueue_close(struct file *fp, struct thread *td) { struct kqueue *kq = (struct kqueue *)fp->f_data; struct filedesc *fdp = td->td_proc->p_fd; struct knote **knp, *kn, *kn0; int i; + FILEDESC_LOCK(fdp); for (i = 0; i < fdp->fd_knlistsize; i++) { knp = &SLIST_FIRST(&fdp->fd_knlist[i]); kn = *knp; while (kn != NULL) { kn0 = SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { kn->kn_fop->f_detach(kn); - fdrop(kn->kn_fp, td); - knote_free(kn); *knp = kn0; + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); + fdrop_locked(kn->kn_fp, td); + knote_free(kn); + FILEDESC_LOCK(fdp); } else { knp = &SLIST_NEXT(kn, kn_link); } kn = kn0; } } if (fdp->fd_knhashmask != 0) { for (i = 0; i < fdp->fd_knhashmask + 1; i++) { knp = &SLIST_FIRST(&fdp->fd_knhash[i]); kn = *knp; while (kn != NULL) { kn0 = 
SLIST_NEXT(kn, kn_link); if (kq == kn->kn_kq) { kn->kn_fop->f_detach(kn); + *knp = kn0; /* XXX non-fd release of kn->kn_ptr */ + FILEDESC_UNLOCK(fdp); knote_free(kn); - *knp = kn0; + FILEDESC_LOCK(fdp); } else { knp = &SLIST_NEXT(kn, kn_link); } kn = kn0; } } } + FILEDESC_UNLOCK(fdp); free(kq, M_KQUEUE); fp->f_data = NULL; return (0); } static void kqueue_wakeup(struct kqueue *kq) { if (kq->kq_state & KQ_SLEEP) { kq->kq_state &= ~KQ_SLEEP; wakeup(kq); } if (kq->kq_state & KQ_SEL) { kq->kq_state &= ~KQ_SEL; selwakeup(&kq->kq_sel); } KNOTE(&kq->kq_sel.si_note, 0); } /* * walk down a list of knotes, activating them if their event has triggered. */ void knote(struct klist *list, long hint) { struct knote *kn; SLIST_FOREACH(kn, list, kn_selnext) if (kn->kn_fop->f_event(kn, hint)) KNOTE_ACTIVATE(kn); } /* * remove all knotes from a specified klist */ void knote_remove(struct thread *td, struct klist *list) { struct knote *kn; while ((kn = SLIST_FIRST(list)) != NULL) { kn->kn_fop->f_detach(kn); knote_drop(kn, td); } } /* * remove all knotes referencing a specified fd */ void knote_fdclose(struct thread *td, int fd) { struct filedesc *fdp = td->td_proc->p_fd; - struct klist *list = &fdp->fd_knlist[fd]; + struct klist *list; + FILEDESC_LOCK(fdp); + list = &fdp->fd_knlist[fd]; + FILEDESC_UNLOCK(fdp); knote_remove(td, list); } static void knote_attach(struct knote *kn, struct filedesc *fdp) { - struct klist *list; - int size; + struct klist *list, *oldlist; + int size, newsize; + FILEDESC_LOCK(fdp); + if (! 
kn->kn_fop->f_isfd) { if (fdp->fd_knhashmask == 0) fdp->fd_knhash = hashinit(KN_HASHSIZE, M_KQUEUE, &fdp->fd_knhashmask); list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; goto done; } if (fdp->fd_knlistsize <= kn->kn_id) { +retry: size = fdp->fd_knlistsize; while (size <= kn->kn_id) size += KQEXTENT; + FILEDESC_UNLOCK(fdp); MALLOC(list, struct klist *, size * sizeof(struct klist *), M_KQUEUE, M_WAITOK); + FILEDESC_LOCK(fdp); + newsize = fdp->fd_knlistsize; + while (newsize <= kn->kn_id) + newsize += KQEXTENT; + if (newsize != size) { + FILEDESC_UNLOCK(fdp); + free(list, M_TEMP); + FILEDESC_LOCK(fdp); + goto retry; + } bcopy((caddr_t)fdp->fd_knlist, (caddr_t)list, fdp->fd_knlistsize * sizeof(struct klist *)); bzero((caddr_t)list + fdp->fd_knlistsize * sizeof(struct klist *), (size - fdp->fd_knlistsize) * sizeof(struct klist *)); if (fdp->fd_knlist != NULL) - FREE(fdp->fd_knlist, M_KQUEUE); + oldlist = fdp->fd_knlist; + else + oldlist = NULL; fdp->fd_knlistsize = size; fdp->fd_knlist = list; + FILEDESC_UNLOCK(fdp); + if (oldlist != NULL) + FREE(oldlist, M_KQUEUE); + FILEDESC_LOCK(fdp); } list = &fdp->fd_knlist[kn->kn_id]; done: + FILEDESC_UNLOCK(fdp); SLIST_INSERT_HEAD(list, kn, kn_link); kn->kn_status = 0; } /* * should be called at spl == 0, since we don't want to hold spl * while calling fdrop and free. 
*/ static void knote_drop(struct knote *kn, struct thread *td) { struct filedesc *fdp = td->td_proc->p_fd; struct klist *list; + FILEDESC_LOCK(fdp); if (kn->kn_fop->f_isfd) list = &fdp->fd_knlist[kn->kn_id]; else list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)]; + if (kn->kn_fop->f_isfd) + FILE_LOCK(kn->kn_fp); + FILEDESC_UNLOCK(fdp); SLIST_REMOVE(list, kn, knote, kn_link); if (kn->kn_status & KN_QUEUED) knote_dequeue(kn); if (kn->kn_fop->f_isfd) - fdrop(kn->kn_fp, td); + fdrop_locked(kn->kn_fp, td); knote_free(kn); } static void knote_enqueue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; int s = splhigh(); KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued")); TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe); kn->kn_status |= KN_QUEUED; kq->kq_count++; splx(s); kqueue_wakeup(kq); } static void knote_dequeue(struct knote *kn) { struct kqueue *kq = kn->kn_kq; int s = splhigh(); KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued")); TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe); kn->kn_status &= ~KN_QUEUED; kq->kq_count--; splx(s); } static void knote_init(void) { knote_zone = zinit("KNOTE", sizeof(struct knote), 0, 0, 1); } SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL) static struct knote * knote_alloc(void) { return ((struct knote *)zalloc(knote_zone)); } static void knote_free(struct knote *kn) { zfree(knote_zone, kn); } Index: head/sys/kern/kern_exec.c =================================================================== --- head/sys/kern/kern_exec.c (revision 89305) +++ head/sys/kern/kern_exec.c (revision 89306) @@ -1,986 +1,989 @@ /* * Copyright (c) 1993, David Greenman * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments"); static MALLOC_DEFINE(M_ATEXEC, "atexec", "atexec callback"); /* * callout list for things to do at exec time */ struct execlist { execlist_fn function; TAILQ_ENTRY(execlist) next; }; TAILQ_HEAD(exec_list_head, execlist); static struct exec_list_head exec_list = TAILQ_HEAD_INITIALIZER(exec_list); static register_t *exec_copyout_strings __P((struct image_params *)); /* XXX This should be vm_size_t. */ static u_long ps_strings = PS_STRINGS; SYSCTL_ULONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, 0, ""); /* XXX This should be vm_size_t. 
*/ static u_long usrstack = USRSTACK; SYSCTL_ULONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, 0, ""); u_long ps_arg_cache_limit = PAGE_SIZE / 16; SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, &ps_arg_cache_limit, 0, ""); int ps_argsopen = 1; SYSCTL_INT(_kern, OID_AUTO, ps_argsopen, CTLFLAG_RW, &ps_argsopen, 0, ""); /* * Each of the items is a pointer to a `const struct execsw', hence the * double pointer here. */ static const struct execsw **execsw; #ifndef _SYS_SYSPROTO_H_ struct execve_args { char *fname; char **argv; char **envv; }; #endif /* * execve() system call. * * MPSAFE */ int execve(td, uap) struct thread *td; register struct execve_args *uap; { struct proc *p = td->td_proc; struct nameidata nd, *ndp; struct ucred *newcred, *oldcred; register_t *stack_base; int error, len, i; struct image_params image_params, *imgp; struct vattr attr; int (*img_first) __P((struct image_params *)); struct pargs *pa; struct execlist *ep; imgp = &image_params; /* * Lock the process and set the P_INEXEC flag to indicate that * it should be left alone until we're done here. This is * necessary to avoid race conditions - e.g. in ptrace() - * that might allow a local user to illicitly obtain elevated * privileges. */ mtx_lock(&Giant); PROC_LOCK(p); KASSERT((p->p_flag & P_INEXEC) == 0, ("%s(): process already has P_INEXEC flag", __func__)); p->p_flag |= P_INEXEC; PROC_UNLOCK(p); /* XXXKSE */ /* !!!!!!!! we need abort all the other threads of this process before we */ /* proceed beyond his point! 
*/ /* * Initialize part of the common data */ imgp->proc = p; imgp->uap = uap; imgp->attr = &attr; imgp->argc = imgp->envc = 0; imgp->argv0 = NULL; imgp->entry_addr = 0; imgp->vmspace_destroyed = 0; imgp->interpreted = 0; imgp->interpreter_name[0] = '\0'; imgp->auxargs = NULL; imgp->vp = NULL; imgp->firstpage = NULL; imgp->ps_strings = 0; imgp->auxarg_size = 0; /* * Allocate temporary demand zeroed space for argument and * environment strings */ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); if (imgp->stringbase == NULL) { error = ENOMEM; goto exec_fail; } imgp->stringp = imgp->stringbase; imgp->stringspace = ARG_MAX; imgp->image_header = imgp->stringbase + ARG_MAX; /* * Translate the file name. namei() returns a vnode pointer * in ni_vp amoung other things. */ ndp = &nd; NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_USERSPACE, uap->fname, td); interpret: error = namei(ndp); if (error) { kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); goto exec_fail; } imgp->vp = ndp->ni_vp; imgp->fname = uap->fname; /* * Check file permissions (also 'opens' file) */ error = exec_check_permissions(imgp); if (error) { VOP_UNLOCK(imgp->vp, 0, td); goto exec_fail_dealloc; } error = exec_map_first_page(imgp); VOP_UNLOCK(imgp->vp, 0, td); if (error) goto exec_fail_dealloc; /* * If the current process has a special image activator it * wants to try first, call it. For example, emulating shell * scripts differently. */ error = -1; if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL) error = img_first(imgp); /* * Loop through the list of image activators, calling each one. * An activator returns -1 if there is no match, 0 on success, * and an error otherwise. 
*/ for (i = 0; error == -1 && execsw[i]; ++i) { if (execsw[i]->ex_imgact == NULL || execsw[i]->ex_imgact == img_first) { continue; } error = (*execsw[i]->ex_imgact)(imgp); } if (error) { if (error == -1) error = ENOEXEC; goto exec_fail_dealloc; } /* * Special interpreter operation, cleanup and loop up to try to * activate the interpreter. */ if (imgp->interpreted) { exec_unmap_first_page(imgp); /* free name buffer and old vnode */ NDFREE(ndp, NDF_ONLY_PNBUF); vrele(ndp->ni_vp); /* set new name to that of the interpreter */ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, UIO_SYSSPACE, imgp->interpreter_name, td); goto interpret; } TAILQ_FOREACH(ep, &exec_list, next) (*ep->function)(p); /* * Copy out strings (args and env) and initialize stack base */ stack_base = exec_copyout_strings(imgp); p->p_vmspace->vm_minsaddr = (char *)stack_base; /* * If custom stack fixup routine present for this process * let it do the stack setup. * Else stuff argument count as first item on stack */ if (p->p_sysent->sv_fixup) (*p->p_sysent->sv_fixup)(&stack_base, imgp); else suword(--stack_base, imgp->argc); /* * For security and other reasons, the file descriptor table cannot * be shared after an exec. */ + FILEDESC_LOCK(p->p_fd); if (p->p_fd->fd_refcnt > 1) { struct filedesc *tmp; tmp = fdcopy(td); + FILEDESC_UNLOCK(p->p_fd); fdfree(td); p->p_fd = tmp; - } + } else + FILEDESC_UNLOCK(p->p_fd); /* * For security and other reasons, signal handlers cannot * be shared after an exec. The new process gets a copy of the old * handlers. In execsigs(), the new process will have its signals * reset. 
*/ if (p->p_procsig->ps_refcnt > 1) { struct procsig *newprocsig; MALLOC(newprocsig, struct procsig *, sizeof(struct procsig), M_SUBPROC, M_WAITOK); bcopy(p->p_procsig, newprocsig, sizeof(*newprocsig)); p->p_procsig->ps_refcnt--; p->p_procsig = newprocsig; p->p_procsig->ps_refcnt = 1; if (p->p_sigacts == &p->p_uarea->u_sigacts) panic("shared procsig but private sigacts?"); p->p_uarea->u_sigacts = *p->p_sigacts; p->p_sigacts = &p->p_uarea->u_sigacts; } /* Stop profiling */ stopprofclock(p); /* close files on exec */ fdcloseexec(td); /* reset caught signals */ execsigs(p); /* name this process - nameiexec(p, ndp) */ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); p->p_comm[len] = 0; /* * mark as execed, wakeup the process that vforked (if any) and tell * it that it now has its own resources back */ PROC_LOCK(p); p->p_flag |= P_EXEC; if (p->p_pptr && (p->p_flag & P_PPWAIT)) { p->p_flag &= ~P_PPWAIT; wakeup((caddr_t)p->p_pptr); } /* * Implement image setuid/setgid. * * Don't honor setuid/setgid if the filesystem prohibits it or if * the process is being traced. */ oldcred = p->p_ucred; newcred = NULL; if ((((attr.va_mode & VSUID) && oldcred->cr_uid != attr.va_uid) || ((attr.va_mode & VSGID) && oldcred->cr_gid != attr.va_gid)) && (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && (p->p_flag & P_TRACED) == 0) { PROC_UNLOCK(p); /* * Turn off syscall tracing for set-id programs, except for * root. Record any set-id flags first to make sure that * we do not regain any tracing during a possible block. */ setsugid(p); if (p->p_tracep && suser_xxx(oldcred, NULL, PRISON_ROOT)) { struct vnode *vtmp; if ((vtmp = p->p_tracep) != NULL) { p->p_tracep = NULL; p->p_traceflag = 0; vrele(vtmp); } } /* * Set the new credentials. 
*/ newcred = crdup(oldcred); if (attr.va_mode & VSUID) change_euid(newcred, attr.va_uid); if (attr.va_mode & VSGID) change_egid(newcred, attr.va_gid); setugidsafety(td); } else { if (oldcred->cr_uid == oldcred->cr_ruid && oldcred->cr_gid == oldcred->cr_rgid) p->p_flag &= ~P_SUGID; PROC_UNLOCK(p); } /* * Implement correct POSIX saved-id behavior. * * XXX: It's not clear that the existing behavior is * POSIX-compliant. A number of sources indicate that the saved * uid/gid should only be updated if the new ruid is not equal to * the old ruid, or the new euid is not equal to the old euid and * the new euid is not equal to the old ruid. The FreeBSD code * always updates the saved uid/gid. Also, this code uses the new * (replaced) euid and egid as the source, which may or may not be * the right ones to use. */ if (newcred == NULL) { if (oldcred->cr_svuid != oldcred->cr_uid || oldcred->cr_svgid != oldcred->cr_gid) { newcred = crdup(oldcred); change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); } } else { change_svuid(newcred, newcred->cr_uid); change_svgid(newcred, newcred->cr_gid); } if (newcred != NULL) { PROC_LOCK(p); p->p_ucred = newcred; PROC_UNLOCK(p); crfree(oldcred); } /* * Store the vp for use in procfs */ if (p->p_textvp) /* release old reference */ vrele(p->p_textvp); VREF(ndp->ni_vp); p->p_textvp = ndp->ni_vp; /* * Notify others that we exec'd, and clear the P_INEXEC flag * as we're now a bona fide freshly-execed process. */ PROC_LOCK(p); KNOTE(&p->p_klist, NOTE_EXEC); p->p_flag &= ~P_INEXEC; /* * If tracing the process, trap to debugger so breakpoints * can be set before the program executes. 
*/ _STOPEVENT(p, S_EXEC, 0); if (p->p_flag & P_TRACED) psignal(p, SIGTRAP); /* clear "fork but no exec" flag, as we _are_ execing */ p->p_acflag &= ~AFORK; /* Free any previous argument cache */ pa = p->p_args; p->p_args = NULL; PROC_UNLOCK(p); if (pa != NULL && --pa->ar_ref == 0) FREE(pa, M_PARGS); /* Set values passed into the program in registers. */ setregs(td, imgp->entry_addr, (u_long)(uintptr_t)stack_base, imgp->ps_strings); /* Cache arguments if they fit inside our allowance */ i = imgp->endargs - imgp->stringbase; if (ps_arg_cache_limit >= i + sizeof(struct pargs)) { MALLOC(pa, struct pargs *, sizeof(struct pargs) + i, M_PARGS, M_WAITOK); pa->ar_ref = 1; pa->ar_length = i; bcopy(imgp->stringbase, pa->ar_args, i); PROC_LOCK(p); p->p_args = pa; PROC_UNLOCK(p); } exec_fail_dealloc: /* * free various allocated resources */ if (imgp->firstpage) exec_unmap_first_page(imgp); if (imgp->stringbase != NULL) kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX + PAGE_SIZE); if (imgp->vp) { NDFREE(ndp, NDF_ONLY_PNBUF); vrele(imgp->vp); } if (error == 0) goto done2; exec_fail: /* we're done here, clear P_INEXEC */ PROC_LOCK(p); p->p_flag &= ~P_INEXEC; PROC_UNLOCK(p); if (imgp->vmspace_destroyed) { /* sorry, no more process anymore. 
exit gracefully */ exit1(td, W_EXITCODE(0, SIGABRT)); /* NOT REACHED */ error = 0; } done2: mtx_unlock(&Giant); return (error); } int exec_map_first_page(imgp) struct image_params *imgp; { int rv, i; int initial_pagein; vm_page_t ma[VM_INITIAL_PAGEIN]; vm_object_t object; GIANT_REQUIRED; if (imgp->firstpage) { exec_unmap_first_page(imgp); } VOP_GETVOBJECT(imgp->vp, &object); ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { initial_pagein = VM_INITIAL_PAGEIN; if (initial_pagein > object->size) initial_pagein = object->size; for (i = 1; i < initial_pagein; i++) { if ((ma[i] = vm_page_lookup(object, i)) != NULL) { if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) break; if (ma[i]->valid) break; vm_page_busy(ma[i]); } else { ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); if (ma[i] == NULL) break; } } initial_pagein = i; rv = vm_pager_get_pages(object, ma, initial_pagein, 0); ma[0] = vm_page_lookup(object, 0); if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { if (ma[0]) { vm_page_protect(ma[0], VM_PROT_NONE); vm_page_free(ma[0]); } return EIO; } } vm_page_wire(ma[0]); vm_page_wakeup(ma[0]); pmap_kenter((vm_offset_t) imgp->image_header, VM_PAGE_TO_PHYS(ma[0])); imgp->firstpage = ma[0]; return 0; } void exec_unmap_first_page(imgp) struct image_params *imgp; { GIANT_REQUIRED; if (imgp->firstpage) { pmap_kremove((vm_offset_t) imgp->image_header); vm_page_unwire(imgp->firstpage, 1); imgp->firstpage = NULL; } } /* * Destroy old address space, and allocate a new stack * The new stack is only SGROWSIZ large because it is grown * automatically in trap.c. 
 */
/*
 * exec_new_vmspace:
 *	Tear down the old process address space and set up a fresh stack
 *	for the image being exec'd.  Returns 0 on success or an errno
 *	from vm_map_stack().  Sets imgp->vmspace_destroyed so callers
 *	know there is no old image to fall back to on later failure.
 */
int
exec_new_vmspace(imgp)
	struct image_params *imgp;
{
	int error;
	struct vmspace *vmspace = imgp->proc->p_vmspace;
	/* Stack occupies the maxssiz bytes just below USRSTACK. */
	vm_offset_t stack_addr = USRSTACK - maxssiz;
	vm_map_t map = &vmspace->vm_map;

	GIANT_REQUIRED;

	/* Point of no return: the old address space is gone after this. */
	imgp->vmspace_destroyed = 1;

	/*
	 * Blow away entire process VM, if address space not shared,
	 * otherwise, create a new VM space so that other threads are
	 * not disrupted
	 */
	if (vmspace->vm_refcnt == 1) {
		if (vmspace->vm_shm)
			shmexit(imgp->proc);
		pmap_remove_pages(vmspace_pmap(vmspace), 0,
		    VM_MAXUSER_ADDRESS);
		vm_map_remove(map, 0, VM_MAXUSER_ADDRESS);
	} else {
		/* Shared: give this process its own copy instead. */
		vmspace_exec(imgp->proc);
		vmspace = imgp->proc->p_vmspace;
		map = &vmspace->vm_map;
	}

	/* Allocate a new stack */
	error = vm_map_stack(&vmspace->vm_map, stack_addr, (vm_size_t)maxssiz,
	    VM_PROT_ALL, VM_PROT_ALL, 0);
	if (error)
		return (error);

#ifdef __ia64__
	{
		/*
		 * Allocate backing store.  We really need something
		 * similar to vm_map_stack which can allow the backing
		 * store to grow upwards.  This will do for now.
		 */
		vm_offset_t bsaddr;
		bsaddr = USRSTACK - 2*maxssiz;
		error = vm_map_find(&vmspace->vm_map, 0, 0, &bsaddr,
		    4*PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0);
		imgp->proc->p_thread.td_md.md_bspstore = bsaddr;
	}
#endif

	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
	 * VM_STACK case, but they are still used to monitor the size of the
	 * process stack so we can check the stack rlimit.
	 */
	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
	vmspace->vm_maxsaddr = (char *)USRSTACK - maxssiz;

	return(0);
}

/*
 * Copy out argument and environment strings from the old process
 * address space into the temporary string buffer.
 */
/*
 * exec_extract_strings:
 *	Copy the user-space argv[] and envv[] string vectors into imgp's
 *	kernel string buffer, counting strings into imgp->argc/imgp->envc
 *	and tracking the remaining room in imgp->stringspace.  Returns 0,
 *	EFAULT on a bad user pointer, E2BIG if a string does not fit, or
 *	another copyinstr() error.
 */
int
exec_extract_strings(imgp)
	struct image_params *imgp;
{
	char **argv, **envv;
	char *argp, *envp;
	int error;
	size_t length;

	/*
	 * extract arguments first
	 */
	argv = imgp->uap->argv;
	if (argv) {
		argp = (caddr_t) (intptr_t) fuword(argv);
		if (argp == (caddr_t) -1)
			return (EFAULT);
		if (argp)
			argv++;
		/* An explicit argv0 (e.g. from an interpreter) overrides. */
		if (imgp->argv0)
			argp = imgp->argv0;
		if (argp) {
			do {
				if (argp == (caddr_t) -1)
					return (EFAULT);
				if ((error = copyinstr(argp, imgp->stringp,
				    imgp->stringspace, &length))) {
					if (error == ENAMETOOLONG)
						return(E2BIG);
					return (error);
				}
				imgp->stringspace -= length;
				imgp->stringp += length;
				imgp->argc++;
			} while ((argp = (caddr_t) (intptr_t)
			    fuword(argv++)));
		}
	}

	/* Mark where the arguments end and the environment begins. */
	imgp->endargs = imgp->stringp;

	/*
	 * extract environment strings
	 */
	envv = imgp->uap->envv;
	if (envv) {
		while ((envp = (caddr_t) (intptr_t) fuword(envv++))) {
			if (envp == (caddr_t) -1)
				return (EFAULT);
			if ((error = copyinstr(envp, imgp->stringp,
			    imgp->stringspace, &length))) {
				if (error == ENAMETOOLONG)
					return(E2BIG);
				return (error);
			}
			imgp->stringspace -= length;
			imgp->stringp += length;
			imgp->envc++;
		}
	}

	return (0);
}

/*
 * Copy strings out to the new process address space, constructing
 * new arg and env vector tables. Return a pointer to the base
 * so that it can be used as the initial stack pointer.
 */
register_t *
exec_copyout_strings(imgp)
	struct image_params *imgp;
{
	int argc, envc;
	char **vectp;
	char *stringp, *destp;
	register_t *stack_base;
	struct ps_strings *arginfo;
	int szsigcode;

	/*
	 * Calculate string base and vector table pointers.
	 * Also deal with signal trampoline code for this exec type.
	 */
	arginfo = (struct ps_strings *)PS_STRINGS;
	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
	destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
	    roundup((ARG_MAX - imgp->stringspace), sizeof(char *));

	/*
	 * install sigcode
	 */
	if (szsigcode)
		copyout(imgp->proc->p_sysent->sv_sigcode,
		    ((caddr_t)arginfo - szsigcode), szsigcode);

	/*
	 * If we have a valid auxargs ptr, prepare some room
	 * on the stack.
	 */
	if (imgp->auxargs) {
		/*
		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
		 * lower compatibility.
		 */
		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
		    : (AT_COUNT * 2);
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets,and imgp->auxarg_size is room
		 * for argument of Runtime loader.
		 */
		vectp = (char **) (destp - (imgp->argc + imgp->envc + 2 +
		    imgp->auxarg_size) * sizeof(char *));
	} else
		/*
		 * The '+ 2' is for the null pointers at the end of each of
		 * the arg and env vector sets
		 */
		vectp = (char **) (destp - (imgp->argc + imgp->envc + 2) *
		    sizeof(char *));

	/*
	 * vectp also becomes our initial stack base
	 */
	stack_base = (register_t *)vectp;

	stringp = imgp->stringbase;
	argc = imgp->argc;
	envc = imgp->envc;

	/*
	 * Copy out strings - arguments and environment.
	 */
	copyout(stringp, destp, ARG_MAX - imgp->stringspace);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
	suword(&arginfo->ps_nargvstr, argc);

	/*
	 * Fill in argument portion of vector table.
	 */
	for (; argc > 0; --argc) {
		suword(vectp++, (long)(intptr_t)destp);
		/* Advance destp past this NUL-terminated string. */
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	suword(vectp++, 0);

	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
	suword(&arginfo->ps_nenvstr, envc);

	/*
	 * Fill in environment portion of vector table.
	 */
	for (; envc > 0; --envc) {
		suword(vectp++, (long)(intptr_t)destp);
		while (*stringp++ != 0)
			destp++;
		destp++;
	}

	/* end of vector table is a null pointer */
	suword(vectp, 0);

	return (stack_base);
}

/*
 * Check permissions of file to execute.
 * Called with imgp->vp locked.
 * Return 0 for success or error code on failure.
 */
int
exec_check_permissions(imgp)
	struct image_params *imgp;
{
	struct proc *p = imgp->proc;
	struct vnode *vp = imgp->vp;
	struct vattr *attr = imgp->attr;
	int error;

	/* Get file attributes */
	error = VOP_GETATTR(vp, attr, p->p_ucred, curthread); /* XXXKSE */
	if (error)
		return (error);

	/*
	 * 1) Check if file execution is disabled for the filesystem that this
	 *	file resides on.
	 * 2) Insure that at least one execute bit is on - otherwise root
	 *	will always succeed, and we don't want to happen unless the
	 *	file really is executable.
	 * 3) Insure that the file is a regular file.
	 */
	if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
	    ((attr->va_mode & 0111) == 0) ||
	    (attr->va_type != VREG)) {
		return (EACCES);
	}

	/*
	 * Zero length files can't be exec'd
	 */
	if (attr->va_size == 0)
		return (ENOEXEC);

	/*
	 * Check for execute permission to file based on current credentials.
	 */
	error = VOP_ACCESS(vp, VEXEC, p->p_ucred, curthread); /* XXXKSE */
	if (error)
		return (error);

	/*
	 * Check number of open-for-writes on the file and deny execution
	 * if there are any.
	 */
	if (vp->v_writecount)
		return (ETXTBSY);

	/*
	 * Call filesystem specific open routine (which does nothing in the
	 * general case).
	 */
	error = VOP_OPEN(vp, FREAD, p->p_ucred, curthread); /* XXXKSE */
	if (error)
		return (error);

	return (0);
}

/*
 * Exec handler registration
 */
/*
 * exec_register:
 *	Append execsw_arg to the global execsw handler table by building
 *	a table one slot larger and swapping it in.  Returns 0 or ENOMEM.
 */
int
exec_register(execsw_arg)
	const struct execsw *execsw_arg;
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 2;	/* New slot and trailing NULL */

	if (execsw)
		for (es = execsw; *es; es++)
			count++;
	/*
	 * NOTE(review): with M_WAITOK, malloc() should not return NULL,
	 * so the ENOMEM checks below look vestigial -- confirm.
	 */
	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	if (newexecsw == NULL)
		return ENOMEM;
	xs = newexecsw;
	if (execsw)
		for (es = execsw; *es; es++)
			*xs++ = *es;
	*xs++ = execsw_arg;
	*xs = NULL;	/* table is NULL-terminated */
	if (execsw)
		free(execsw, M_TEMP);
	execsw = newexecsw;
	return 0;
}

/*
 * exec_unregister:
 *	Remove execsw_arg from the global execsw table, rebuilding the
 *	table without it.  Returns 0 on success or ENOENT if absent.
 */
int
exec_unregister(execsw_arg)
	const struct execsw *execsw_arg;
{
	const struct execsw **es, **xs, **newexecsw;
	int count = 1;

	if (execsw == NULL)
		panic("unregister with no handlers left?\n");

	/* Verify the handler is actually registered. */
	for (es = execsw; *es; es++) {
		if (*es == execsw_arg)
			break;
	}
	if (*es == NULL)
		return ENOENT;
	for (es = execsw; *es; es++)
		if (*es != execsw_arg)
			count++;
	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
	if (newexecsw == NULL)
		return ENOMEM;
	xs = newexecsw;
	for (es = execsw; *es; es++)
		if (*es != execsw_arg)
			*xs++ = *es;
	*xs = NULL;
	if (execsw)
		free(execsw, M_TEMP);
	execsw = newexecsw;
	return 0;
}

/*
 * at_exec:
 *	Register a callout to be invoked at exec time.  Returns 0 on
 *	success or ENOMEM (allocation is M_NOWAIT and may fail).
 */
int
at_exec(function)
	execlist_fn function;
{
	struct execlist *ep;

#ifdef INVARIANTS
	/* Be noisy if the programmer has lost track of things */
	if (rm_at_exec(function))
		printf("WARNING: exec callout entry (%p) already present\n",
		    function);
#endif
	ep = malloc(sizeof(*ep), M_ATEXEC, M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->function = function;
	TAILQ_INSERT_TAIL(&exec_list, ep, next);
	return (0);
}

/*
 * Scan the exec callout list for the given item and remove it.
 * Returns the number of items removed (0 or 1)
 */
int
rm_at_exec(function)
	execlist_fn function;
{
	struct execlist *ep;

	TAILQ_FOREACH(ep, &exec_list, next) {
		if (ep->function == function) {
			TAILQ_REMOVE(&exec_list, ep, next);
			free(ep, M_ATEXEC);
			return(1);
		}
	}
	return (0);
}
Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c	(revision 89305)
+++ head/sys/kern/kern_fork.c	(revision 89306)
@@ -1,820 +1,826 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_fork.c	8.6 (Berkeley) 4/8/94
 * $FreeBSD$
 */

#include "opt_ktrace.h"

/*
 * NOTE(review): the header names on the following #include lines were
 * lost when this text was extracted (angle-bracket contents stripped);
 * restore them from the repository copy of kern_fork.c.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static MALLOC_DEFINE(M_ATFORK, "atfork", "atfork callback");

/*
 * These are the structures used to create a callout list for things to do
 * when forking a process
 */
struct forklist {
	forklist_fn function;
	TAILQ_ENTRY(forklist) next;
};

/* Protects fork_list below. */
static struct sx fork_list_lock;

TAILQ_HEAD(forklist_head, forklist);
static struct forklist_head fork_list = TAILQ_HEAD_INITIALIZER(fork_list);

#ifndef _SYS_SYSPROTO_H_
struct fork_args {
	int	dummy;
};
#endif

/* Initialize the sx lock protecting the fork callout list at boot. */
static void
init_fork_list(void *data __unused)
{

	sx_init(&fork_list_lock, "fork list");
}
SYSINIT(fork_list, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_fork_list, NULL);

/*
 * MPSAFE
 */
/* ARGSUSED */
int
fork(td, uap)
	struct thread *td;
	struct fork_args *uap;
{
	int error;
	struct proc *p2;

	mtx_lock(&Giant);
	error = fork1(td, RFFDG | RFPROC, &p2);
	if (error == 0) {
		/* Parent's return value is the child's pid. */
		td->td_retval[0] = p2->p_pid;
		td->td_retval[1] = 0;
	}
	mtx_unlock(&Giant);
	return error;
}

/*
 * MPSAFE
 */
/* ARGSUSED */
int
vfork(td, uap)
	struct thread *td;
	struct vfork_args *uap;
{
	int error;
	struct proc *p2;

	mtx_lock(&Giant);
	/*
	 * vfork adds RFPPWAIT | RFMEM to the plain fork flags; fork1()
	 * sleeps on the child's P_PPWAIT flag before returning.
	 */
	error = fork1(td, RFFDG | RFPROC | RFPPWAIT | RFMEM, &p2);
	if (error == 0) {
		td->td_retval[0] = p2->p_pid;
		td->td_retval[1] = 0;
	}
	mtx_unlock(&Giant);
	return error;
}

/*
 * MPSAFE
 */
int
rfork(td, uap)
	struct thread *td;
	struct rfork_args *uap;
{
	int error;
	struct proc *p2;

	/* Don't allow kernel only flags. */
	if ((uap->flags & RFKERNELONLY) != 0)
		return (EINVAL);
	mtx_lock(&Giant);
	error = fork1(td, uap->flags, &p2);
	if (error == 0) {
		/* fork1() may return no child when RFPROC is clear. */
		td->td_retval[0] = p2 ? p2->p_pid : 0;
		td->td_retval[1] = 0;
	}
	mtx_unlock(&Giant);
	return error;
}

int	nprocs = 1;		/* process 0 */
int	lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
    "Last used PID");

/*
 * Random component to lastpid generation.  We mix in a random factor to make
 * it a little harder to predict.  We sanity check the modulus value to avoid
 * doing it in critical paths.  Don't let it be too small or we pointlessly
 * waste randomness entropy, and don't let it be impossibly large.  Using a
 * modulus that is too big causes a LOT more process table scans and slows
 * down fork processing as the pidchecked caching is defeated.
*/ static int randompid = 0; static int sysctl_kern_randompid(SYSCTL_HANDLER_ARGS) { int error, pid; pid = randompid; error = sysctl_handle_int(oidp, &pid, 0, req); if (error || !req->newptr) return (error); if (pid < 0 || pid > PID_MAX - 100) /* out of range */ pid = PID_MAX - 100; else if (pid < 2) /* NOP */ pid = 0; else if (pid < 100) /* Make it reasonable */ pid = 100; randompid = pid; return (error); } SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_kern_randompid, "I", "Random PID modulus"); #if 0 void kse_init(struct kse *kse1, struct kse *kse2) { } void thread_init(struct thread *thread1, struct thread *thread2) { } void ksegrp_init(struct ksegrp *ksegrp1, struct ksegrp *ksegrp2) { } #endif int fork1(td, flags, procp) struct thread *td; /* parent proc */ int flags; struct proc **procp; /* child proc */ { struct proc *p2, *pptr; uid_t uid; struct proc *newproc; int trypid; int ok; static int pidchecked = 0; struct forklist *ep; struct filedesc *fd; struct proc *p1 = td->td_proc; GIANT_REQUIRED; /* Can't copy and clear */ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) return (EINVAL); /* * Here we don't create a new process, but we divorce * certain parts of a process from itself. */ if ((flags & RFPROC) == 0) { vm_forkproc(td, 0, flags); /* * Close all file descriptors. */ if (flags & RFCFDG) { struct filedesc *fdtmp; fdtmp = fdinit(td); /* XXXKSE */ PROC_LOCK(p1); fdfree(td); /* XXXKSE */ p1->p_fd = fdtmp; PROC_UNLOCK(p1); } /* * Unshare file descriptors (from parent.) */ if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); if (p1->p_fd->fd_refcnt > 1) { struct filedesc *newfd; + newfd = fdcopy(td); + FILEDESC_UNLOCK(p1->p_fd); PROC_LOCK(p1); fdfree(td); p1->p_fd = newfd; PROC_UNLOCK(p1); - } + } else + FILEDESC_UNLOCK(p1->p_fd); } *procp = NULL; return (0); } /* * Although process entries are dynamically created, we still keep * a global limit on the maximum number we will create. 
Don't allow * a nonprivileged user to use the last process; don't let root * exceed the limit. The variable nprocs is the current number of * processes, maxproc is the limit. */ uid = p1->p_ucred->cr_ruid; if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) { tablefull("proc"); return (EAGAIN); } /* * Increment the nprocs resource before blocking can occur. There * are hard-limits as to the number of processes that can run. */ nprocs++; /* * Increment the count of procs running with this uid. Don't allow * a nonprivileged user to exceed their current limit. */ ok = chgproccnt(p1->p_ucred->cr_ruidinfo, 1, (uid != 0) ? p1->p_rlimit[RLIMIT_NPROC].rlim_cur : 0); if (!ok) { /* * Back out the process count */ nprocs--; return (EAGAIN); } /* Allocate new proc. */ newproc = zalloc(proc_zone); /* * Setup linkage for kernel based threading */ if((flags & RFTHREAD) != 0) { newproc->p_peers = p1->p_peers; p1->p_peers = newproc; newproc->p_leader = p1->p_leader; } else { newproc->p_peers = NULL; newproc->p_leader = newproc; } newproc->p_vmspace = NULL; /* * Find an unused process ID. We remember a range of unused IDs * ready to use (from lastpid+1 through pidchecked-1). * * If RFHIGHPID is set (used during system boot), do not allocate * low-numbered pids. */ sx_xlock(&allproc_lock); trypid = lastpid + 1; if (flags & RFHIGHPID) { if (trypid < 10) { trypid = 10; } } else { if (randompid) trypid += arc4random() % randompid; } retry: /* * If the process ID prototype has wrapped around, * restart somewhat above 0, as the low-numbered procs * tend to include daemons that don't exit. */ if (trypid >= PID_MAX) { trypid = trypid % PID_MAX; if (trypid < 100) trypid += 100; pidchecked = 0; } if (trypid >= pidchecked) { int doingzomb = 0; pidchecked = PID_MAX; /* * Scan the active and zombie procs to check whether this pid * is in use. Remember the lowest pid that's greater * than trypid, so we can avoid checking for a while. 
*/ p2 = LIST_FIRST(&allproc); again: for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) { while (p2->p_pid == trypid || p2->p_pgrp->pg_id == trypid || p2->p_session->s_sid == trypid) { trypid++; if (trypid >= pidchecked) goto retry; } if (p2->p_pid > trypid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; if (p2->p_pgrp->pg_id > trypid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; if (p2->p_session->s_sid > trypid && pidchecked > p2->p_session->s_sid) pidchecked = p2->p_session->s_sid; } if (!doingzomb) { doingzomb = 1; p2 = LIST_FIRST(&zombproc); goto again; } } /* * RFHIGHPID does not mess with the lastpid counter during boot. */ if (flags & RFHIGHPID) pidchecked = 0; else lastpid = trypid; p2 = newproc; p2->p_stat = SIDL; /* protect against others */ p2->p_pid = trypid; LIST_INSERT_HEAD(&allproc, p2, p_list); LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); sx_xunlock(&allproc_lock); /* * Make a proc table entry for the new process. * Start by zeroing the section of proc that is zero-initialized, * then copy the section that is copied directly from the parent. 
*/ bzero(&p2->p_startzero, (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero)); bzero(&p2->p_kse.ke_startzero, (unsigned) ((caddr_t)&p2->p_kse.ke_endzero - (caddr_t)&p2->p_kse.ke_startzero)); bzero(&p2->p_thread.td_startzero, (unsigned) ((caddr_t)&p2->p_thread.td_endzero - (caddr_t)&p2->p_thread.td_startzero)); bzero(&p2->p_ksegrp.kg_startzero, (unsigned) ((caddr_t)&p2->p_ksegrp.kg_endzero - (caddr_t)&p2->p_ksegrp.kg_startzero)); PROC_LOCK(p1); bcopy(&p1->p_startcopy, &p2->p_startcopy, (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy)); bcopy(&p1->p_kse.ke_startcopy, &p2->p_kse.ke_startcopy, (unsigned) ((caddr_t)&p2->p_kse.ke_endcopy - (caddr_t)&p2->p_kse.ke_startcopy)); bcopy(&p1->p_thread.td_startcopy, &p2->p_thread.td_startcopy, (unsigned) ((caddr_t)&p2->p_thread.td_endcopy - (caddr_t)&p2->p_thread.td_startcopy)); bcopy(&p1->p_ksegrp.kg_startcopy, &p2->p_ksegrp.kg_startcopy, (unsigned) ((caddr_t)&p2->p_ksegrp.kg_endcopy - (caddr_t)&p2->p_ksegrp.kg_startcopy)); PROC_UNLOCK(p1); /* * XXXKSE Theoretically only the running thread would get copied * Others in the kernel would be 'aborted' in the child. * i.e return E*something* */ proc_linkup(p2); mtx_init(&p2->p_mtx, "process lock", MTX_DEF); PROC_LOCK(p2); /* note.. XXXKSE no pcb or u-area yet */ /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_forkproc. 
*/ p2->p_flag = 0; mtx_lock_spin(&sched_lock); p2->p_sflag = PS_INMEM; if (p1->p_sflag & PS_PROFIL) startprofclock(p2); mtx_unlock_spin(&sched_lock); PROC_LOCK(p1); p2->p_ucred = crhold(p1->p_ucred); p2->p_thread.td_ucred = crhold(p2->p_ucred); /* XXXKSE */ if (p2->p_args) p2->p_args->ar_ref++; if (flags & RFSIGSHARE) { p2->p_procsig = p1->p_procsig; p2->p_procsig->ps_refcnt++; if (p1->p_sigacts == &p1->p_uarea->u_sigacts) { struct sigacts *newsigacts; PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* Create the shared sigacts structure */ MALLOC(newsigacts, struct sigacts *, sizeof(struct sigacts), M_SUBPROC, M_WAITOK); PROC_LOCK(p2); PROC_LOCK(p1); /* * Set p_sigacts to the new shared structure. * Note that this is updating p1->p_sigacts at the * same time, since p_sigacts is just a pointer to * the shared p_procsig->ps_sigacts. */ p2->p_sigacts = newsigacts; *p2->p_sigacts = p1->p_uarea->u_sigacts; } } else { PROC_UNLOCK(p1); PROC_UNLOCK(p2); MALLOC(p2->p_procsig, struct procsig *, sizeof(struct procsig), M_SUBPROC, M_WAITOK); PROC_LOCK(p2); PROC_LOCK(p1); bcopy(p1->p_procsig, p2->p_procsig, sizeof(*p2->p_procsig)); p2->p_procsig->ps_refcnt = 1; p2->p_sigacts = NULL; /* finished in vm_forkproc() */ } if (flags & RFLINUXTHPN) p2->p_sigparent = SIGUSR1; else p2->p_sigparent = SIGCHLD; /* bump references to the text vnode (for procfs) */ p2->p_textvp = p1->p_textvp; PROC_UNLOCK(p1); PROC_UNLOCK(p2); if (p2->p_textvp) VREF(p2->p_textvp); if (flags & RFCFDG) fd = fdinit(td); - else if (flags & RFFDG) + else if (flags & RFFDG) { + FILEDESC_LOCK(p1->p_fd); fd = fdcopy(td); - else + FILEDESC_UNLOCK(p1->p_fd); + } else fd = fdshare(p1); PROC_LOCK(p2); p2->p_fd = fd; /* * If p_limit is still copy-on-write, bump refcnt, * otherwise get a copy that won't be modified. * (If PL_SHAREMOD is clear, the structure is shared * copy-on-write.) 
*/ PROC_LOCK(p1); if (p1->p_limit->p_lflags & PL_SHAREMOD) p2->p_limit = limcopy(p1->p_limit); else { p2->p_limit = p1->p_limit; p2->p_limit->p_refcnt++; } /* * Preserve some more flags in subprocess. PS_PROFIL has already * been preserved. */ p2->p_flag |= p1->p_flag & (P_SUGID | P_ALTSTACK); if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; LIST_INSERT_AFTER(p1, p2, p_pglist); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * Attach the new process to its parent. * * If RFNOWAIT is set, the newly created process becomes a child * of init. This effectively disassociates the child from the * parent. */ if (flags & RFNOWAIT) pptr = initproc; else pptr = p1; sx_xlock(&proctree_lock); PROC_LOCK(p2); p2->p_pptr = pptr; PROC_UNLOCK(p2); LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); sx_xunlock(&proctree_lock); PROC_LOCK(p2); LIST_INIT(&p2->p_children); LIST_INIT(&p2->p_thread.td_contested); /* XXXKSE only 1 thread? */ callout_init(&p2->p_itcallout, 0); callout_init(&p2->p_thread.td_slpcallout, 1); /* XXXKSE */ PROC_LOCK(p1); #ifdef KTRACE /* * Copy traceflag and tracefile if enabled. If not inherited, * these were zeroed above but we still could have a trace race * so make sure p2's p_tracep is NULL. */ if ((p1->p_traceflag & KTRFAC_INHERIT) && p2->p_tracep == NULL) { p2->p_traceflag = p1->p_traceflag; if ((p2->p_tracep = p1->p_tracep) != NULL) { PROC_UNLOCK(p1); PROC_UNLOCK(p2); VREF(p2->p_tracep); PROC_LOCK(p2); PROC_LOCK(p1); } } #endif /* * set priority of child to be that of parent * XXXKSE hey! copying the estcpu seems dodgy.. should split it.. */ mtx_lock_spin(&sched_lock); p2->p_ksegrp.kg_estcpu = p1->p_ksegrp.kg_estcpu; mtx_unlock_spin(&sched_lock); /* * This begins the section where we must prevent the parent * from being swapped. */ _PHOLD(p1); PROC_UNLOCK(p1); PROC_UNLOCK(p2); /* * Finish creating the child process. It will return via a different * execution path later. 
(ie: directly into user mode) */ vm_forkproc(td, p2, flags); if (flags == (RFFDG | RFPROC)) { cnt.v_forks++; cnt.v_forkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) { cnt.v_vforks++; cnt.v_vforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else if (p1 == &proc0) { cnt.v_kthreads++; cnt.v_kthreadpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } else { cnt.v_rforks++; cnt.v_rforkpages += p2->p_vmspace->vm_dsize + p2->p_vmspace->vm_ssize; } /* * Both processes are set up, now check if any loadable modules want * to adjust anything. * What if they have an error? XXX */ sx_slock(&fork_list_lock); TAILQ_FOREACH(ep, &fork_list, next) { (*ep->function)(p1, p2, flags); } sx_sunlock(&fork_list_lock); /* * If RFSTOPPED not requested, make child runnable and add to * run queue. */ microtime(&(p2->p_stats->p_start)); p2->p_acflag = AFORK; if ((flags & RFSTOPPED) == 0) { mtx_lock_spin(&sched_lock); p2->p_stat = SRUN; setrunqueue(&p2->p_thread); mtx_unlock_spin(&sched_lock); } /* * Now can be swapped. */ PROC_LOCK(p1); _PRELE(p1); /* * tell any interested parties about the new process */ KNOTE(&p1->p_klist, NOTE_FORK | p2->p_pid); PROC_UNLOCK(p1); /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ PROC_LOCK(p2); while (p2->p_flag & P_PPWAIT) msleep(p1, &p2->p_mtx, PWAIT, "ppwait", 0); PROC_UNLOCK(p2); /* * Return child proc pointer to parent. */ *procp = p2; return (0); } /* * The next two functionms are general routines to handle adding/deleting * items on the fork callout list. * * at_fork(): * Take the arguments given and put them onto the fork callout list, * However first make sure that it's not already there. * Returns 0 on success or a standard error number. 
 */
/*
 * at_fork:
 *	Register a callout to be invoked by fork1() for each new process.
 *	Returns 0 on success or ENOMEM (allocation is M_NOWAIT).
 */
int
at_fork(function)
	forklist_fn function;
{
	struct forklist *ep;

#ifdef INVARIANTS
	/* let the programmer know if he's been stupid */
	if (rm_at_fork(function))
		printf("WARNING: fork callout entry (%p) already present\n",
		    function);
#endif
	ep = malloc(sizeof(*ep), M_ATFORK, M_NOWAIT);
	if (ep == NULL)
		return (ENOMEM);
	ep->function = function;
	sx_xlock(&fork_list_lock);
	TAILQ_INSERT_TAIL(&fork_list, ep, next);
	sx_xunlock(&fork_list_lock);
	return (0);
}

/*
 * Scan the exit callout list for the given item and remove it..
 * Returns the number of items removed (0 or 1)
 */
int
rm_at_fork(function)
	forklist_fn function;
{
	struct forklist *ep;

	sx_xlock(&fork_list_lock);
	TAILQ_FOREACH(ep, &fork_list, next) {
		if (ep->function == function) {
			TAILQ_REMOVE(&fork_list, ep, next);
			/* Drop the lock before freeing; entry is unlinked. */
			sx_xunlock(&fork_list_lock);
			free(ep, M_ATFORK);
			return(1);
		}
	}
	sx_xunlock(&fork_list_lock);
	return (0);
}

/*
 * Handle the return of a child process from fork1().  This function
 * is called from the MD fork_trampoline() entry point.
 */
void
fork_exit(callout, arg, frame)
	void (*callout)(void *, struct trapframe *);
	void *arg;
	struct trapframe *frame;
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;

	td->td_kse->ke_oncpu = PCPU_GET(cpuid);
	/*
	 * Setup the sched_lock state so that we can release it.
	 */
	sched_lock.mtx_lock = (uintptr_t)td;
	sched_lock.mtx_recurse = 0;
	td->td_critnest = 1;
	td->td_savecrit = CRITICAL_FORK;
	CTR3(KTR_PROC, "fork_exit: new proc %p (pid %d, %s)", p, p->p_pid,
	    p->p_comm);
	/* Record switch time/ticks for the new thread's accounting. */
	if (PCPU_GET(switchtime.tv_sec) == 0)
		microuptime(PCPU_PTR(switchtime));
	PCPU_SET(switchticks, ticks);
	mtx_unlock_spin(&sched_lock);

	/*
	 * cpu_set_fork_handler intercepts this function call to
	 * have this call a non-return function to stay in kernel mode.
	 * initproc has its own fork handler, but it does return.
	 */
	KASSERT(callout != NULL, ("NULL callout in fork_exit"));
	callout(arg, frame);

	/*
	 * Check if a kernel thread misbehaved and returned from its main
	 * function.
	 */
	PROC_LOCK(p);
	if (p->p_flag & P_KTHREAD) {
		PROC_UNLOCK(p);
		mtx_lock(&Giant);
		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
		    p->p_comm, p->p_pid);
		/* Does not return. */
		kthread_exit(0);
	}
	PROC_UNLOCK(p);
	mtx_lock(&Giant);
	crfree(td->td_ucred);
	mtx_unlock(&Giant);
	td->td_ucred = NULL;
	mtx_assert(&Giant, MA_NOTOWNED);
}

/*
 * Simplified back end of syscall(), used when returning from fork()
 * directly into user mode.  Giant is not held on entry, and must not
 * be held on return.  This function is passed in to fork_exit() as the
 * first parameter and is called when returning to a new userland process.
 */
void
fork_return(td, frame)
	struct thread *td;
	struct trapframe *frame;
{

	userret(td, frame, 0);
#ifdef KTRACE
	if (KTRPOINT(td->td_proc, KTR_SYSRET)) {
		ktrsysret(td->td_proc->p_tracep, SYS_fork, 0, 0);
	}
#endif
	mtx_assert(&Giant, MA_NOTOWNED);
}
Index: head/sys/kern/subr_acl_posix1e.c
===================================================================
--- head/sys/kern/subr_acl_posix1e.c	(revision 89305)
+++ head/sys/kern/subr_acl_posix1e.c	(revision 89306)
@@ -1,817 +1,821 @@
/*-
 * Copyright (c) 1999-2001 Robert N. M. Watson
 * All rights reserved.
 *
 * This software was developed by Robert Watson for the TrustedBSD Project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include "opt_cap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. 
*/ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC entry that matches but has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK * and ACL_OTHER entries, preventing some future iterations. */ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * An ACL_OTHER entry should always exist in a valid access * ACL. If it doesn't, then generate a serious failure. 
For now, * this means a debugging message and EPERM, but in the future * should probably be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. */ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * Iterate through user ACL entries. Do checks twice, first * without privilege, and then if a match is found but failed, * a second time with privilege. */ /* * Check ACL_USER ACL entries. */ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we should try again with any * available privilege, or if we should move on to ACL_OTHER. 
*/ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); default: } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. 
*/ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, type, &inkernacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETACL(vp, type, &inkernelacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_proc->p_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. 
*/ /* * Given a file path, get an ACL for it * * MPSAFE */ int __acl_get_file(struct thread *td, struct __acl_get_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, set an ACL for it * * MPSAFE */ int __acl_set_file(struct thread *td, struct __acl_set_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, get an ACL for it * * MPSAFE */ int __acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_get_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, set an ACL for it * * MPSAFE */ int __acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_set_acl(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. 
* * MPSAFE */ int __acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_delete(td, nd.ni_vp, SCARG(uap, type)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file path, delete an ACL from it. * * MPSAFE */ int __acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_delete(td, (struct vnode *)fp->f_data, SCARG(uap, type)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } /* * Given a file path, check an ACL for it * * MPSAFE */ int __acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap) { struct nameidata nd; int error; mtx_lock(&Giant); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error == 0) { error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type), SCARG(uap, aclp)); NDFREE(&nd, 0); } mtx_unlock(&Giant); return (error); } /* * Given a file descriptor, check an ACL for it * * MPSAFE */ int __acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap) { struct file *fp; int error; mtx_lock(&Giant); error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp); if (error == 0) { error = vacl_aclcheck(td, (struct vnode *)fp->f_data, SCARG(uap, type), SCARG(uap, aclp)); + fdrop(fp, td); } mtx_unlock(&Giant); return (error); } Index: head/sys/kern/sys_generic.c =================================================================== --- head/sys/kern/sys_generic.c (revision 89305) +++ head/sys/kern/sys_generic.c (revision 89306) @@ -1,1298 +1,1357 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. 
* All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 * $FreeBSD$ */ #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); MALLOC_DEFINE(M_IOV, "iov", "large iov's"); static int pollscan __P((struct thread *, struct pollfd *, u_int)); static int pollholddrop __P((struct thread *, struct pollfd *, u_int, int)); static int selscan __P((struct thread *, fd_mask **, fd_mask **, int)); static int selholddrop __P((struct thread *, fd_mask *, fd_mask *, int, int)); static int dofileread __P((struct thread *, struct file *, int, void *, size_t, off_t, int)); static int dofilewrite __P((struct thread *, struct file *, int, const void *, size_t, off_t, int)); +struct file* +holdfp(fdp, fd, flag) + struct filedesc* fdp; + int fd, flag; +{ + struct file* fp; + + FILEDESC_LOCK(fdp); + if (((u_int)fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) { + FILEDESC_UNLOCK(fdp); + return (NULL); + } + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + if ((fp->f_flag & flag) == 0) { + FILE_UNLOCK(fp); + return (NULL); + } + fp->f_count++; + FILE_UNLOCK(fp); + return (fp); +} + /* * Read system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct read_args { int fd; void *buf; size_t nbyte; }; #endif /* * MPSAFE */ int read(td, uap) struct thread *td; struct read_args *uap; { struct file *fp; int error; mtx_lock(&Giant); if ((error = fget_read(td, uap->fd, &fp)) == 0) { error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0); fdrop(fp, td); } mtx_unlock(&Giant); return(error); } /* * Pread system call */ #ifndef _SYS_SYSPROTO_H_ struct pread_args { int fd; void *buf; size_t nbyte; int pad; off_t offset; }; #endif /* * MPSAFE */ int pread(td, uap) struct thread *td; struct pread_args *uap; { struct file *fp; int error; - mtx_lock(&Giant); - if ((error = fget_read(td, uap->fd, &fp)) == 0) { - if (fp->f_type == DTYPE_VNODE) { - error = dofileread(td, fp, uap->fd, uap->buf, - uap->nbyte, uap->offset, FOF_OFFSET); - } else { - error = ESPIPE; - } - fdrop(fp, td); + fp = holdfp(td->td_proc->p_fd, uap->fd, FREAD); + if (fp == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) { + error = ESPIPE; + } else { + mtx_lock(&Giant); + error = dofileread(td, fp, uap->fd, uap->buf, uap->nbyte, + uap->offset, FOF_OFFSET); + mtx_unlock(&Giant); } - mtx_unlock(&Giant); + fdrop(fp, td); return(error); } /* * Code common for read and pread */ int dofileread(td, fp, fd, buf, nbyte, offset, flags) struct thread *td; struct file *fp; int fd, flags; void *buf; size_t nbyte; off_t offset; { struct uio auio; struct iovec aiov; long cnt, error = 0; #ifdef KTRACE struct iovec ktriov; struct uio ktruio; int didktr = 0; #endif aiov.iov_base = (caddr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; if (nbyte > INT_MAX) return (EINVAL); auio.uio_resid = nbyte; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; #ifdef KTRACE /* * if tracing, save a copy of iovec */ if (KTRPOINT(td->td_proc, KTR_GENIO)) { ktriov = aiov; ktruio = auio; didktr = 1; } #endif cnt = nbyte; if ((error = fo_read(fp, &auio, fp->f_cred, 
flags, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio.uio_resid; #ifdef KTRACE if (didktr && error == 0) { ktruio.uio_iov = &ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, fd, UIO_READ, &ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Scatter read system call. */ #ifndef _SYS_SYSPROTO_H_ struct readv_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif /* * MPSAFE */ int readv(td, uap) struct thread *td; struct readv_args *uap; { struct file *fp; struct uio auio; struct iovec *iov; struct iovec *needfree; struct iovec aiov[UIO_SMALLIOV]; long i, cnt, error = 0; u_int iovlen; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif mtx_lock(&Giant); if ((error = fget_read(td, uap->fd, &fp)) != 0) goto done2; /* note: can't use iovlen until iovcnt is validated */ iovlen = uap->iovcnt * sizeof (struct iovec); if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { error = EINVAL; goto done2; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else { iov = aiov; needfree = NULL; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = -1; if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < uap->iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE /* * if tracing, save a copy of iovec */ if (KTRPOINT(td->td_proc, KTR_GENIO)) { MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif cnt = auio.uio_resid; if ((error = fo_read(fp, &auio, fp->f_cred, 0, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } cnt -= auio.uio_resid; 
#ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_READ, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif td->td_retval[0] = cnt; done: fdrop(fp, td); if (needfree) FREE(needfree, M_IOV); done2: mtx_unlock(&Giant); return (error); } /* * Write system call */ #ifndef _SYS_SYSPROTO_H_ struct write_args { int fd; const void *buf; size_t nbyte; }; #endif /* * MPSAFE */ int write(td, uap) struct thread *td; struct write_args *uap; { struct file *fp; int error; mtx_lock(&Giant); if ((error = fget_write(td, uap->fd, &fp)) == 0) { error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, (off_t)-1, 0); fdrop(fp, td); } else { error = EBADF; /* XXX this can't be right */ } mtx_unlock(&Giant); return(error); } /* * Pwrite system call */ #ifndef _SYS_SYSPROTO_H_ struct pwrite_args { int fd; const void *buf; size_t nbyte; int pad; off_t offset; }; #endif /* * MPSAFE */ int pwrite(td, uap) struct thread *td; struct pwrite_args *uap; { struct file *fp; int error; mtx_lock(&Giant); if ((error = fget_write(td, uap->fd, &fp)) == 0) { if (fp->f_type == DTYPE_VNODE) { error = dofilewrite(td, fp, uap->fd, uap->buf, uap->nbyte, uap->offset, FOF_OFFSET); } else { error = ESPIPE; } fdrop(fp, td); } else { error = EBADF; /* this can't be right */ } - mtx_unlock(&Giant); return(error); } static int dofilewrite(td, fp, fd, buf, nbyte, offset, flags) struct thread *td; struct file *fp; int fd, flags; const void *buf; size_t nbyte; off_t offset; { struct uio auio; struct iovec aiov; long cnt, error = 0; #ifdef KTRACE struct iovec ktriov; struct uio ktruio; int didktr = 0; #endif aiov.iov_base = (void *)(uintptr_t)buf; aiov.iov_len = nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = offset; if (nbyte > INT_MAX) return (EINVAL); auio.uio_resid = nbyte; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; #ifdef KTRACE /* * if tracing, save a copy of iovec 
and uio */ if (KTRPOINT(td->td_proc, KTR_GENIO)) { ktriov = aiov; ktruio = auio; didktr = 1; } #endif cnt = nbyte; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, &auio, fp->f_cred, flags, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio.uio_resid; #ifdef KTRACE if (didktr && error == 0) { ktruio.uio_iov = &ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, fd, UIO_WRITE, &ktruio, error); } #endif td->td_retval[0] = cnt; return (error); } /* * Gather write system call */ #ifndef _SYS_SYSPROTO_H_ struct writev_args { int fd; struct iovec *iovp; u_int iovcnt; }; #endif /* * MPSAFE */ int writev(td, uap) struct thread *td; register struct writev_args *uap; { struct file *fp; struct uio auio; register struct iovec *iov; struct iovec *needfree; struct iovec aiov[UIO_SMALLIOV]; long i, cnt, error = 0; u_int iovlen; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif mtx_lock(&Giant); if ((error = fget_write(td, uap->fd, &fp)) != 0) { error = EBADF; goto done2; } /* note: can't use iovlen until iovcnt is validated */ iovlen = uap->iovcnt * sizeof (struct iovec); if (uap->iovcnt > UIO_SMALLIOV) { if (uap->iovcnt > UIO_MAXIOV) { needfree = NULL; error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else { iov = aiov; needfree = NULL; } auio.uio_iov = iov; auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = -1; if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < uap->iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE /* * if tracing, save a copy of iovec and uio */ if 
(KTRPOINT(td->td_proc, KTR_GENIO)) { MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif cnt = auio.uio_resid; if (fp->f_type == DTYPE_VNODE) bwillwrite(); if ((error = fo_write(fp, &auio, fp->f_cred, 0, td))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } cnt -= auio.uio_resid; #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = cnt; ktrgenio(td->td_proc->p_tracep, uap->fd, UIO_WRITE, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif td->td_retval[0] = cnt; done: fdrop(fp, td); if (needfree) FREE(needfree, M_IOV); done2: mtx_unlock(&Giant); return (error); } /* * Ioctl system call */ #ifndef _SYS_SYSPROTO_H_ struct ioctl_args { int fd; u_long com; caddr_t data; }; #endif /* * MPSAFE */ /* ARGSUSED */ int ioctl(td, uap) struct thread *td; register struct ioctl_args *uap; { register struct file *fp; register struct filedesc *fdp; register u_long com; int error = 0; register u_int size; caddr_t data, memp; int tmp; #define STK_PARAMS 128 union { char stkbuf[STK_PARAMS]; long align; } ubuf; - mtx_lock(&Giant); - fdp = td->td_proc->p_fd; - if ((u_int)uap->fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL) { - error = EBADF; - goto done2; - } - + fp = ffind_hold(td, uap->fd); + if (fp == NULL) + return (EBADF); if ((fp->f_flag & (FREAD | FWRITE)) == 0) { - error = EBADF; - goto done2; + fdrop(fp, td); + return (EBADF); } - + fdp = td->td_proc->p_fd; switch (com = uap->com) { case FIONCLEX: + FILEDESC_LOCK(fdp); fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; - goto done2; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + return (0); case FIOCLEX: + FILEDESC_LOCK(fdp); fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; - goto done2; + FILEDESC_UNLOCK(fdp); + fdrop(fp, td); + return 
(0); } /* * Interpret high order word to find amount of data to be * copied to/from the user's address space. */ size = IOCPARM_LEN(com); if (size > IOCPARM_MAX) { - error = ENOTTY; - goto done2; + fdrop(fp, td); + return (ENOTTY); } - fhold(fp); - + mtx_lock(&Giant); memp = NULL; if (size > sizeof (ubuf.stkbuf)) { memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); data = memp; } else { data = ubuf.stkbuf; } if (com&IOC_IN) { if (size) { error = copyin(uap->data, data, (u_int)size); if (error) { if (memp) free(memp, M_IOCTLOPS); fdrop(fp, td); - goto done2; + goto done; } } else { *(caddr_t *)data = uap->data; } } else if ((com&IOC_OUT) && size) { /* * Zero the buffer so the user always * gets back something deterministic. */ bzero(data, size); } else if (com&IOC_VOID) { *(caddr_t *)data = uap->data; } switch (com) { case FIONBIO: + FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FNONBLOCK; else fp->f_flag &= ~FNONBLOCK; + FILE_UNLOCK(fp); error = fo_ioctl(fp, FIONBIO, (caddr_t)&tmp, td); break; case FIOASYNC: + FILE_LOCK(fp); if ((tmp = *(int *)data)) fp->f_flag |= FASYNC; else fp->f_flag &= ~FASYNC; + FILE_UNLOCK(fp); error = fo_ioctl(fp, FIOASYNC, (caddr_t)&tmp, td); break; default: error = fo_ioctl(fp, com, data, td); /* * Copy any data to user, size was * already set and checked above. */ if (error == 0 && (com&IOC_OUT) && size) error = copyout(data, uap->data, (u_int)size); break; } if (memp) free(memp, M_IOCTLOPS); fdrop(fp, td); -done2: +done: mtx_unlock(&Giant); return (error); } static int nselcoll; /* Select collisions since boot */ struct cv selwait; SYSCTL_INT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, ""); /* * Select system call. 
*/ #ifndef _SYS_SYSPROTO_H_ struct select_args { int nd; fd_set *in, *ou, *ex; struct timeval *tv; }; #endif /* * MPSAFE */ int select(td, uap) register struct thread *td; register struct select_args *uap; { + struct filedesc *fdp; /* * The magic 2048 here is chosen to be just enough for FD_SETSIZE * infds with the new FD_SETSIZE of 1024, and more than enough for * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE * of 256. */ fd_mask s_selbits[howmany(2048, NFDBITS)]; fd_mask s_heldbits[howmany(2048, NFDBITS)]; fd_mask *ibits[3], *obits[3], *selbits, *sbp, *heldbits, *hibits, *hobits; struct timeval atv, rtv, ttv; int ncoll, error, timo, i; u_int nbufbytes, ncpbytes, nfdbits; if (uap->nd < 0) return (EINVAL); - + fdp = td->td_proc->p_fd; mtx_lock(&Giant); + FILEDESC_LOCK(fdp); if (uap->nd > td->td_proc->p_fd->fd_nfiles) uap->nd = td->td_proc->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + FILEDESC_UNLOCK(fdp); /* * Allocate just enough bits for the non-null fd_sets. Use the * preallocated auto buffer if possible. */ nfdbits = roundup(uap->nd, NFDBITS); ncpbytes = nfdbits / NBBY; nbufbytes = 0; if (uap->in != NULL) nbufbytes += 2 * ncpbytes; if (uap->ou != NULL) nbufbytes += 2 * ncpbytes; if (uap->ex != NULL) nbufbytes += 2 * ncpbytes; if (nbufbytes <= sizeof s_selbits) selbits = &s_selbits[0]; else selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); if (2 * ncpbytes <= sizeof s_heldbits) { bzero(s_heldbits, sizeof(s_heldbits)); heldbits = &s_heldbits[0]; } else heldbits = malloc(2 * ncpbytes, M_SELECT, M_WAITOK | M_ZERO); /* * Assign pointers into the bit buffers and fetch the input bits. * Put the output buffers together so that they can be bzeroed * together. 
*/ sbp = selbits; hibits = heldbits + ncpbytes / sizeof *heldbits; hobits = heldbits; #define getbits(name, x) \ do { \ if (uap->name == NULL) \ ibits[x] = NULL; \ else { \ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ obits[x] = sbp; \ sbp += ncpbytes / sizeof *sbp; \ error = copyin(uap->name, ibits[x], ncpbytes); \ if (error != 0) \ goto done_noproclock; \ for (i = 0; \ i < ncpbytes / sizeof ibits[i][0]; \ i++) \ hibits[i] |= ibits[x][i]; \ } \ } while (0) getbits(in, 0); getbits(ou, 1); getbits(ex, 2); #undef getbits if (nbufbytes != 0) bzero(selbits, nbufbytes / 2); if (uap->tv) { error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv)); if (error) goto done_noproclock; if (itimerfix(&atv)) { error = EINVAL; goto done_noproclock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } selholddrop(td, hibits, hobits, uap->nd, 1); timo = 0; PROC_LOCK(td->td_proc); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = selscan(td, ibits, obits, uap->nd); PROC_LOCK(td->td_proc); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) { /* * An event of our interest may occur during locking a process. * In order to avoid missing the event that occured during locking * the process, test TDF_SELECT and rescan file descriptors if * necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { ncoll = nselcoll; td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = selscan(td, ibits, obits, uap->nd); PROC_LOCK(td->td_proc); } else mtx_unlock_spin(&sched_lock); goto done; } ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 
24 * 60 * 60 * hz : tvtohz(&ttv); } mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo); else error = cv_wait_sig(&selwait, &td->td_proc->p_mtx); if (error == 0) goto retry; done: mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); selholddrop(td, hibits, hobits, uap->nd, 0); done_noproclock: /* select is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; #define putbits(name, x) \ if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ error = error2; if (error == 0) { int error2; putbits(in, 0); putbits(ou, 1); putbits(ex, 2); #undef putbits } if (selbits != &s_selbits[0]) free(selbits, M_SELECT); if (heldbits != &s_heldbits[0]) free(heldbits, M_SELECT); mtx_unlock(&Giant); return (error); } +/* + * Used to hold then release a group of fds for select(2). + * Hold (hold == 1) or release (hold == 0) a group of filedescriptors. + * if holding then use ibits setting the bits in obits, otherwise use obits. + */ static int selholddrop(td, ibits, obits, nfd, hold) struct thread *td; fd_mask *ibits, *obits; int nfd, hold; { struct filedesc *fdp = td->td_proc->p_fd; int i, fd; fd_mask bits; struct file *fp; + FILEDESC_LOCK(fdp); for (i = 0; i < nfd; i += NFDBITS) { if (hold) bits = ibits[i/NFDBITS]; else bits = obits[i/NFDBITS]; /* ffs(int mask) not portable, fd_mask is long */ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { if (!(bits & 1)) continue; fp = fdp->fd_ofiles[fd]; - if (fp == NULL) + if (fp == NULL) { + FILEDESC_UNLOCK(fdp); return (EBADF); + } if (hold) { fhold(fp); obits[(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); - } else + } else { + /* XXX: optimize by making a special + * version of fdrop that only unlocks + * the filedesc if needed? 
This would + * reduce the number of lock/unlock + * pairs by quite a bit. + */ + FILEDESC_UNLOCK(fdp); fdrop(fp, td); + FILEDESC_LOCK(fdp); + } } } + FILEDESC_UNLOCK(fdp); return (0); } static int selscan(td, ibits, obits, nfd) struct thread *td; fd_mask **ibits, **obits; int nfd; { - struct filedesc *fdp = td->td_proc->p_fd; int msk, i, fd; fd_mask bits; struct file *fp; int n = 0; /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; for (msk = 0; msk < 3; msk++) { if (ibits[msk] == NULL) continue; for (i = 0; i < nfd; i += NFDBITS) { bits = ibits[msk][i/NFDBITS]; /* ffs(int mask) not portable, fd_mask is long */ for (fd = i; bits && fd < nfd; fd++, bits >>= 1) { if (!(bits & 1)) continue; - fp = fdp->fd_ofiles[fd]; + fp = ffind_hold(td, fd); if (fp == NULL) return (EBADF); if (fo_poll(fp, flag[msk], fp->f_cred, td)) { obits[msk][(fd)/NFDBITS] |= ((fd_mask)1 << ((fd) % NFDBITS)); n++; } + fdrop(fp, td); } } } td->td_retval[0] = n; return (0); } /* * Poll system call. */ #ifndef _SYS_SYSPROTO_H_ struct poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif /* * MPSAFE */ int poll(td, uap) struct thread *td; struct poll_args *uap; { caddr_t bits; char smallbits[32 * sizeof(struct pollfd)]; struct timeval atv, rtv, ttv; int ncoll, error = 0, timo; u_int nfds; size_t ni; struct pollfd p_heldbits[32]; struct pollfd *heldbits; nfds = SCARG(uap, nfds); mtx_lock(&Giant); /* * This is kinda bogus. We have fd limits, but that is not * really related to the size of the pollfd array. Make sure * we let the process use at least FD_SETSIZE entries and at * least enough for the current limits. We want to be reasonably * safe, but not overly restrictive. 
*/ if ((nfds > td->td_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur) && (nfds > FD_SETSIZE)) { error = EINVAL; goto done2; } ni = nfds * sizeof(struct pollfd); if (ni > sizeof(smallbits)) bits = malloc(ni, M_TEMP, M_WAITOK); else bits = smallbits; if (ni > sizeof(p_heldbits)) heldbits = malloc(ni, M_TEMP, M_WAITOK); else { bzero(p_heldbits, sizeof(p_heldbits)); heldbits = p_heldbits; } error = copyin(SCARG(uap, fds), bits, ni); if (error) goto done_noproclock; bcopy(bits, heldbits, ni); if (SCARG(uap, timeout) != INFTIM) { atv.tv_sec = SCARG(uap, timeout) / 1000; atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; if (itimerfix(&atv)) { error = EINVAL; goto done_noproclock; } getmicrouptime(&rtv); timevaladd(&atv, &rtv); } else { atv.tv_sec = 0; atv.tv_usec = 0; } pollholddrop(td, heldbits, nfds, 1); timo = 0; PROC_LOCK(td->td_proc); retry: ncoll = nselcoll; mtx_lock_spin(&sched_lock); td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = pollscan(td, (struct pollfd *)bits, nfds); PROC_LOCK(td->td_proc); if (error || td->td_retval[0]) goto done; if (atv.tv_sec || atv.tv_usec) { getmicrouptime(&rtv); if (timevalcmp(&rtv, &atv, >=)) { /* * An event of our interest may occur during locking a process. * In order to avoid missing the event that occured during locking * the process, test TDF_SELECT and rescan file descriptors if * necessary. */ mtx_lock_spin(&sched_lock); if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) { ncoll = nselcoll; td->td_flags |= TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); error = pollscan(td, (struct pollfd *)bits, nfds); PROC_LOCK(td->td_proc); } else mtx_unlock_spin(&sched_lock); goto done; } ttv = atv; timevalsub(&ttv, &rtv); timo = ttv.tv_sec > 24 * 60 * 60 ? 
24 * 60 * 60 * hz : tvtohz(&ttv); } mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); if (timo > 0) error = cv_timedwait_sig(&selwait, &td->td_proc->p_mtx, timo); else error = cv_wait_sig(&selwait, &td->td_proc->p_mtx); if (error == 0) goto retry; done: mtx_lock_spin(&sched_lock); td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(td->td_proc); pollholddrop(td, heldbits, nfds, 0); done_noproclock: /* poll is not restarted after signals... */ if (error == ERESTART) error = EINTR; if (error == EWOULDBLOCK) error = 0; if (error == 0) { error = copyout(bits, SCARG(uap, fds), ni); if (error) goto out; } out: if (ni > sizeof(smallbits)) free(bits, M_TEMP); if (ni > sizeof(p_heldbits)) free(heldbits, M_TEMP); done2: mtx_unlock(&Giant); return (error); } static int pollholddrop(td, fds, nfd, hold) struct thread *td; struct pollfd *fds; u_int nfd; int hold; { register struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; + FILEDESC_LOCK(fdp); for (i = 0; i < nfd; i++, fds++) { if (0 <= fds->fd && fds->fd < fdp->fd_nfiles) { fp = fdp->fd_ofiles[fds->fd]; if (hold) { if (fp != NULL) { fhold(fp); fds->revents = 1; } else fds->revents = 0; - } else if(fp != NULL && fds->revents) - fdrop(fp, td); + } else if(fp != NULL && fds->revents) { + FILE_LOCK(fp); + FILEDESC_UNLOCK(fdp); + fdrop_locked(fp, td); + FILEDESC_LOCK(fdp); + } } } + FILEDESC_UNLOCK(fdp); return (0); } static int pollscan(td, fds, nfd) struct thread *td; struct pollfd *fds; u_int nfd; { register struct filedesc *fdp = td->td_proc->p_fd; int i; struct file *fp; int n = 0; for (i = 0; i < nfd; i++, fds++) { + FILEDESC_LOCK(fdp); if (fds->fd >= fdp->fd_nfiles) { fds->revents = POLLNVAL; n++; + FILEDESC_UNLOCK(fdp); } else if (fds->fd < 0) { fds->revents = 0; + FILEDESC_UNLOCK(fdp); } else { fp = fdp->fd_ofiles[fds->fd]; + FILEDESC_UNLOCK(fdp); if (fp == NULL) { fds->revents = POLLNVAL; n++; } else { /* * Note: backend also returns POLLHUP and * 
POLLERR if appropriate. */ fds->revents = fo_poll(fp, fds->events, fp->f_cred, td); if (fds->revents != 0) n++; } } } td->td_retval[0] = n; return (0); } /* * OpenBSD poll system call. * XXX this isn't quite a true representation.. OpenBSD uses select ops. */ #ifndef _SYS_SYSPROTO_H_ struct openbsd_poll_args { struct pollfd *fds; u_int nfds; int timeout; }; #endif /* * MPSAFE */ int openbsd_poll(td, uap) register struct thread *td; register struct openbsd_poll_args *uap; { return (poll(td, (struct poll_args *)uap)); } /*ARGSUSED*/ int seltrue(dev, events, td) dev_t dev; int events; struct thread *td; { return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); } static int find_thread_in_proc(struct proc *p, struct thread *td) { struct thread *td2; FOREACH_THREAD_IN_PROC(p, td2) { if (td2 == td) { return (1); } } return (0); } /* * Record a select request. */ void selrecord(selector, sip) struct thread *selector; struct selinfo *sip; { struct proc *p; pid_t mypid; mypid = selector->td_proc->p_pid; if ((sip->si_pid == mypid) && (sip->si_thread == selector)) { /* XXXKSE should be an ID? */ return; } if (sip->si_pid && (p = pfind(sip->si_pid)) && (find_thread_in_proc(p, sip->si_thread))) { mtx_lock_spin(&sched_lock); if (sip->si_thread->td_wchan == (caddr_t)&selwait) { mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); sip->si_flags |= SI_COLL; return; } mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); } sip->si_pid = mypid; sip->si_thread = selector; } /* * Do a wakeup when a selectable event occurs. 
*/ void selwakeup(sip) register struct selinfo *sip; { struct thread *td; register struct proc *p; if (sip->si_pid == 0) return; if (sip->si_flags & SI_COLL) { nselcoll++; sip->si_flags &= ~SI_COLL; cv_broadcast(&selwait); } p = pfind(sip->si_pid); sip->si_pid = 0; td = sip->si_thread; if (p != NULL) { if (!find_thread_in_proc(p, td)) { PROC_UNLOCK(p); /* lock is in pfind() */; return; } mtx_lock_spin(&sched_lock); if (td->td_wchan == (caddr_t)&selwait) { if (td->td_proc->p_stat == SSLEEP) setrunnable(td); else cv_waitq_remove(td); } else td->td_flags &= ~TDF_SELECT; mtx_unlock_spin(&sched_lock); PROC_UNLOCK(p); /* Lock is in pfind() */ } } static void selectinit __P((void *)); SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, selectinit, NULL) /* ARGSUSED*/ static void selectinit(dummy) void *dummy; { cv_init(&selwait, "select"); } Index: head/sys/kern/sys_pipe.c =================================================================== --- head/sys/kern/sys_pipe.c (revision 89305) +++ head/sys/kern/sys_pipe.c (revision 89306) @@ -1,1302 +1,1318 @@ /* * Copyright (c) 1996 John S. Dyson * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice immediately at the beginning of the file, without modification, * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Absolutely no warranty of function or purpose is made by the author * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. * * $FreeBSD$ */ /* * This file contains a high-performance replacement for the socket-based * pipes scheme originally used in FreeBSD/4.4Lite. 
It does not support * all features of sockets, but does do everything that pipes normally * do. */ /* * This code has two modes of operation, a small write mode and a large * write mode. The small write mode acts like conventional pipes with * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and * the receiving process can copy it directly from the pages in the sending * process. * * If the sending process receives a signal, it is possible that it will * go away, and certainly its address space can change, because control * is returned back to the user-mode side. In that case, the pipe code * arranges to copy the buffer supplied by the user process, to a pageable * kernel buffer, and the receiving process will grab the data from the * pageable kernel buffer. Since signals don't happen all that often, * the copy operation is normally eliminated. * * The constant PIPE_MINDIRECT is chosen to make sure that buffering will * happen for small transfers so that the system will not spend all of * its time context switching. PIPE_SIZE is constrained by the * amount of kernel virtual memory. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Use this define if you want to disable *fancy* VM things. Expect an * approx 30% decrease in transfer rate. This could be useful for * NetBSD or OpenBSD. 
*/ /* #define PIPE_NODIRECT */ /* * interfaces to the outside world */ static int pipe_read __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int pipe_write __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int pipe_close __P((struct file *fp, struct thread *td)); static int pipe_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static int pipe_kqfilter __P((struct file *fp, struct knote *kn)); static int pipe_stat __P((struct file *fp, struct stat *sb, struct thread *td)); static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct thread *td)); static struct fileops pipeops = { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_kqfilter, pipe_stat, pipe_close }; static void filt_pipedetach(struct knote *kn); static int filt_piperead(struct knote *kn, long hint); static int filt_pipewrite(struct knote *kn, long hint); static struct filterops pipe_rfiltops = { 1, NULL, filt_pipedetach, filt_piperead }; static struct filterops pipe_wfiltops = { 1, NULL, filt_pipedetach, filt_pipewrite }; /* * Default pipe buffer size(s), this can be kind-of large now because pipe * space is pageable. The pipe code will try to maintain locality of * reference for performance reasons, so small amounts of outstanding I/O * will not wipe the cache. */ #define MINPIPESIZE (PIPE_SIZE/3) #define MAXPIPESIZE (2*PIPE_SIZE/3) /* * Maximum amount of kva for pipes -- this is kind-of a soft limit, but * is there so that on large systems, we don't exhaust it. */ #define MAXPIPEKVA (8*1024*1024) /* * Limit for direct transfers, we cannot, of course limit * the amount of kva for pipes in general though. 
*/ #define LIMITPIPEKVA (16*1024*1024) /* * Limit the number of "big" pipes */ #define LIMITBIGPIPES 32 static int nbigpipe; static int amountpipekva; static void pipeclose __P((struct pipe *cpipe)); static void pipe_free_kmem __P((struct pipe *cpipe)); static int pipe_create __P((struct pipe **cpipep)); static __inline int pipelock __P((struct pipe *cpipe, int catch)); static __inline void pipeunlock __P((struct pipe *cpipe)); static __inline void pipeselwakeup __P((struct pipe *cpipe)); #ifndef PIPE_NODIRECT static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); static void pipe_destroy_write_buffer __P((struct pipe *wpipe)); static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); static void pipe_clone_write_buffer __P((struct pipe *wpipe)); #endif static int pipespace __P((struct pipe *cpipe, int size)); static vm_zone_t pipe_zone; /* * The pipe system call for the DTYPE_PIPE type of pipes */ /* ARGSUSED */ int pipe(td, uap) struct thread *td; struct pipe_args /* { int dummy; } */ *uap; { struct filedesc *fdp = td->td_proc->p_fd; struct file *rf, *wf; struct pipe *rpipe, *wpipe; int fd, error; if (pipe_zone == NULL) pipe_zone = zinit("PIPE", sizeof(struct pipe), 0, 0, 4); rpipe = wpipe = NULL; if (pipe_create(&rpipe) || pipe_create(&wpipe)) { pipeclose(rpipe); pipeclose(wpipe); return (ENFILE); } rpipe->pipe_state |= PIPE_DIRECTOK; wpipe->pipe_state |= PIPE_DIRECTOK; error = falloc(td, &rf, &fd); if (error) { pipeclose(rpipe); pipeclose(wpipe); return (error); } fhold(rf); td->td_retval[0] = fd; /* * Warning: once we've gotten past allocation of the fd for the * read-side, we can only drop the read side via fdrop() in order * to avoid races against processes which manage to dup() the read * side while we are blocked trying to allocate the write side. 
*/ + FILE_LOCK(rf); rf->f_flag = FREAD | FWRITE; rf->f_type = DTYPE_PIPE; rf->f_data = (caddr_t)rpipe; rf->f_ops = &pipeops; + FILE_UNLOCK(rf); error = falloc(td, &wf, &fd); if (error) { + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[td->td_retval[0]] == rf) { fdp->fd_ofiles[td->td_retval[0]] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(rf, td); - } + } else + FILEDESC_UNLOCK(fdp); fdrop(rf, td); /* rpipe has been closed by fdrop(). */ pipeclose(wpipe); return (error); } + FILE_LOCK(wf); wf->f_flag = FREAD | FWRITE; wf->f_type = DTYPE_PIPE; wf->f_data = (caddr_t)wpipe; wf->f_ops = &pipeops; + FILE_UNLOCK(wf); td->td_retval[1] = fd; - rpipe->pipe_peer = wpipe; wpipe->pipe_peer = rpipe; fdrop(rf, td); return (0); } /* * Allocate kva for pipe circular buffer, the space is pageable * This routine will 'realloc' the size of a pipe safely, if it fails * it will retain the old buffer. * If it fails it will return ENOMEM. */ static int pipespace(cpipe, size) struct pipe *cpipe; int size; { struct vm_object *object; caddr_t buffer; int npages, error; GIANT_REQUIRED; npages = round_page(size)/PAGE_SIZE; /* * Create an object, I don't like the idea of paging to/from * kernel_object. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. */ object = vm_object_allocate(OBJT_DEFAULT, npages); buffer = (caddr_t) vm_map_min(kernel_map); /* * Insert the object into the kernel map, and allocate kva for it. * The map entry is, by default, pageable. * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 
*/ error = vm_map_find(kernel_map, object, 0, (vm_offset_t *) &buffer, size, 1, VM_PROT_ALL, VM_PROT_ALL, 0); if (error != KERN_SUCCESS) { vm_object_deallocate(object); return (ENOMEM); } /* free old resources if we're resizing */ pipe_free_kmem(cpipe); cpipe->pipe_buffer.object = object; cpipe->pipe_buffer.buffer = buffer; cpipe->pipe_buffer.size = size; cpipe->pipe_buffer.in = 0; cpipe->pipe_buffer.out = 0; cpipe->pipe_buffer.cnt = 0; amountpipekva += cpipe->pipe_buffer.size; return (0); } /* * initialize and allocate VM and memory for pipe */ static int pipe_create(cpipep) struct pipe **cpipep; { struct pipe *cpipe; int error; *cpipep = zalloc(pipe_zone); if (*cpipep == NULL) return (ENOMEM); cpipe = *cpipep; /* so pipespace()->pipe_free_kmem() doesn't follow junk pointer */ cpipe->pipe_buffer.object = NULL; #ifndef PIPE_NODIRECT cpipe->pipe_map.kva = NULL; #endif /* * protect so pipeclose() doesn't follow a junk pointer * if pipespace() fails. */ bzero(&cpipe->pipe_sel, sizeof(cpipe->pipe_sel)); cpipe->pipe_state = 0; cpipe->pipe_peer = NULL; cpipe->pipe_busy = 0; #ifndef PIPE_NODIRECT /* * pipe data structure initializations to support direct pipe I/O */ cpipe->pipe_map.cnt = 0; cpipe->pipe_map.kva = 0; cpipe->pipe_map.pos = 0; cpipe->pipe_map.npages = 0; /* cpipe->pipe_map.ms[] = invalid */ #endif error = pipespace(cpipe, PIPE_SIZE); if (error) return (error); vfs_timestamp(&cpipe->pipe_ctime); cpipe->pipe_atime = cpipe->pipe_ctime; cpipe->pipe_mtime = cpipe->pipe_ctime; return (0); } /* * lock a pipe for I/O, blocking other access */ static __inline int pipelock(cpipe, catch) struct pipe *cpipe; int catch; { int error; while (cpipe->pipe_state & PIPE_LOCK) { cpipe->pipe_state |= PIPE_LWANT; error = tsleep(cpipe, catch ? 
(PRIBIO | PCATCH) : PRIBIO, "pipelk", 0); if (error != 0) return (error); } cpipe->pipe_state |= PIPE_LOCK; return (0); } /* * unlock a pipe I/O lock */ static __inline void pipeunlock(cpipe) struct pipe *cpipe; { cpipe->pipe_state &= ~PIPE_LOCK; if (cpipe->pipe_state & PIPE_LWANT) { cpipe->pipe_state &= ~PIPE_LWANT; wakeup(cpipe); } } static __inline void pipeselwakeup(cpipe) struct pipe *cpipe; { if (cpipe->pipe_state & PIPE_SEL) { cpipe->pipe_state &= ~PIPE_SEL; selwakeup(&cpipe->pipe_sel); } if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) pgsigio(cpipe->pipe_sigio, SIGIO, 0); KNOTE(&cpipe->pipe_sel.si_note, 0); } /* ARGSUSED */ static int pipe_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct pipe *rpipe = (struct pipe *) fp->f_data; int error; int nread = 0; u_int size; ++rpipe->pipe_busy; error = pipelock(rpipe, 1); if (error) goto unlocked_error; while (uio->uio_resid) { /* * normal pipe buffer receive */ if (rpipe->pipe_buffer.cnt > 0) { size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; if (size > rpipe->pipe_buffer.cnt) size = rpipe->pipe_buffer.cnt; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; error = uiomove(&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], size, uio); if (error) break; rpipe->pipe_buffer.out += size; if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) rpipe->pipe_buffer.out = 0; rpipe->pipe_buffer.cnt -= size; /* * If there is no more to read in the pipe, reset * its pointers to the beginning. This improves * cache hit stats. */ if (rpipe->pipe_buffer.cnt == 0) { rpipe->pipe_buffer.in = 0; rpipe->pipe_buffer.out = 0; } nread += size; #ifndef PIPE_NODIRECT /* * Direct copy, bypassing a kernel buffer. 
*/ } else if ((size = rpipe->pipe_map.cnt) && (rpipe->pipe_state & PIPE_DIRECTW)) { caddr_t va; if (size > (u_int) uio->uio_resid) size = (u_int) uio->uio_resid; va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; error = uiomove(va, size, uio); if (error) break; nread += size; rpipe->pipe_map.pos += size; rpipe->pipe_map.cnt -= size; if (rpipe->pipe_map.cnt == 0) { rpipe->pipe_state &= ~PIPE_DIRECTW; wakeup(rpipe); } #endif } else { /* * detect EOF condition * read returns 0 on EOF, no need to set error */ if (rpipe->pipe_state & PIPE_EOF) break; /* * If the "write-side" has been blocked, wake it up now. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } /* * Break if some data was read. */ if (nread > 0) break; /* * Unlock the pipe buffer for our remaining processing. We * will either break out with an error or we will sleep and * relock to loop. */ pipeunlock(rpipe); /* * Handle non-blocking mode operation or * wait for more data. */ + FILE_LOCK(fp); if (fp->f_flag & FNONBLOCK) { + FILE_UNLOCK(fp); error = EAGAIN; } else { + FILE_UNLOCK(fp); rpipe->pipe_state |= PIPE_WANTR; if ((error = tsleep(rpipe, PRIBIO | PCATCH, "piperd", 0)) == 0) error = pipelock(rpipe, 1); } if (error) goto unlocked_error; } } pipeunlock(rpipe); if (error == 0) vfs_timestamp(&rpipe->pipe_atime); unlocked_error: --rpipe->pipe_busy; /* * PIPE_WANT processing only makes sense if pipe_busy is 0. */ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); wakeup(rpipe); } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { /* * Handle write blocking hysteresis. */ if (rpipe->pipe_state & PIPE_WANTW) { rpipe->pipe_state &= ~PIPE_WANTW; wakeup(rpipe); } } if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) pipeselwakeup(rpipe); return (error); } #ifndef PIPE_NODIRECT /* * Map the sending processes' buffer into kernel space and wire it. 
* This is similar to a physical write operation. */ static int pipe_build_write_buffer(wpipe, uio) struct pipe *wpipe; struct uio *uio; { u_int size; int i; vm_offset_t addr, endaddr, paddr; GIANT_REQUIRED; size = (u_int) uio->uio_iov->iov_len; if (size > wpipe->pipe_buffer.size) size = wpipe->pipe_buffer.size; endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); for (i = 0; addr < endaddr; addr += PAGE_SIZE, i++) { vm_page_t m; if (vm_fault_quick((caddr_t)addr, VM_PROT_READ) < 0 || (paddr = pmap_kextract(addr)) == 0) { int j; for (j = 0; j < i; j++) vm_page_unwire(wpipe->pipe_map.ms[j], 1); return (EFAULT); } m = PHYS_TO_VM_PAGE(paddr); vm_page_wire(m); wpipe->pipe_map.ms[i] = m; } /* * set up the control block */ wpipe->pipe_map.npages = i; wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; wpipe->pipe_map.cnt = size; /* * and map the buffer */ if (wpipe->pipe_map.kva == 0) { /* * We need to allocate space for an extra page because the * address range might (will) span pages at times. 
*/ wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, wpipe->pipe_buffer.size + PAGE_SIZE); amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; } pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, wpipe->pipe_map.npages); /* * and update the uio data */ uio->uio_iov->iov_len -= size; uio->uio_iov->iov_base += size; if (uio->uio_iov->iov_len == 0) uio->uio_iov++; uio->uio_resid -= size; uio->uio_offset += size; return (0); } /* * unmap and unwire the process buffer */ static void pipe_destroy_write_buffer(wpipe) struct pipe *wpipe; { int i; GIANT_REQUIRED; if (wpipe->pipe_map.kva) { pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); if (amountpipekva > MAXPIPEKVA) { vm_offset_t kva = wpipe->pipe_map.kva; wpipe->pipe_map.kva = 0; kmem_free(kernel_map, kva, wpipe->pipe_buffer.size + PAGE_SIZE); amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; } } for (i = 0; i < wpipe->pipe_map.npages; i++) vm_page_unwire(wpipe->pipe_map.ms[i], 1); } /* * In the case of a signal, the writing process might go away. This * code copies the data into the circular buffer so that the source * pages can be freed without loss of data. */ static void pipe_clone_write_buffer(wpipe) struct pipe *wpipe; { int size; int pos; size = wpipe->pipe_map.cnt; pos = wpipe->pipe_map.pos; bcopy((caddr_t) wpipe->pipe_map.kva + pos, (caddr_t) wpipe->pipe_buffer.buffer, size); wpipe->pipe_buffer.in = size; wpipe->pipe_buffer.out = 0; wpipe->pipe_buffer.cnt = size; wpipe->pipe_state &= ~PIPE_DIRECTW; pipe_destroy_write_buffer(wpipe); } /* * This implements the pipe buffer write mechanism. Note that only * a direct write OR a normal pipe write can be pending at any given time. * If there are any characters in the pipe buffer, the direct write will * be deferred until the receiving process grabs all of the bytes from * the pipe buffer. Then the direct mapping write is set-up. 
*/ static int pipe_direct_write(wpipe, uio) struct pipe *wpipe; struct uio *uio; { int error; retry: while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; error = tsleep(wpipe, PRIBIO | PCATCH, "pipdww", 0); if (error) goto error1; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; goto error1; } } wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ if (wpipe->pipe_buffer.cnt > 0) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } wpipe->pipe_state |= PIPE_WANTW; error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwc", 0); if (error) goto error1; if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; goto error1; } goto retry; } wpipe->pipe_state |= PIPE_DIRECTW; error = pipe_build_write_buffer(wpipe, uio); if (error) { wpipe->pipe_state &= ~PIPE_DIRECTW; goto error1; } error = 0; while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { if (wpipe->pipe_state & PIPE_EOF) { pipelock(wpipe, 0); pipe_destroy_write_buffer(wpipe); pipeunlock(wpipe); pipeselwakeup(wpipe); error = EPIPE; goto error1; } if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } pipeselwakeup(wpipe); error = tsleep(wpipe, PRIBIO | PCATCH, "pipdwt", 0); } pipelock(wpipe,0); if (wpipe->pipe_state & PIPE_DIRECTW) { /* * this bit of trickery substitutes a kernel buffer for * the process that might be going away. */ pipe_clone_write_buffer(wpipe); } else { pipe_destroy_write_buffer(wpipe); } pipeunlock(wpipe); return (error); error1: wakeup(wpipe); return (error); } #endif static int pipe_write(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { int error = 0; int orig_resid; struct pipe *wpipe, *rpipe; rpipe = (struct pipe *) fp->f_data; wpipe = rpipe->pipe_peer; /* * detect loss of pipe read side, issue SIGPIPE if lost. 
*/ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { return (EPIPE); } ++wpipe->pipe_busy; /* * If it is advantageous to resize the pipe buffer, do * so. */ if ((uio->uio_resid > PIPE_SIZE) && (nbigpipe < LIMITBIGPIPES) && (wpipe->pipe_state & PIPE_DIRECTW) == 0 && (wpipe->pipe_buffer.size <= PIPE_SIZE) && (wpipe->pipe_buffer.cnt == 0)) { if ((error = pipelock(wpipe,1)) == 0) { if (pipespace(wpipe, BIG_PIPE_SIZE) == 0) nbigpipe++; pipeunlock(wpipe); } } /* * If an early error occured unbusy and return, waking up any pending * readers. */ if (error) { --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } return(error); } KASSERT(wpipe->pipe_buffer.buffer != NULL, ("pipe buffer gone")); orig_resid = uio->uio_resid; while (uio->uio_resid) { int space; #ifndef PIPE_NODIRECT /* * If the transfer is large, we can gain performance if * we do process-to-process copies directly. * If the write is non-blocking, we don't use the * direct write mechanism. * * The direct write mechanism will detect the reader going * away on us. */ + FILE_LOCK(fp); if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && (fp->f_flag & FNONBLOCK) == 0 && (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + FILE_UNLOCK(fp); error = pipe_direct_write( wpipe, uio); if (error) break; continue; - } + } else + FILE_UNLOCK(fp); #endif /* * Pipe buffered writes cannot be coincidental with * direct writes. We wait until the currently executing * direct write is completed before we start filling the * pipe buffer. We break out if a signal occurs or the * reader goes away. 
*/ retrywrite: while (wpipe->pipe_state & PIPE_DIRECTW) { if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } error = tsleep(wpipe, PRIBIO | PCATCH, "pipbww", 0); if (wpipe->pipe_state & PIPE_EOF) break; if (error) break; } if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; /* Writes of size <= PIPE_BUF must be atomic. */ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) space = 0; if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { if ((error = pipelock(wpipe,1)) == 0) { int size; /* Transfer size */ int segsize; /* first segment to transfer */ /* * It is possible for a direct write to * slip in on us... handle it here... */ if (wpipe->pipe_state & PIPE_DIRECTW) { pipeunlock(wpipe); goto retrywrite; } /* * If a process blocked in uiomove, our * value for space might be bad. * * XXX will we be ok if the reader has gone * away here? */ if (space > wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) { pipeunlock(wpipe); goto retrywrite; } /* * Transfer size is minimum of uio transfer * and free space in pipe buffer. */ if (space > uio->uio_resid) size = uio->uio_resid; else size = space; /* * First segment to transfer is minimum of * transfer size and contiguous space in * pipe buffer. If first segment to transfer * is less than the transfer size, we've got * a wraparound in the buffer. */ segsize = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; if (segsize > size) segsize = size; /* Transfer first segment */ error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], segsize, uio); if (error == 0 && segsize < size) { /* * Transfer remaining part now, to * support atomic writes. Wraparound * happened. 
*/ if (wpipe->pipe_buffer.in + segsize != wpipe->pipe_buffer.size) panic("Expected pipe buffer wraparound disappeared"); error = uiomove(&wpipe->pipe_buffer.buffer[0], size - segsize, uio); } if (error == 0) { wpipe->pipe_buffer.in += size; if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) { if (wpipe->pipe_buffer.in != size - segsize + wpipe->pipe_buffer.size) panic("Expected wraparound bad"); wpipe->pipe_buffer.in = size - segsize; } wpipe->pipe_buffer.cnt += size; if (wpipe->pipe_buffer.cnt > wpipe->pipe_buffer.size) panic("Pipe buffer overflow"); } pipeunlock(wpipe); } if (error) break; } else { /* * If the "read-side" has been blocked, wake it up now. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } /* * don't block on non-blocking I/O */ + FILE_LOCK(fp); if (fp->f_flag & FNONBLOCK) { + FILE_UNLOCK(fp); error = EAGAIN; break; } + FILE_UNLOCK(fp); /* * We have no more space and have something to offer, * wake up select/poll. */ pipeselwakeup(wpipe); wpipe->pipe_state |= PIPE_WANTW; error = tsleep(wpipe, PRIBIO | PCATCH, "pipewr", 0); if (error != 0) break; /* * If read side wants to go away, we just issue a signal * to ourselves. */ if (wpipe->pipe_state & PIPE_EOF) { error = EPIPE; break; } } } --wpipe->pipe_busy; if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) { wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR); wakeup(wpipe); } else if (wpipe->pipe_buffer.cnt > 0) { /* * If we have put any characters in the buffer, we wake up * the reader. */ if (wpipe->pipe_state & PIPE_WANTR) { wpipe->pipe_state &= ~PIPE_WANTR; wakeup(wpipe); } } /* * Don't return EPIPE if I/O was successful */ if ((wpipe->pipe_buffer.cnt == 0) && (uio->uio_resid == 0) && (error == EPIPE)) { error = 0; } if (error == 0) vfs_timestamp(&wpipe->pipe_mtime); /* * We have something to offer, * wake up select/poll. 
*/
	/* Tail of pipe_write(): data was buffered, notify select/poll waiters. */
	if (wpipe->pipe_buffer.cnt)
		pipeselwakeup(wpipe);

	return (error);
}

/*
 * we implement a very minimal set of ioctls for compatibility with sockets.
 */
int
pipe_ioctl(fp, cmd, data, td)
	struct file *fp;
	u_long cmd;
	caddr_t data;
	struct thread *td;
{
	/* The pipe endpoint behind this file descriptor. */
	struct pipe *mpipe = (struct pipe *)fp->f_data;

	switch (cmd) {

	case FIONBIO:
		/* Non-blocking mode lives in fp->f_flag; nothing to do here. */
		return (0);

	case FIOASYNC:
		if (*(int *)data) {
			mpipe->pipe_state |= PIPE_ASYNC;
		} else {
			mpipe->pipe_state &= ~PIPE_ASYNC;
		}
		return (0);

	case FIONREAD:
		/*
		 * Bytes ready to read: the direct-write map count while a
		 * direct write is pending, otherwise the in-kernel buffer.
		 */
		if (mpipe->pipe_state & PIPE_DIRECTW)
			*(int *)data = mpipe->pipe_map.cnt;
		else
			*(int *)data = mpipe->pipe_buffer.cnt;
		return (0);

	case FIOSETOWN:
		return (fsetown(*(int *)data, &mpipe->pipe_sigio));

	case FIOGETOWN:
		*(int *)data = fgetown(mpipe->pipe_sigio);
		return (0);

	/* This is deprecated, FIOSETOWN should be used instead. */
	case TIOCSPGRP:
		return (fsetown(-(*(int *)data), &mpipe->pipe_sigio));

	/* This is deprecated, FIOGETOWN should be used instead. */
	case TIOCGPGRP:
		*(int *)data = -fgetown(mpipe->pipe_sigio);
		return (0);

	}
	return (ENOTTY);
}

/*
 * poll(2)/select(2) backend for pipes: compute readiness from the buffer
 * state and the EOF flags of both endpoints; record the thread for a
 * later wakeup when nothing is ready yet.
 */
int
pipe_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
{
	struct pipe *rpipe = (struct pipe *)fp->f_data;
	struct pipe *wpipe;
	int revents = 0;

	wpipe = rpipe->pipe_peer;
	/* Readable: pending direct write, buffered bytes, or EOF. */
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_state & PIPE_DIRECTW) ||
		    (rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);

	/*
	 * Writable: peer gone or at EOF (a write would fail immediately),
	 * or no direct write in progress and at least PIPE_BUF bytes free
	 * so an atomic write can succeed.
	 */
	if (events & (POLLOUT | POLLWRNORM))
		if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) ||
		    (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
		     (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
			revents |= events & (POLLOUT | POLLWRNORM);

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) ||
	    (wpipe->pipe_state & PIPE_EOF))
		revents |= POLLHUP;

	if (revents == 0) {
		/* Nothing ready: register with the selinfo for wakeup. */
		if (events & (POLLIN | POLLRDNORM)) {
			selrecord(td, &rpipe->pipe_sel);
			rpipe->pipe_state |= PIPE_SEL;
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			selrecord(td, &wpipe->pipe_sel);
wpipe->pipe_state |= PIPE_SEL;
		}
	}

	return (revents);
}

/*
 * fstat(2) backend for pipes: synthesize a FIFO-flavored struct stat
 * from the pipe buffer state and the opening credential.
 */
static int
pipe_stat(fp, ub, td)
	struct file *fp;
	struct stat *ub;
	struct thread *td;
{
	struct pipe *pipe = (struct pipe *)fp->f_data;

	bzero((caddr_t)ub, sizeof(*ub));
	ub->st_mode = S_IFIFO;
	ub->st_blksize = pipe->pipe_buffer.size;
	ub->st_size = pipe->pipe_buffer.cnt;
	/* Round the byte count up to whole "blocks" of st_blksize. */
	ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
	ub->st_atimespec = pipe->pipe_atime;
	ub->st_mtimespec = pipe->pipe_mtime;
	ub->st_ctimespec = pipe->pipe_ctime;
	ub->st_uid = fp->f_cred->cr_uid;
	ub->st_gid = fp->f_cred->cr_gid;
	/*
	 * Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
	 * XXX (st_dev, st_ino) should be unique.
	 */
	return (0);
}

/*
 * close(2) backend: detach the pipe from the file and tear it down.
 */
/* ARGSUSED */
static int
pipe_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	struct pipe *cpipe = (struct pipe *)fp->f_data;

	/* Neutralize the file first so no further ops reach the pipe. */
	fp->f_ops = &badfileops;
	fp->f_data = NULL;
	funsetown(cpipe->pipe_sigio);
	pipeclose(cpipe);
	return (0);
}

/*
 * Release the kernel VM backing one pipe endpoint (ring buffer and,
 * unless PIPE_NODIRECT, the direct-write KVA window), keeping the
 * global nbigpipe/amountpipekva accounting in step.
 */
static void
pipe_free_kmem(cpipe)
	struct pipe *cpipe;
{

	GIANT_REQUIRED;

	if (cpipe->pipe_buffer.buffer != NULL) {
		if (cpipe->pipe_buffer.size > PIPE_SIZE)
			--nbigpipe;
		amountpipekva -= cpipe->pipe_buffer.size;
		kmem_free(kernel_map,
			(vm_offset_t)cpipe->pipe_buffer.buffer,
			cpipe->pipe_buffer.size);
		cpipe->pipe_buffer.buffer = NULL;
	}
#ifndef PIPE_NODIRECT
	if (cpipe->pipe_map.kva != NULL) {
		/* The direct-write window is buffer-size plus one guard page. */
		amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
		kmem_free(kernel_map,
			cpipe->pipe_map.kva,
			cpipe->pipe_buffer.size + PAGE_SIZE);
		cpipe->pipe_map.cnt = 0;
		cpipe->pipe_map.kva = 0;
		cpipe->pipe_map.pos = 0;
		cpipe->pipe_map.npages = 0;
	}
#endif
}

/*
 * shutdown the pipe
 */
static void
pipeclose(cpipe)
	struct pipe *cpipe;
{
	struct pipe *ppipe;

	if (cpipe) {

		pipeselwakeup(cpipe);

		/*
		 * If the other side is blocked, wake it up saying that
		 * we want to close it down.
*/
		/* Drain pipe_busy holders before freeing (tail of pipeclose()). */
		while (cpipe->pipe_busy) {
			wakeup(cpipe);
			cpipe->pipe_state |= PIPE_WANT | PIPE_EOF;
			tsleep(cpipe, PRIBIO, "pipecl", 0);
		}

		/*
		 * Disconnect from peer
		 */
		if ((ppipe = cpipe->pipe_peer) != NULL) {
			pipeselwakeup(ppipe);

			ppipe->pipe_state |= PIPE_EOF;
			wakeup(ppipe);
			KNOTE(&ppipe->pipe_sel.si_note, 0);
			ppipe->pipe_peer = NULL;
		}
		/*
		 * free resources
		 */
		pipe_free_kmem(cpipe);
		zfree(pipe_zone, cpipe);
	}
}

/*
 * kqueue(2) filter attach for pipes.  EVFILT_WRITE notes hang off the
 * peer (write) endpoint's selinfo list.
 */
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
	/*
	 * NOTE(review): the '-'/'+' prefixed lines below are unified-diff
	 * hunk markers from the r89305 -> r89306 patch this file records,
	 * not C code; they are preserved verbatim.
	 */
-	struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
+	struct pipe *cpipe;

+	cpipe = (struct pipe *)kn->kn_fp->f_data;
	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &pipe_rfiltops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &pipe_wfiltops;
		cpipe = cpipe->pipe_peer;
		break;
	default:
		return (1);
	}

	kn->kn_hook = (caddr_t)cpipe;
	SLIST_INSERT_HEAD(&cpipe->pipe_sel.si_note, kn, kn_selnext);
	return (0);
}

/* Detach: unhook the knote from the selinfo list it was attached to. */
static void
filt_pipedetach(struct knote *kn)
{
	struct pipe *cpipe = (struct pipe *)kn->kn_hook;

	SLIST_REMOVE(&cpipe->pipe_sel.si_note, kn, knote, kn_selnext);
}

/*
 * EVFILT_READ event: report readable byte count; raise EV_EOF when
 * either end has hung up.
 */
/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	kn->kn_data = rpipe->pipe_buffer.cnt;
	if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
		kn->kn_data = rpipe->pipe_map.cnt;

	if ((rpipe->pipe_state & PIPE_EOF) ||
	    (wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	return (kn->kn_data > 0);
}

/*
 * EVFILT_WRITE event: report free space; writable once PIPE_BUF bytes
 * fit (atomic-write guarantee), zero while a direct write is pending.
 */
/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
	struct pipe *rpipe = (struct pipe *)kn->kn_fp->f_data;
	struct pipe *wpipe = rpipe->pipe_peer;

	if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
		kn->kn_data = 0;
		kn->kn_flags |= EV_EOF;
		return (1);
	}
	kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
	if (wpipe->pipe_state & PIPE_DIRECTW)
		kn->kn_data = 0;

	return (kn->kn_data >= PIPE_BUF);
}
Index: head/sys/kern/sys_socket.c
=================================================================== --- head/sys/kern/sys_socket.c (revision 89305) +++ head/sys/kern/sys_socket.c (revision 89306) @@ -1,206 +1,207 @@ /* * Copyright (c) 1982, 1986, 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 * $FreeBSD$ */ #include #include #include #include #include #include #include /* XXX */ #include #include #include #include #include #include #include struct fileops socketops = { soo_read, soo_write, soo_ioctl, soo_poll, sokqfilter, soo_stat, soo_close }; /* ARGSUSED */ int soo_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct socket *so = (struct socket *)fp->f_data; return so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0); } /* ARGSUSED */ int soo_write(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct socket *so = (struct socket *)fp->f_data; return so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, uio->uio_td); } int soo_ioctl(fp, cmd, data, td) struct file *fp; u_long cmd; register caddr_t data; struct thread *td; { register struct socket *so = (struct socket *)fp->f_data; switch (cmd) { case FIONBIO: if (*(int *)data) so->so_state |= SS_NBIO; else so->so_state &= ~SS_NBIO; return (0); case FIOASYNC: if (*(int *)data) { so->so_state |= SS_ASYNC; so->so_rcv.sb_flags |= SB_ASYNC; so->so_snd.sb_flags |= SB_ASYNC; } else { so->so_state &= ~SS_ASYNC; so->so_rcv.sb_flags &= ~SB_ASYNC; so->so_snd.sb_flags &= ~SB_ASYNC; } return (0); case FIONREAD: *(int *)data = so->so_rcv.sb_cc; return (0); case FIOSETOWN: return (fsetown(*(int *)data, &so->so_sigio)); case FIOGETOWN: *(int *)data = fgetown(so->so_sigio); return (0); case SIOCSPGRP: return (fsetown(-(*(int *)data), &so->so_sigio)); case SIOCGPGRP: *(int *)data = -fgetown(so->so_sigio); return (0); case SIOCATMARK: *(int *)data = (so->so_state&SS_RCVATMARK) != 0; return (0); } /* * Interface/routing/protocol specific ioctls: * interface and routing ioctls should have a * different entry since a socket's unnecessary */ if (IOCGROUP(cmd) == 'i') return (ifioctl(so, cmd, data, td)); if (IOCGROUP(cmd) == 'r') return 
(rtioctl(cmd, data));
	/* Tail of soo_ioctl(): everything else goes to the protocol. */
	return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, td));
}

/* poll(2) backend for sockets: defer entirely to the protocol. */
int
soo_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
{
	struct socket *so = (struct socket *)fp->f_data;
	return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, td);
}

/*
 * fstat(2) backend for sockets: synthesize a socket-flavored stat with
 * read/write permission bits mirroring the CANTRCVMORE/CANTSENDMORE
 * state, then let the protocol's pru_sense fill in the rest.
 */
int
soo_stat(fp, ub, td)
	struct file *fp;
	struct stat *ub;
	struct thread *td;
{
	struct socket *so = (struct socket *)fp->f_data;

	bzero((caddr_t)ub, sizeof (*ub));
	ub->st_mode = S_IFSOCK;
	/*
	 * If SS_CANTRCVMORE is set, but there's still data left in the
	 * receive buffer, the socket is still readable.
	 */
	if ((so->so_state & SS_CANTRCVMORE) == 0 ||
	    so->so_rcv.sb_cc != 0)
		ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
	if ((so->so_state & SS_CANTSENDMORE) == 0)
		ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
	ub->st_size = so->so_rcv.sb_cc;
	ub->st_uid = so->so_cred->cr_uid;
	ub->st_gid = so->so_cred->cr_gid;
	return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub));
}

/*
 * API socket close on file pointer.  We call soclose() to close the
 * socket (including initiating closing protocols).  soclose() will
 * sorele() the file reference but the actual socket will not go away
 * until the socket's ref count hits 0.
 */
/* ARGSUSED */
int
soo_close(fp, td)
	struct file *fp;
	struct thread *td;
{
	int error = 0;
	struct socket *so;

	/*
	 * NOTE(review): the '-'/'+' prefixed lines below are unified-diff
	 * hunk markers from the r89305 -> r89306 patch, preserved verbatim:
	 * the patch hoists the f_data load above the badfileops swap.
	 */
+	so = (struct socket *)fp->f_data;
	fp->f_ops = &badfileops;
-	if ((so = (struct socket *)fp->f_data) != NULL) {
-		fp->f_data = NULL;
+	fp->f_data = 0;
+
+	if (so)
		error = soclose(so);
-	}
	return (error);
}
Index: head/sys/kern/uipc_syscalls.c
===================================================================
--- head/sys/kern/uipc_syscalls.c	(revision 89305)
+++ head/sys/kern/uipc_syscalls.c	(revision 89306)
@@ -1,1935 +1,1957 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * sendfile(2) and related extensions:
 * Copyright (c) 1998, David Greenman.  All rights reserved.
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 * $FreeBSD$ */ #include "opt_compat.h" #include "opt_ktrace.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef KTRACE #include #endif #include #include #include #include #include #include static void sf_buf_init(void *arg); SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) static struct sf_buf *sf_buf_alloc(void); static void sf_buf_free(caddr_t addr, void *args); static int sendit __P((struct thread *td, int s, struct msghdr *mp, int flags)); static int recvit __P((struct thread *td, int s, struct msghdr *mp, caddr_t namelenp)); static int accept1 __P((struct thread *td, struct accept_args *uap, int compat)); static int getsockname1 __P((struct thread *td, struct getsockname_args *uap, int compat)); static int getpeername1 __P((struct thread *td, struct getpeername_args *uap, int compat)); /* * Expanded sf_freelist head. Really an SLIST_HEAD() in disguise, with the * sf_freelist head with the sf_lock mutex. */ static struct { SLIST_HEAD(, sf_buf) sf_head; struct mtx sf_lock; } sf_freelist; static vm_offset_t sf_base; static struct sf_buf *sf_bufs; static u_int sf_buf_alloc_want; /* * System call interface to the socket abstraction. 
*/ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #define COMPAT_OLDSOCK #endif extern struct fileops socketops; /* * MPSAFE */ int socket(td, uap) struct thread *td; register struct socket_args /* { int domain; int type; int protocol; } */ *uap; { struct filedesc *fdp; struct socket *so; struct file *fp; int fd, error; mtx_lock(&Giant); fdp = td->td_proc->p_fd; error = falloc(td, &fp, &fd); if (error) goto done2; fhold(fp); error = socreate(uap->domain, &so, uap->type, uap->protocol, td->td_proc->p_ucred, td); + FILEDESC_LOCK(fdp); if (error) { if (fdp->fd_ofiles[fd] == fp) { fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); } else { fp->f_data = (caddr_t)so; /* already has ref count */ fp->f_flag = FREAD|FWRITE; fp->f_ops = &socketops; fp->f_type = DTYPE_SOCKET; + FILEDESC_UNLOCK(fdp); td->td_retval[0] = fd; } fdrop(fp, td); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ /* ARGSUSED */ int bind(td, uap) struct thread *td; register struct bind_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct socket *so; struct sockaddr *sa; int error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) goto done2; if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0) goto done1; error = sobind(so, sa, td); FREE(sa, M_SONAME); done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ /* ARGSUSED */ int listen(td, uap) struct thread *td; register struct listen_args /* { int s; int backlog; } */ *uap; { struct socket *so; int error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) { error = solisten(so, uap->backlog, td); fputsock(so); } mtx_unlock(&Giant); return(error); } /* * accept1() * MPSAFE */ static int accept1(td, uap, compat) struct thread *td; register struct accept_args /* { int s; caddr_t name; int *anamelen; } */ *uap; int compat; { struct filedesc *fdp; struct file *nfp = NULL; struct sockaddr *sa; int namelen, error, 
s; struct socket *head, *so; int fd; u_int fflag; mtx_lock(&Giant); fdp = td->td_proc->p_fd; if (uap->name) { error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, sizeof (namelen)); if(error) goto done2; } error = fgetsock(td, uap->s, &head, &fflag); if (error) goto done2; s = splnet(); if ((head->so_options & SO_ACCEPTCONN) == 0) { splx(s); error = EINVAL; goto done; } if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) { splx(s); error = EWOULDBLOCK; goto done; } while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) { if (head->so_state & SS_CANTRCVMORE) { head->so_error = ECONNABORTED; break; } error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, "accept", 0); if (error) { splx(s); goto done; } } if (head->so_error) { error = head->so_error; head->so_error = 0; splx(s); goto done; } /* * At this point we know that there is at least one connection * ready to be accepted. Remove it from the queue prior to * allocating the file descriptor for it since falloc() may * block allowing another process to accept the connection * instead. */ so = TAILQ_FIRST(&head->so_comp); TAILQ_REMOVE(&head->so_comp, so, so_list); head->so_qlen--; error = falloc(td, &nfp, &fd); if (error) { /* * Probably ran out of file descriptors. Put the * unaccepted connection back onto the queue and * do another wakeup so some other process might * have a chance at it. 
*/ TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); head->so_qlen++; wakeup_one(&head->so_timeo); splx(s); goto done; } fhold(nfp); td->td_retval[0] = fd; /* connection has been removed from the listen queue */ KNOTE(&head->so_rcv.sb_sel.si_note, 0); so->so_state &= ~SS_COMP; so->so_head = NULL; if (head->so_sigio != NULL) fsetown(fgetown(head->so_sigio), &so->so_sigio); + FILE_LOCK(nfp); soref(so); /* file descriptor reference */ nfp->f_data = (caddr_t)so; /* nfp has ref count from falloc */ nfp->f_flag = fflag; nfp->f_ops = &socketops; nfp->f_type = DTYPE_SOCKET; + FILE_UNLOCK(nfp); sa = 0; error = soaccept(so, &sa); if (error) { /* * return a namelen of zero for older code which might * ignore the return value from accept. */ if (uap->name != NULL) { namelen = 0; (void) copyout((caddr_t)&namelen, (caddr_t)uap->anamelen, sizeof(*uap->anamelen)); } goto noconnection; } if (sa == NULL) { namelen = 0; if (uap->name) goto gotnoname; splx(s); error = 0; goto done; } if (uap->name) { /* check sa_len before it is destroyed */ if (namelen > sa->sa_len) namelen = sa->sa_len; #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, (caddr_t)uap->name, (u_int)namelen); if (!error) gotnoname: error = copyout((caddr_t)&namelen, (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); } noconnection: if (sa) FREE(sa, M_SONAME); /* * close the new descriptor, assuming someone hasn't ripped it * out from under us. */ if (error) { + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[fd] == nfp) { fdp->fd_ofiles[fd] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(nfp, td); + } else { + FILEDESC_UNLOCK(fdp); } } splx(s); /* * Release explicitly held references before returning. 
*/ done: if (nfp != NULL) fdrop(nfp, td); fputsock(head); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE (accept1() is MPSAFE) */ int accept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK /* * MPSAFE (accept1() is MPSAFE) */ int oaccept(td, uap) struct thread *td; struct accept_args *uap; { return (accept1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * MPSAFE */ /* ARGSUSED */ int connect(td, uap) struct thread *td; register struct connect_args /* { int s; caddr_t name; int namelen; } */ *uap; { struct socket *so; struct sockaddr *sa; int error, s; mtx_lock(&Giant); if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) goto done2; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { error = EALREADY; goto done1; } error = getsockaddr(&sa, uap->name, uap->namelen); if (error) goto done1; error = soconnect(so, sa, td); if (error) goto bad; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { FREE(sa, M_SONAME); error = EINPROGRESS; goto done1; } s = splnet(); while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, "connec", 0); if (error) break; } if (error == 0) { error = so->so_error; so->so_error = 0; } splx(s); bad: so->so_state &= ~SS_ISCONNECTING; FREE(sa, M_SONAME); if (error == ERESTART) error = EINTR; done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int socketpair(td, uap) struct thread *td; register struct socketpair_args /* { int domain; int type; int protocol; int *rsv; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; mtx_lock(&Giant); error = socreate(uap->domain, &so1, uap->type, uap->protocol, td->td_proc->p_ucred, td); if (error) goto done2; error = socreate(uap->domain, &so2, uap->type, uap->protocol, td->td_proc->p_ucred, td); if (error) goto free1; error = falloc(td, 
&fp1, &fd); if (error) goto free2; fhold(fp1); sv[0] = fd; fp1->f_data = (caddr_t)so1; /* so1 already has ref count */ error = falloc(td, &fp2, &fd); if (error) goto free3; fhold(fp2); fp2->f_data = (caddr_t)so2; /* so2 already has ref count */ sv[1] = fd; error = soconnect2(so1, so2); if (error) goto free4; if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ error = soconnect2(so2, so1); if (error) goto free4; } - fp1->f_flag = fp2->f_flag = FREAD|FWRITE; - fp1->f_ops = fp2->f_ops = &socketops; - fp1->f_type = fp2->f_type = DTYPE_SOCKET; + FILE_LOCK(fp1); + fp1->f_flag = FREAD|FWRITE; + fp1->f_ops = &socketops; + fp1->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp1); + FILE_LOCK(fp2); + fp2->f_flag = FREAD|FWRITE; + fp2->f_ops = &socketops; + fp2->f_type = DTYPE_SOCKET; + FILE_UNLOCK(fp2); error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); fdrop(fp1, td); fdrop(fp2, td); goto done2; free4: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[sv[1]] == fp2) { fdp->fd_ofiles[sv[1]] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp2, td); - } + } else + FILEDESC_UNLOCK(fdp); fdrop(fp2, td); free3: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[sv[0]] == fp1) { fdp->fd_ofiles[sv[0]] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp1, td); - } + } else + FILEDESC_UNLOCK(fdp); fdrop(fp1, td); free2: (void)soclose(so2); free1: (void)soclose(so1); done2: mtx_unlock(&Giant); return (error); } static int sendit(td, s, mp, flags) register struct thread *td; int s; register struct msghdr *mp; int flags; { struct uio auio; register struct iovec *iov; register int i; struct mbuf *control; struct sockaddr *to = NULL; int len, error; struct socket *so; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_WRITE; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = 
mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { error = EINVAL; goto bad; } } if (mp->msg_name) { error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); if (error) goto bad; } if (mp->msg_control) { if (mp->msg_controllen < sizeof(struct cmsghdr) #ifdef COMPAT_OLDSOCK && mp->msg_flags != MSG_COMPAT #endif ) { error = EINVAL; goto bad; } error = sockargs(&control, mp->msg_control, mp->msg_controllen, MT_CONTROL); if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { register struct cmsghdr *cm; M_PREPEND(control, sizeof(*cm), M_TRYWAIT); if (control == 0) { error = ENOBUFS; goto bad; } else { cm = mtod(control, struct cmsghdr *); cm->cmsg_len = control->m_len; cm->cmsg_level = SOL_SOCKET; cm->cmsg_type = SCM_RIGHTS; } } #endif } else { control = 0; } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, flags, td); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; if (error == EPIPE) { PROC_LOCK(td->td_proc); psignal(td->td_proc, SIGPIPE); PROC_UNLOCK(td->td_proc); } } if (error == 0) td->td_retval[0] = len - auio.uio_resid; #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = td->td_retval[0]; ktrgenio(td->td_proc->p_tracep, s, UIO_WRITE, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif bad: fputsock(so); if (to) FREE(to, M_SONAME); return (error); } /* * MPSAFE */ int sendto(td, uap) struct thread *td; register struct sendto_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t to; int tolen; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = uap->to; 
msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; msg.msg_control = 0; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif aiov.iov_base = uap->buf; aiov.iov_len = uap->len; mtx_lock(&Giant); error = sendit(td, uap->s, &msg, uap->flags); mtx_unlock(&Giant); return (error); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int osend(td, uap) struct thread *td; register struct osend_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = 0; mtx_lock(&Giant); error = sendit(td, uap->s, &msg, uap->flags); mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int osendmsg(td, uap) struct thread *td; register struct osendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; mtx_lock(&Giant); error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); if (error) goto done2; if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); if (error) goto done; msg.msg_flags = MSG_COMPAT; msg.msg_iov = iov; error = sendit(td, uap->s, &msg, uap->flags); done: if (iov != aiov) FREE(iov, M_IOV); done2: mtx_unlock(&Giant); return (error); } #endif /* * MPSAFE */ int sendmsg(td, uap) struct thread *td; register struct sendmsg_args /* { int s; caddr_t msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; mtx_lock(&Giant); error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); if (error) goto done2; if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if 
((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } if (msg.msg_iovlen && (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) goto done; msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK msg.msg_flags = 0; #endif error = sendit(td, uap->s, &msg, uap->flags); done: if (iov != aiov) FREE(iov, M_IOV); done2: mtx_unlock(&Giant); return (error); } static int recvit(td, s, mp, namelenp) register struct thread *td; int s; register struct msghdr *mp; caddr_t namelenp; { struct uio auio; register struct iovec *iov; register int i; int len, error; struct mbuf *m, *control = 0; caddr_t ctlbuf; struct socket *so; struct sockaddr *fromsa = 0; #ifdef KTRACE struct iovec *ktriov = NULL; struct uio ktruio; #endif if ((error = fgetsock(td, s, &so, NULL)) != 0) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; auio.uio_segflg = UIO_USERSPACE; auio.uio_rw = UIO_READ; auio.uio_td = td; auio.uio_offset = 0; /* XXX */ auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { if ((auio.uio_resid += iov->iov_len) < 0) { fputsock(so); return (EINVAL); } } #ifdef KTRACE if (KTRPOINT(td->td_proc, KTR_GENIO)) { int iovlen = auio.uio_iovcnt * sizeof (struct iovec); MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); ktruio = auio; } #endif len = auio.uio_resid; error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, &mp->msg_flags); if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; } #ifdef KTRACE if (ktriov != NULL) { if (error == 0) { ktruio.uio_iov = ktriov; ktruio.uio_resid = len - auio.uio_resid; ktrgenio(td->td_proc->p_tracep, s, UIO_READ, &ktruio, error); } FREE(ktriov, M_TEMP); } #endif if (error) goto out; td->td_retval[0] = len - auio.uio_resid; if (mp->msg_name) { len = mp->msg_namelen; if (len <= 0 || fromsa == 0) len = 0; else { #ifndef MIN #define MIN(a,b) ((a)>(b)?(b):(a)) #endif /* save sa_len before it is destroyed by MSG_COMPAT */ len = MIN(len, fromsa->sa_len); #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) ((struct osockaddr *)fromsa)->sa_family = fromsa->sa_family; #endif error = copyout(fromsa, (caddr_t)mp->msg_name, (unsigned)len); if (error) goto out; } mp->msg_namelen = len; if (namelenp && (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { #ifdef COMPAT_OLDSOCK if (mp->msg_flags & MSG_COMPAT) error = 0; /* old recvfrom didn't check */ else #endif goto out; } } if (mp->msg_control) { #ifdef COMPAT_OLDSOCK /* * We assume that old recvmsg calls won't receive access * rights and other control info, esp. as control info * is always optional and those options didn't exist in 4.3. * If we receive rights, trim the cmsghdr; anything else * is tossed. 
*/ if (control && mp->msg_flags & MSG_COMPAT) { if (mtod(control, struct cmsghdr *)->cmsg_level != SOL_SOCKET || mtod(control, struct cmsghdr *)->cmsg_type != SCM_RIGHTS) { mp->msg_controllen = 0; goto out; } control->m_len -= sizeof (struct cmsghdr); control->m_data += sizeof (struct cmsghdr); } #endif len = mp->msg_controllen; m = control; mp->msg_controllen = 0; ctlbuf = (caddr_t) mp->msg_control; while (m && len > 0) { unsigned int tocopy; if (len >= m->m_len) tocopy = m->m_len; else { mp->msg_flags |= MSG_CTRUNC; tocopy = len; } if ((error = copyout((caddr_t)mtod(m, caddr_t), ctlbuf, tocopy)) != 0) goto out; ctlbuf += tocopy; len -= tocopy; m = m->m_next; } mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control; } out: fputsock(so); if (fromsa) FREE(fromsa, M_SONAME); if (control) m_freem(control); return (error); } /* * MPSAFE */ int recvfrom(td, uap) struct thread *td; register struct recvfrom_args /* { int s; caddr_t buf; size_t len; int flags; caddr_t from; int *fromlenaddr; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; mtx_lock(&Giant); if (uap->fromlenaddr) { error = copyin((caddr_t)uap->fromlenaddr, (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); if (error) goto done2; } else { msg.msg_namelen = 0; } msg.msg_name = uap->from; msg.msg_iov = &aiov; msg.msg_iovlen = 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, (caddr_t)uap->fromlenaddr); done2: mtx_unlock(&Giant); return(error); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int orecvfrom(td, uap) struct thread *td; struct recvfrom_args *uap; { uap->flags |= MSG_COMPAT; return (recvfrom(td, uap)); } #endif #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int orecv(td, uap) struct thread *td; register struct orecv_args /* { int s; caddr_t buf; int len; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov; int error; mtx_lock(&Giant); msg.msg_name = 0; msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen 
= 1; aiov.iov_base = uap->buf; aiov.iov_len = uap->len; msg.msg_control = 0; msg.msg_flags = uap->flags; error = recvit(td, uap->s, &msg, (caddr_t)0); mtx_unlock(&Giant); return (error); } /* * Old recvmsg. This code takes advantage of the fact that the old msghdr * overlays the new one, missing only the flags, and with the (old) access * rights where the control fields are now. * * MPSAFE */ int orecvmsg(td, uap) struct thread *td; register struct orecvmsg_args /* { int s; struct omsghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); if (error) return (error); mtx_lock(&Giant); if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } msg.msg_flags = uap->flags | MSG_COMPAT; error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); if (error) goto done; msg.msg_iov = iov; error = recvit(td, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen); if (msg.msg_controllen && error == 0) error = copyout((caddr_t)&msg.msg_controllen, (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); done: if (iov != aiov) FREE(iov, M_IOV); done2: mtx_unlock(&Giant); return (error); } #endif /* * MPSAFE */ int recvmsg(td, uap) struct thread *td; register struct recvmsg_args /* { int s; struct msghdr *msg; int flags; } */ *uap; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; register int error; mtx_lock(&Giant); error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); if (error) goto done2; if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) { error = EMSGSIZE; goto done2; } MALLOC(iov, struct iovec *, sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, M_WAITOK); } else { iov = aiov; } 
#ifdef COMPAT_OLDSOCK
	/* Strip the old-socket compat flag; recvmsg(2) is the new ABI. */
	msg.msg_flags = uap->flags &~ MSG_COMPAT;
#else
	msg.msg_flags = uap->flags;
#endif
	/*
	 * Remember the user's iovec pointer, then substitute the kernel
	 * copy for recvit(); the user pointer is restored before the
	 * updated msghdr is copied back out.
	 */
	uiov = msg.msg_iov;
	msg.msg_iov = iov;
	error = copyin((caddr_t)uiov, (caddr_t)iov,
	    (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
	if (error)
		goto done;
	error = recvit(td, uap->s, &msg, (caddr_t)0);
	if (!error) {
		msg.msg_iov = uiov;
		error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
	}
done:
	if (iov != aiov)
		FREE(iov, M_IOV);
done2:
	mtx_unlock(&Giant);
	return (error);
}

/*
 * shutdown(2): disable further sends and/or receives on a socket.
 *
 * MPSAFE
 */
/* ARGSUSED */
int
shutdown(td, uap)
	struct thread *td;
	register struct shutdown_args /* {
		int	s;
		int	how;
	} */ *uap;
{
	struct socket *so;
	int error;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		error = soshutdown(so, uap->how);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return(error);
}

/*
 * setsockopt(2): set a socket option.
 *
 * MPSAFE
 */
/* ARGSUSED */
int
setsockopt(td, uap)
	struct thread *td;
	register struct setsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	valsize;
	} */ *uap;
{
	struct socket *so;
	struct sockopt sopt;
	int error;

	/* A NULL value buffer is only legal when no bytes are supplied. */
	if (uap->val == 0 && uap->valsize != 0)
		return (EFAULT);
	if (uap->valsize < 0)
		return (EINVAL);

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) == 0) {
		sopt.sopt_dir = SOPT_SET;
		sopt.sopt_level = uap->level;
		sopt.sopt_name = uap->name;
		sopt.sopt_val = uap->val;
		sopt.sopt_valsize = uap->valsize;
		sopt.sopt_td = td;
		error = sosetopt(so, &sopt);
		fputsock(so);
	}
	mtx_unlock(&Giant);
	return(error);
}

/*
 * getsockopt(2): fetch a socket option; *avalsize is value/result.
 *
 * MPSAFE
 */
/* ARGSUSED */
int
getsockopt(td, uap)
	struct thread *td;
	register struct getsockopt_args /* {
		int	s;
		int	level;
		int	name;
		caddr_t	val;
		int	*avalsize;
	} */ *uap;
{
	int	valsize, error;
	struct	socket *so;
	struct	sockopt sopt;

	mtx_lock(&Giant);
	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
		goto done2;
	if (uap->val) {
		error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
		    sizeof (valsize));
		if (error)
			goto done1;
		if (valsize < 0) {
			error = EINVAL;
			goto done1;
		}
	} else {
		/* Caller only probes; report how much space is needed. */
		valsize = 0;
	}
	sopt.sopt_dir = SOPT_GET;
sopt.sopt_level = uap->level; sopt.sopt_name = uap->name; sopt.sopt_val = uap->val; sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ sopt.sopt_td = td; error = sogetopt(so, &sopt); if (error == 0) { valsize = sopt.sopt_valsize; error = copyout((caddr_t)&valsize, (caddr_t)uap->avalsize, sizeof (valsize)); } done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * getsockname1() - Get socket name. * * MPSAFE */ /* ARGSUSED */ static int getsockname1(td, uap, compat) struct thread *td; register struct getsockname_args /* { int fdes; caddr_t asa; int *alen; } */ *uap; int compat; { struct socket *so; struct sockaddr *sa; int len, error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) goto done2; error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); if (error) goto done1; sa = 0; error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); if (error) goto bad; if (sa == 0) { len = 0; goto gotnothing; } len = MIN(len, sa->sa_len); #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, (caddr_t)uap->asa, (u_int)len); if (error == 0) gotnothing: error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: if (sa) FREE(sa, M_SONAME); done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int getsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int ogetsockname(td, uap) struct thread *td; struct getsockname_args *uap; { return (getsockname1(td, uap, 1)); } #endif /* COMPAT_OLDSOCK */ /* * getpeername1() - Get name of peer for connected socket. 
* * MPSAFE */ /* ARGSUSED */ static int getpeername1(td, uap, compat) struct thread *td; register struct getpeername_args /* { int fdes; caddr_t asa; int *alen; } */ *uap; int compat; { struct socket *so; struct sockaddr *sa; int len, error; mtx_lock(&Giant); if ((error = fgetsock(td, uap->fdes, &so, NULL)) != 0) goto done2; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) { error = ENOTCONN; goto done1; } error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); if (error) goto done1; sa = 0; error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); if (error) goto bad; if (sa == 0) { len = 0; goto gotnothing; } len = MIN(len, sa->sa_len); #ifdef COMPAT_OLDSOCK if (compat) ((struct osockaddr *)sa)->sa_family = sa->sa_family; #endif error = copyout(sa, (caddr_t)uap->asa, (u_int)len); if (error) goto bad; gotnothing: error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: if (sa) FREE(sa, M_SONAME); done1: fputsock(so); done2: mtx_unlock(&Giant); return (error); } /* * MPSAFE */ int getpeername(td, uap) struct thread *td; struct getpeername_args *uap; { return (getpeername1(td, uap, 0)); } #ifdef COMPAT_OLDSOCK /* * MPSAFE */ int ogetpeername(td, uap) struct thread *td; struct ogetpeername_args *uap; { /* XXX uap should have type `getpeername_args *' to begin with. */ return (getpeername1(td, (struct getpeername_args *)uap, 1)); } #endif /* COMPAT_OLDSOCK */ int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; int buflen, type; { register struct sockaddr *sa; register struct mbuf *m; int error; if ((u_int)buflen > MLEN) { #ifdef COMPAT_OLDSOCK if (type == MT_SONAME && (u_int)buflen <= 112) buflen = MLEN; /* unix domain compat. 
hack */ else #endif return (EINVAL); } m = m_get(M_TRYWAIT, type); if (m == NULL) return (ENOBUFS); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); if (error) (void) m_free(m); else { *mp = m; if (type == MT_SONAME) { sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = buflen; } } return (error); } int getsockaddr(namp, uaddr, len) struct sockaddr **namp; caddr_t uaddr; size_t len; { struct sockaddr *sa; int error; if (len > SOCK_MAXADDRLEN) return ENAMETOOLONG; MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); error = copyin(uaddr, sa, len); if (error) { FREE(sa, M_SONAME); } else { #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN if (sa->sa_family == 0 && sa->sa_len < AF_MAX) sa->sa_family = sa->sa_len; #endif sa->sa_len = len; *namp = sa; } return error; } /* * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) * XXX - The sf_buf functions are currently private to sendfile(2), so have * been made static, but may be useful in the future for doing zero-copy in * other parts of the networking code. */ static void sf_buf_init(void *arg) { int i; mtx_init(&sf_freelist.sf_lock, "sf_bufs list lock", MTX_DEF); mtx_lock(&sf_freelist.sf_lock); SLIST_INIT(&sf_freelist.sf_head); sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT | M_ZERO); for (i = 0; i < nsfbufs; i++) { sf_bufs[i].kva = sf_base + i * PAGE_SIZE; SLIST_INSERT_HEAD(&sf_freelist.sf_head, &sf_bufs[i], free_list); } sf_buf_alloc_want = 0; mtx_unlock(&sf_freelist.sf_lock); } /* * Get an sf_buf from the freelist. Will block if none are available. 
*/
static struct sf_buf *
sf_buf_alloc()
{
	struct sf_buf *sf;
	int error;

	mtx_lock(&sf_freelist.sf_lock);
	while ((sf = SLIST_FIRST(&sf_freelist.sf_head)) == NULL) {
		sf_buf_alloc_want++;
		error = msleep(&sf_freelist, &sf_freelist.sf_lock, PVM|PCATCH,
		    "sfbufa", 0);
		sf_buf_alloc_want--;

		/*
		 * If we got a signal, don't risk going back to sleep.
		 */
		if (error)
			break;
	}
	/* sf is still NULL here only if msleep() was interrupted. */
	if (sf != NULL)
		SLIST_REMOVE_HEAD(&sf_freelist.sf_head, free_list);
	mtx_unlock(&sf_freelist.sf_lock);
	return (sf);
}

/* Map a kva inside the sf_base region back to its sf_buf descriptor. */
#define dtosf(x)	(&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT])

/*
 * Detach the mapped page and release resources back to the system.
 * Runs as the mbuf external-storage free routine (see MEXTADD in
 * sendfile()), so `addr' is the sf_buf's kva and `args' is unused.
 */
static void
sf_buf_free(caddr_t addr, void *args)
{
	struct sf_buf *sf;
	struct vm_page *m;

	GIANT_REQUIRED;

	sf = dtosf(addr);
	pmap_qremove((vm_offset_t)addr, 1);
	m = sf->m;
	vm_page_unwire(m, 0);
	/*
	 * Check for the object going away on us. This can
	 * happen since we don't hold a reference to it.
	 * If so, we're responsible for freeing the page.
	 */
	if (m->wire_count == 0 && m->object == NULL)
		vm_page_free(m);
	sf->m = NULL;
	mtx_lock(&sf_freelist.sf_lock);
	SLIST_INSERT_HEAD(&sf_freelist.sf_head, sf, free_list);
	/* Wake one sleeper in sf_buf_alloc(), if anyone is waiting. */
	if (sf_buf_alloc_want > 0)
		wakeup_one(&sf_freelist);
	mtx_unlock(&sf_freelist.sf_lock);
}

/*
 * sendfile(2)
 *
 * MPSAFE
 *
 * int sendfile(int fd, int s, off_t offset, size_t nbytes,
 *		struct sf_hdtr *hdtr, off_t *sbytes, int flags)
 *
 * Send a file specified by 'fd' and starting at 'offset' to a socket
 * specified by 's'.  Send only 'nbytes' of the file or until EOF if
 * nbytes == 0.  Optionally add a header and/or trailer to the socket
 * output.  If specified, write the total number of bytes sent into *sbytes.
* */ int sendfile(struct thread *td, struct sendfile_args *uap) { struct vnode *vp; struct vm_object *obj; struct socket *so = NULL; struct mbuf *m; struct sf_buf *sf; struct vm_page *pg; struct writev_args nuap; struct sf_hdtr hdtr; off_t off, xfsize, sbytes = 0; int error, s; mtx_lock(&Giant); /* * The descriptor must be a regular file and have a backing VM object. */ if ((error = fgetvp_read(td, uap->fd, &vp)) != 0) goto done; if (vp->v_type != VREG || VOP_GETVOBJECT(vp, &obj) != 0) { error = EINVAL; goto done; } if ((error = fgetsock(td, uap->s, &so, NULL)) != 0) goto done; if (so->so_type != SOCK_STREAM) { error = EINVAL; goto done; } if ((so->so_state & SS_ISCONNECTED) == 0) { error = ENOTCONN; goto done; } if (uap->offset < 0) { error = EINVAL; goto done; } /* * If specified, get the pointer to the sf_hdtr struct for * any headers/trailers. */ if (uap->hdtr != NULL) { error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); if (error) goto done; /* * Send any headers. Wimp out and use writev(2). */ if (hdtr.headers != NULL) { nuap.fd = uap->s; nuap.iovp = hdtr.headers; nuap.iovcnt = hdtr.hdr_cnt; error = writev(td, &nuap); if (error) goto done; sbytes += td->td_retval[0]; } } /* * Protect against multiple writers to the socket. */ (void) sblock(&so->so_snd, M_WAITOK); /* * Loop through the pages in the file, starting with the requested * offset. Get a file page (do I/O if necessary), map the file page * into an sf_buf, attach an mbuf header to the sf_buf, and queue * it on the socket. */ for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { vm_pindex_t pindex; vm_offset_t pgoff; pindex = OFF_TO_IDX(off); retry_lookup: /* * Calculate the amount to transfer. Not to exceed a page, * the EOF, or the passed in nbytes. 
*/ xfsize = obj->un_pager.vnp.vnp_size - off; if (xfsize > PAGE_SIZE) xfsize = PAGE_SIZE; pgoff = (vm_offset_t)(off & PAGE_MASK); if (PAGE_SIZE - pgoff < xfsize) xfsize = PAGE_SIZE - pgoff; if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) xfsize = uap->nbytes - sbytes; if (xfsize <= 0) break; /* * Optimize the non-blocking case by looking at the socket space * before going to the extra work of constituting the sf_buf. */ if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { if (so->so_state & SS_CANTSENDMORE) error = EPIPE; else error = EAGAIN; sbunlock(&so->so_snd); goto done; } /* * Attempt to look up the page. * * Allocate if not found * * Wait and loop if busy. */ pg = vm_page_lookup(obj, pindex); if (pg == NULL) { pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); if (pg == NULL) { VM_WAIT; goto retry_lookup; } vm_page_wakeup(pg); } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) { goto retry_lookup; } /* * Wire the page so it does not get ripped out from under * us. */ vm_page_wire(pg); /* * If page is not valid for what we need, initiate I/O */ if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) { struct uio auio; struct iovec aiov; int bsize; /* * Ensure that our page is still around when the I/O * completes. */ vm_page_io_start(pg); /* * Get the page from backing store. */ bsize = vp->v_mount->mnt_stat.f_iosize; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = 0; aiov.iov_len = MAXBSIZE; auio.uio_resid = MAXBSIZE; auio.uio_offset = trunc_page(off); auio.uio_segflg = UIO_NOCOPY; auio.uio_rw = UIO_READ; auio.uio_td = td; vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16), td->td_proc->p_ucred); VOP_UNLOCK(vp, 0, td); vm_page_flag_clear(pg, PG_ZERO); vm_page_io_finish(pg); if (error) { vm_page_unwire(pg, 0); /* * See if anyone else might know about this page. * If not and it is not valid, then free it. 
*/ if (pg->wire_count == 0 && pg->valid == 0 && pg->busy == 0 && !(pg->flags & PG_BUSY) && pg->hold_count == 0) { vm_page_busy(pg); vm_page_free(pg); } sbunlock(&so->so_snd); goto done; } } /* * Get a sendfile buf. We usually wait as long as necessary, * but this wait can be interrupted. */ if ((sf = sf_buf_alloc()) == NULL) { vm_page_unwire(pg, 0); if (pg->wire_count == 0 && pg->object == NULL) vm_page_free(pg); sbunlock(&so->so_snd); error = EINTR; goto done; } /* * Allocate a kernel virtual page and insert the physical page * into it. */ sf->m = pg; pmap_qenter(sf->kva, &pg, 1); /* * Get an mbuf header and set it up as having external storage. */ MGETHDR(m, M_TRYWAIT, MT_DATA); if (m == NULL) { error = ENOBUFS; sf_buf_free((void *)sf->kva, NULL); sbunlock(&so->so_snd); goto done; } /* * Setup external storage for mbuf. */ MEXTADD(m, sf->kva, PAGE_SIZE, sf_buf_free, NULL, M_RDONLY, EXT_SFBUF); m->m_data = (char *) sf->kva + pgoff; m->m_pkthdr.len = m->m_len = xfsize; /* * Add the buffer to the socket buffer chain. */ s = splnet(); retry_space: /* * Make sure that the socket is still able to take more data. * CANTSENDMORE being true usually means that the connection * was closed. so_error is true when an error was sensed after * a previous send. * The state is checked after the page mapping and buffer * allocation above since those operations may block and make * any socket checks stale. From this point forward, nothing * blocks before the pru_send (or more accurately, any blocking * results in a loop back to here to re-check). */ if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; } else { error = so->so_error; so->so_error = 0; } m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } /* * Wait for socket space to become available. We do this just * after checking the connection state above in order to avoid * a race condition with sbwait(). 
*/ if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { if (so->so_state & SS_NBIO) { m_freem(m); sbunlock(&so->so_snd); splx(s); error = EAGAIN; goto done; } error = sbwait(&so->so_snd); /* * An error from sbwait usually indicates that we've * been interrupted by a signal. If we've sent anything * then return bytes sent, otherwise return the error. */ if (error) { m_freem(m); sbunlock(&so->so_snd); splx(s); goto done; } goto retry_space; } error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td); splx(s); if (error) { sbunlock(&so->so_snd); goto done; } } sbunlock(&so->so_snd); /* * Send trailers. Wimp out and use writev(2). */ if (uap->hdtr != NULL && hdtr.trailers != NULL) { nuap.fd = uap->s; nuap.iovp = hdtr.trailers; nuap.iovcnt = hdtr.trl_cnt; error = writev(td, &nuap); if (error) goto done; sbytes += td->td_retval[0]; } done: /* * If there was no error we have to clear td->td_retval[0] * because it may have been set by writev. */ if (error == 0) { td->td_retval[0] = 0; } if (uap->sbytes != NULL) { copyout(&sbytes, uap->sbytes, sizeof(off_t)); } if (vp) vrele(vp); if (so) fputsock(so); mtx_unlock(&Giant); return (error); } - Index: head/sys/kern/uipc_usrreq.c =================================================================== --- head/sys/kern/uipc_usrreq.c (revision 89305) +++ head/sys/kern/uipc_usrreq.c (revision 89306) @@ -1,1478 +1,1518 @@ /* * Copyright (c) 1982, 1986, 1989, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. 
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include /* XXX must be before */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include static struct vm_zone *unp_zone; static unp_gen_t unp_gencnt; static u_int unp_count; static struct unp_head unp_shead, unp_dhead; /* * Unix communications domain. 
* * TODO: * SEQPACKET, RDM * rethink name space problems * need a proper out-of-band * lock pushdown */ static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; static ino_t unp_ino; /* prototype for fake inode numbers */ static int unp_attach __P((struct socket *)); static void unp_detach __P((struct unpcb *)); static int unp_bind __P((struct unpcb *,struct sockaddr *, struct thread *)); static int unp_connect __P((struct socket *,struct sockaddr *, struct thread *)); static void unp_disconnect __P((struct unpcb *)); static void unp_shutdown __P((struct unpcb *)); static void unp_drop __P((struct unpcb *, int)); static void unp_gc __P((void)); static void unp_scan __P((struct mbuf *, void (*)(struct file *))); static void unp_mark __P((struct file *)); static void unp_discard __P((struct file *)); static void unp_freerights __P((struct file **, int)); static int unp_internalize __P((struct mbuf **, struct thread *)); static int unp_listen __P((struct unpcb *, struct proc *)); static int uipc_abort(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; unp_drop(unp, ECONNABORTED); return 0; } static int uipc_accept(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; /* * Pass back name of connected socket, * if it was bound and we are still connected * (our peer may have closed already!). 
*/ if (unp->unp_conn && unp->unp_conn->unp_addr) { *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, 1); } else { *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); } return 0; } static int uipc_attach(struct socket *so, int proto, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp != 0) return EISCONN; return unp_attach(so); } static int uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; return unp_bind(unp, nam, td); } static int uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; return unp_connect(so, nam, curthread); } static int uipc_connect2(struct socket *so1, struct socket *so2) { struct unpcb *unp = sotounpcb(so1); if (unp == 0) return EINVAL; return unp_connect2(so1, so2); } /* control is EOPNOTSUPP */ static int uipc_detach(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; unp_detach(unp); return 0; } static int uipc_disconnect(struct socket *so) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; unp_disconnect(unp); return 0; } static int uipc_listen(struct socket *so, struct thread *td) { struct unpcb *unp = sotounpcb(so); if (unp == 0 || unp->unp_vnode == 0) return EINVAL; return unp_listen(unp, td->td_proc); } static int uipc_peeraddr(struct socket *so, struct sockaddr **nam) { struct unpcb *unp = sotounpcb(so); if (unp == 0) return EINVAL; if (unp->unp_conn && unp->unp_conn->unp_addr) *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, 1); return 0; } static int uipc_rcvd(struct socket *so, int flags) { struct unpcb *unp = sotounpcb(so); struct socket *so2; u_long newhiwat; if (unp == 0) return EINVAL; switch (so->so_type) { case SOCK_DGRAM: panic("uipc_rcvd DGRAM?"); /*NOTREACHED*/ case SOCK_STREAM: if (unp->unp_conn == 0) break; so2 = unp->unp_conn->unp_socket; /* * Adjust backpressure 
on sender * and wakeup any waiting to write. */ so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt; unp->unp_mbcnt = so->so_rcv.sb_mbcnt; newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - so->so_rcv.sb_cc; (void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat, newhiwat, RLIM_INFINITY); unp->unp_cc = so->so_rcv.sb_cc; sowwakeup(so2); break; default: panic("uipc_rcvd unknown socktype"); } return 0; } /* pru_rcvoob is EOPNOTSUPP */ static int uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { int error = 0; struct unpcb *unp = sotounpcb(so); struct socket *so2; u_long newhiwat; if (unp == 0) { error = EINVAL; goto release; } if (flags & PRUS_OOB) { error = EOPNOTSUPP; goto release; } if (control && (error = unp_internalize(&control, td))) goto release; switch (so->so_type) { case SOCK_DGRAM: { struct sockaddr *from; if (nam) { if (unp->unp_conn) { error = EISCONN; break; } error = unp_connect(so, nam, td); if (error) break; } else { if (unp->unp_conn == 0) { error = ENOTCONN; break; } } so2 = unp->unp_conn->unp_socket; if (unp->unp_addr) from = (struct sockaddr *)unp->unp_addr; else from = &sun_noname; if (sbappendaddr(&so2->so_rcv, from, m, control)) { sorwakeup(so2); m = 0; control = 0; } else error = ENOBUFS; if (nam) unp_disconnect(unp); break; } case SOCK_STREAM: /* Connect if not connected yet. */ /* * Note: A better implementation would complain * if not equal to the peer's address. */ if ((so->so_state & SS_ISCONNECTED) == 0) { if (nam) { error = unp_connect(so, nam, td); if (error) break; /* XXX */ } else { error = ENOTCONN; break; } } if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; break; } if (unp->unp_conn == 0) panic("uipc_send connected but no connection?"); so2 = unp->unp_conn->unp_socket; /* * Send to paired receive port, and then reduce * send buffer hiwater marks to maintain backpressure. * Wake up readers. 
*/
		if (control) {
			if (sbappendcontrol(&so2->so_rcv, m, control))
				control = 0;
		} else
			sbappend(&so2->so_rcv, m);
		/* Charge this send's buffer growth back to our side. */
		so->so_snd.sb_mbmax -=
			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
		newhiwat = so->so_snd.sb_hiwat -
		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
		    newhiwat, RLIM_INFINITY);
		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
		sorwakeup(so2);
		m = 0;
		break;

	default:
		panic("uipc_send unknown socktype");
	}

	/*
	 * SEND_EOF is equivalent to a SEND followed by
	 * a SHUTDOWN.
	 */
	if (flags & PRUS_EOF) {
		socantsendmore(so);
		unp_shutdown(unp);
	}

	/* On failure, discard any rights still embedded in the control. */
	if (control && error != 0)
		unp_dispose(control);

release:
	if (control)
		m_freem(control);
	if (m)
		m_freem(m);
	return error;
}

/*
 * pru_sense: fill in the stat(2) fields for fstat() on a local socket.
 */
static int
uipc_sense(struct socket *so, struct stat *sb)
{
	struct unpcb *unp = sotounpcb(so);
	struct socket *so2;

	if (unp == 0)
		return EINVAL;
	sb->st_blksize = so->so_snd.sb_hiwat;
	if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) {
		so2 = unp->unp_conn->unp_socket;
		sb->st_blksize += so2->so_rcv.sb_cc;
	}
	sb->st_dev = NOUDEV;
	/*
	 * Hand out a fake inode number the first time we are stat'ed.
	 * NOTE(review): unp_ino starts at 0, so the first socket gets
	 * ino 0 and re-enters this branch on every call — looks like
	 * this should be a pre-increment; confirm intent.
	 */
	if (unp->unp_ino == 0)
		unp->unp_ino = unp_ino++;
	sb->st_ino = unp->unp_ino;
	return (0);
}

/* pru_shutdown: disallow further sends and notify the peer. */
static int
uipc_shutdown(struct socket *so)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == 0)
		return EINVAL;
	socantsendmore(so);
	unp_shutdown(unp);
	return 0;
}

/* pru_sockaddr: return our own bound name, or the unnamed sockaddr. */
static int
uipc_sockaddr(struct socket *so, struct sockaddr **nam)
{
	struct unpcb *unp = sotounpcb(so);

	if (unp == 0)
		return EINVAL;
	if (unp->unp_addr)
		*nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1);
	else
		*nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1);
	return 0;
}

/* Protocol user-request switch for the local (UNIX) domain. */
struct pr_usrreqs uipc_usrreqs = {
	uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect,
	uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect,
	uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp,
	uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr,
	sosend, soreceive, sopoll
};

int
uipc_ctloutput(so, sopt)
	struct socket *so;
struct sockopt *sopt; { struct unpcb *unp = sotounpcb(so); int error; switch (sopt->sopt_dir) { case SOPT_GET: switch (sopt->sopt_name) { case LOCAL_PEERCRED: if (unp->unp_flags & UNP_HAVEPC) error = sooptcopyout(sopt, &unp->unp_peercred, sizeof(unp->unp_peercred)); else { if (so->so_type == SOCK_STREAM) error = ENOTCONN; else error = EINVAL; } break; default: error = EOPNOTSUPP; break; } break; case SOPT_SET: default: error = EOPNOTSUPP; break; } return (error); } /* * Both send and receive buffers are allocated PIPSIZ bytes of buffering * for stream sockets, although the total for sender and receiver is * actually only PIPSIZ. * Datagram sockets really use the sendspace as the maximum datagram size, * and don't really want to reserve the sendspace. Their recvspace should * be large enough for at least one max-size datagram plus address. */ #ifndef PIPSIZ #define PIPSIZ 8192 #endif static u_long unpst_sendspace = PIPSIZ; static u_long unpst_recvspace = PIPSIZ; static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ static u_long unpdg_recvspace = 4*1024; static int unp_rights; /* file descriptors in flight */ SYSCTL_DECL(_net_local_stream); SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, &unpst_sendspace, 0, ""); SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, &unpst_recvspace, 0, ""); SYSCTL_DECL(_net_local_dgram); SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, &unpdg_sendspace, 0, ""); SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, &unpdg_recvspace, 0, ""); SYSCTL_DECL(_net_local); SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); static int unp_attach(so) struct socket *so; { register struct unpcb *unp; int error; if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { case SOCK_STREAM: error = soreserve(so, unpst_sendspace, unpst_recvspace); break; case SOCK_DGRAM: error = soreserve(so, unpdg_sendspace, unpdg_recvspace); break; 
default: panic("unp_attach"); } if (error) return (error); } unp = zalloc(unp_zone); if (unp == NULL) return (ENOBUFS); bzero(unp, sizeof *unp); unp->unp_gencnt = ++unp_gencnt; unp_count++; LIST_INIT(&unp->unp_refs); unp->unp_socket = so; + FILEDESC_LOCK(curproc->p_fd); unp->unp_rvnode = curthread->td_proc->p_fd->fd_rdir; + FILEDESC_UNLOCK(curproc->p_fd); LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead, unp, unp_link); so->so_pcb = (caddr_t)unp; return (0); } static void unp_detach(unp) register struct unpcb *unp; { LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; --unp_count; if (unp->unp_vnode) { unp->unp_vnode->v_socket = 0; vrele(unp->unp_vnode); unp->unp_vnode = 0; } if (unp->unp_conn) unp_disconnect(unp); while (!LIST_EMPTY(&unp->unp_refs)) unp_drop(LIST_FIRST(&unp->unp_refs), ECONNRESET); soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = 0; if (unp_rights) { /* * Normally the receive buffer is flushed later, * in sofree, but if our receive buffer holds references * to descriptors that are now garbage, we will dispose * of those descriptor references after the garbage collector * gets them (resulting in a "panic: closef: count < 0"). 
*/
		sorflush(unp->unp_socket);
		unp_gc();
	}
	if (unp->unp_addr)
		FREE(unp->unp_addr, M_SONAME);
	zfree(unp_zone, unp);
}

/*
 * Bind a local socket to a pathname, creating a VSOCK vnode for it.
 * Fails with EADDRINUSE if the path already exists, and with EINVAL if
 * the socket is already bound or the supplied name is empty.
 */
static int
unp_bind(unp, nam, td)
	struct unpcb *unp;
	struct sockaddr *nam;
	struct thread *td;
{
	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
	struct vnode *vp;
	struct mount *mp;
	struct vattr vattr;
	int error, namelen;
	struct nameidata nd;
	char *buf;

	if (unp->unp_vnode != NULL)
		return (EINVAL);
	/* sun_path need not be NUL-terminated; derive its length. */
	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
	if (namelen <= 0)
		return EINVAL;
	buf = malloc(SOCK_MAXADDRLEN, M_TEMP, M_WAITOK);
	strncpy(buf, soun->sun_path, namelen);
	buf[namelen] = 0;	/* null-terminate the string */
restart:
	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT, UIO_SYSSPACE,
	    buf, td);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
	error = namei(&nd);
	if (error) {
		free(buf, M_TEMP);
		return (error);
	}
	vp = nd.ni_vp;
	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
		NDFREE(&nd, NDF_ONLY_PNBUF);
		if (nd.ni_dvp == vp)
			vrele(nd.ni_dvp);
		else
			vput(nd.ni_dvp);
		if (vp != NULL) {
			/* The name already exists: report EADDRINUSE. */
			vrele(vp);
			free(buf, M_TEMP);
			return (EADDRINUSE);
		}
		/* Wait for the suspended filesystem, then retry lookup. */
		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
		if (error) {
			free(buf, M_TEMP);
			return (error);
		}
		goto restart;
	}
	VATTR_NULL(&vattr);
	vattr.va_type = VSOCK;
	/* The filedesc lock protects fd_cmask against concurrent umask(2). */
	FILEDESC_LOCK(td->td_proc->p_fd);
	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
	FILEDESC_UNLOCK(td->td_proc->p_fd);
	VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE);
	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vput(nd.ni_dvp);
	if (error) {
		free(buf, M_TEMP);
		return (error);
	}
	vp = nd.ni_vp;
	/* Cross-link the vnode and the socket; unp_detach undoes this. */
	vp->v_socket = unp->unp_socket;
	unp->unp_vnode = vp;
	unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1);
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	free(buf, M_TEMP);
	return (0);
}

/*
 * Connect a socket to the local socket bound at the pathname in `nam'.
 */
static int
unp_connect(so, nam, td)
	struct socket *so;
	struct sockaddr *nam;
	struct thread *td;
{
	register struct sockaddr_un *soun = (struct sockaddr_un *)nam;
register struct vnode *vp; register struct socket *so2, *so3; struct unpcb *unp, *unp2, *unp3; int error, len; struct nameidata nd; char buf[SOCK_MAXADDRLEN]; len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); if (len <= 0) return EINVAL; strncpy(buf, soun->sun_path, len); buf[len] = 0; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td); error = namei(&nd); if (error) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } error = VOP_ACCESS(vp, VWRITE, td->td_proc->p_ucred, td); if (error) goto bad; so2 = vp->v_socket; if (so2 == 0) { error = ECONNREFUSED; goto bad; } if (so->so_type != so2->so_type) { error = EPROTOTYPE; goto bad; } if (so->so_proto->pr_flags & PR_CONNREQUIRED) { if ((so2->so_options & SO_ACCEPTCONN) == 0 || (so3 = sonewconn(so2, 0)) == 0) { error = ECONNREFUSED; goto bad; } unp = sotounpcb(so); unp2 = sotounpcb(so2); unp3 = sotounpcb(so3); if (unp2->unp_addr) unp3->unp_addr = (struct sockaddr_un *) dup_sockaddr((struct sockaddr *) unp2->unp_addr, 1); /* * unp_peercred management: * * The connecter's (client's) credentials are copied * from its process structure at the time of connect() * (which is now). */ memset(&unp3->unp_peercred, '\0', sizeof(unp3->unp_peercred)); unp3->unp_peercred.cr_uid = td->td_proc->p_ucred->cr_uid; unp3->unp_peercred.cr_ngroups = td->td_proc->p_ucred->cr_ngroups; memcpy(unp3->unp_peercred.cr_groups, td->td_proc->p_ucred->cr_groups, sizeof(unp3->unp_peercred.cr_groups)); unp3->unp_flags |= UNP_HAVEPC; /* * The receiver's (server's) credentials are copied * from the unp_peercred member of socket on which the * former called listen(); unp_listen() cached that * process's credentials at that time so we can use * them now. 
*/ KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED, ("unp_connect: listener without cached peercred")); memcpy(&unp->unp_peercred, &unp2->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_flags |= UNP_HAVEPC; so2 = so3; } error = unp_connect2(so, so2); bad: vput(vp); return (error); } int unp_connect2(so, so2) register struct socket *so; register struct socket *so2; { register struct unpcb *unp = sotounpcb(so); register struct unpcb *unp2; if (so2->so_type != so->so_type) return (EPROTOTYPE); unp2 = sotounpcb(so2); unp->unp_conn = unp2; switch (so->so_type) { case SOCK_DGRAM: LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); soisconnected(so); break; case SOCK_STREAM: unp2->unp_conn = unp; soisconnected(so); soisconnected(so2); break; default: panic("unp_connect2"); } return (0); } static void unp_disconnect(unp) struct unpcb *unp; { register struct unpcb *unp2 = unp->unp_conn; if (unp2 == 0) return; unp->unp_conn = 0; switch (unp->unp_socket->so_type) { case SOCK_DGRAM: LIST_REMOVE(unp, unp_reflink); unp->unp_socket->so_state &= ~SS_ISCONNECTED; break; case SOCK_STREAM: soisdisconnected(unp->unp_socket); unp2->unp_conn = 0; soisdisconnected(unp2->unp_socket); break; } } #ifdef notdef void unp_abort(unp) struct unpcb *unp; { unp_detach(unp); } #endif static int unp_pcblist(SYSCTL_HANDLER_ARGS) { int error, i, n; struct unpcb *unp, **unp_list; unp_gen_t gencnt; struct xunpgen *xug; struct unp_head *head; struct xunpcb *xu; head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); /* * The process of preparing the PCB list is too time-consuming and * resource-intensive to repeat twice on every request. */ if (req->oldptr == 0) { n = unp_count; req->oldidx = 2 * (sizeof *xug) + (n + n/8) * sizeof(struct xunpcb); return 0; } if (req->newptr != 0) return EPERM; /* * OK, now we're committed to doing something. 
*/ xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK); gencnt = unp_gencnt; n = unp_count; xug->xug_len = sizeof *xug; xug->xug_count = n; xug->xug_gen = gencnt; xug->xug_sogen = so_gencnt; error = SYSCTL_OUT(req, xug, sizeof *xug); if (error) { free(xug, M_TEMP); return error; } unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); for (unp = LIST_FIRST(head), i = 0; unp && i < n; unp = LIST_NEXT(unp, unp_link)) { if (unp->unp_gencnt <= gencnt) { if (cr_cansee(req->td->td_proc->p_ucred, unp->unp_socket->so_cred)) continue; unp_list[i++] = unp; } } n = i; /* in case we lost some during malloc */ error = 0; xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK); for (i = 0; i < n; i++) { unp = unp_list[i]; if (unp->unp_gencnt <= gencnt) { xu->xu_len = sizeof *xu; xu->xu_unpp = unp; /* * XXX - need more locking here to protect against * connect/disconnect races for SMP. */ if (unp->unp_addr) bcopy(unp->unp_addr, &xu->xu_addr, unp->unp_addr->sun_len); if (unp->unp_conn && unp->unp_conn->unp_addr) bcopy(unp->unp_conn->unp_addr, &xu->xu_caddr, unp->unp_conn->unp_addr->sun_len); bcopy(unp, &xu->xu_unp, sizeof *unp); sotoxsocket(unp->unp_socket, &xu->xu_socket); error = SYSCTL_OUT(req, xu, sizeof *xu); } } free(xu, M_TEMP); if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
*/ xug->xug_gen = unp_gencnt; xug->xug_sogen = so_gencnt; xug->xug_count = unp_count; error = SYSCTL_OUT(req, xug, sizeof *xug); } free(unp_list, M_TEMP); free(xug, M_TEMP); return error; } SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", "List of active local datagram sockets"); SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", "List of active local stream sockets"); static void unp_shutdown(unp) struct unpcb *unp; { struct socket *so; if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && (so = unp->unp_conn->unp_socket)) socantrcvmore(so); } static void unp_drop(unp, errno) struct unpcb *unp; int errno; { struct socket *so = unp->unp_socket; so->so_error = errno; unp_disconnect(unp); if (so->so_head) { LIST_REMOVE(unp, unp_link); unp->unp_gencnt = ++unp_gencnt; unp_count--; so->so_pcb = (caddr_t) 0; if (unp->unp_addr) FREE(unp->unp_addr, M_SONAME); zfree(unp_zone, unp); sotryfree(so); } } #ifdef notdef void unp_drain() { } #endif static void unp_freerights(rp, fdcount) struct file **rp; int fdcount; { int i; struct file *fp; for (i = 0; i < fdcount; i++) { fp = *rp; /* * zero the pointer before calling * unp_discard since it may end up * in unp_gc().. 
*/ *rp++ = 0; unp_discard(fp); } } int unp_externalize(control, controlp) struct mbuf *control, **controlp; { struct thread *td = curthread; /* XXX */ struct cmsghdr *cm = mtod(control, struct cmsghdr *); int i; int *fdp; struct file **rp; struct file *fp; void *data; socklen_t clen = control->m_len, datalen; int error, newfds; int f; u_int newlen; error = 0; if (controlp != NULL) /* controlp == NULL => free control messages */ *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) { error = EINVAL; break; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { newfds = datalen / sizeof(struct file *); rp = data; /* If we're not outputting the discriptors free them. */ if (error || controlp == NULL) { unp_freerights(rp, newfds); goto next; } + FILEDESC_LOCK(td->td_proc->p_fd); /* if the new FD's will not fit free them. */ if (!fdavail(td, newfds)) { + FILEDESC_UNLOCK(td->td_proc->p_fd); error = EMSGSIZE; unp_freerights(rp, newfds); goto next; } /* * now change each pointer to an fd in the global * table to an integer that is the index to the * local fd table entry that we set up to point * to the global one we are transferring. 
*/ newlen = newfds * sizeof(int); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { + FILEDESC_UNLOCK(td->td_proc->p_fd); error = E2BIG; unp_freerights(rp, newfds); goto next; } fdp = (int *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < newfds; i++) { if (fdalloc(td, 0, &f)) panic("unp_externalize fdalloc failed"); fp = *rp++; td->td_proc->p_fd->fd_ofiles[f] = fp; + FILE_LOCK(fp); fp->f_msgcount--; + FILE_UNLOCK(fp); unp_rights--; *fdp++ = f; } + FILEDESC_UNLOCK(td->td_proc->p_fd); } else { /* We can just copy anything else across */ if (error || controlp == NULL) goto next; *controlp = sbcreatecontrol(NULL, datalen, cm->cmsg_type, cm->cmsg_level); if (*controlp == NULL) { error = ENOBUFS; goto next; } bcopy(data, CMSG_DATA(mtod(*controlp, struct cmsghdr *)), datalen); } controlp = &(*controlp)->m_next; next: if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } + FILEDESC_UNLOCK(td->td_proc->p_fd); m_freem(control); return (error); } void unp_init(void) { unp_zone = zinit("unpcb", sizeof(struct unpcb), nmbclusters, 0, 0); if (unp_zone == 0) panic("unp_init"); LIST_INIT(&unp_dhead); LIST_INIT(&unp_shead); } #ifndef MIN #define MIN(a,b) (((a)<(b))?(a):(b)) #endif static int unp_internalize(controlp, td) struct mbuf **controlp; struct thread *td; { struct mbuf *control = *controlp; struct proc *p = td->td_proc; struct filedesc *fdescp = p->p_fd; struct cmsghdr *cm = mtod(control, struct cmsghdr *); struct cmsgcred *cmcred; struct file **rp; struct file *fp; struct timeval *tv; int i, fd, *fdp; void *data; socklen_t clen = control->m_len, datalen; int error, oldfds; u_int newlen; error = 0; *controlp = NULL; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > clen) { error = EINVAL; goto out; } data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - 
(caddr_t)data; switch (cm->cmsg_type) { /* * Fill in credential information. */ case SCM_CREDS: *controlp = sbcreatecontrol(NULL, sizeof(*cmcred), SCM_CREDS, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } cmcred = (struct cmsgcred *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); cmcred->cmcred_pid = p->p_pid; cmcred->cmcred_uid = p->p_ucred->cr_ruid; cmcred->cmcred_gid = p->p_ucred->cr_rgid; cmcred->cmcred_euid = p->p_ucred->cr_uid; cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups, CMGROUP_MAX); for (i = 0; i < cmcred->cmcred_ngroups; i++) cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i]; break; case SCM_RIGHTS: oldfds = datalen / sizeof (int); /* * check that all the FDs passed in refer to legal files * If not, reject the entire operation. */ fdp = data; + FILEDESC_LOCK(fdescp); for (i = 0; i < oldfds; i++) { fd = *fdp++; if ((unsigned)fd >= fdescp->fd_nfiles || fdescp->fd_ofiles[fd] == NULL) { + FILEDESC_UNLOCK(fdescp); error = EBADF; goto out; } } /* * Now replace the integer FDs with pointers to * the associated global file table entry.. 
*/ newlen = oldfds * sizeof(struct file *); *controlp = sbcreatecontrol(NULL, newlen, SCM_RIGHTS, SOL_SOCKET); if (*controlp == NULL) { + FILEDESC_UNLOCK(fdescp); error = E2BIG; goto out; } fdp = data; rp = (struct file **) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); for (i = 0; i < oldfds; i++) { fp = fdescp->fd_ofiles[*fdp++]; *rp++ = fp; + FILE_LOCK(fp); fp->f_count++; fp->f_msgcount++; + FILE_UNLOCK(fp); unp_rights++; } + FILEDESC_UNLOCK(fdescp); break; case SCM_TIMESTAMP: *controlp = sbcreatecontrol(NULL, sizeof(*tv), SCM_TIMESTAMP, SOL_SOCKET); if (*controlp == NULL) { error = ENOBUFS; goto out; } tv = (struct timeval *) CMSG_DATA(mtod(*controlp, struct cmsghdr *)); microtime(tv); break; default: error = EINVAL; goto out; } controlp = &(*controlp)->m_next; if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } out: m_freem(control); return (error); } static int unp_defer, unp_gcing; static void unp_gc() { register struct file *fp, *nextfp; register struct socket *so; struct file **extra_ref, **fpp; int nunref, i; if (unp_gcing) return; unp_gcing = 1; unp_defer = 0; /* * before going through all this, set all FDs to * be NOT defered and NOT externally accessible */ + sx_slock(&filelist_lock); LIST_FOREACH(fp, &filehead, f_list) - fp->f_flag &= ~(FMARK|FDEFER); + fp->f_gcflag &= ~(FMARK|FDEFER); do { LIST_FOREACH(fp, &filehead, f_list) { + FILE_LOCK(fp); /* * If the file is not open, skip it */ - if (fp->f_count == 0) + if (fp->f_count == 0) { + FILE_UNLOCK(fp); continue; + } /* * If we already marked it as 'defer' in a * previous pass, then try process it this time * and un-mark it */ - if (fp->f_flag & FDEFER) { - fp->f_flag &= ~FDEFER; + if (fp->f_gcflag & FDEFER) { + fp->f_gcflag &= ~FDEFER; unp_defer--; } else { /* * if it's not defered, then check if it's * already marked.. 
if so skip it */ - if (fp->f_flag & FMARK) + if (fp->f_gcflag & FMARK) { + FILE_UNLOCK(fp); continue; + } /* * If all references are from messages * in transit, then skip it. it's not * externally accessible. */ - if (fp->f_count == fp->f_msgcount) + if (fp->f_count == fp->f_msgcount) { + FILE_UNLOCK(fp); continue; + } /* * If it got this far then it must be * externally accessible. */ - fp->f_flag |= FMARK; + fp->f_gcflag |= FMARK; } /* * either it was defered, or it is externally * accessible and not already marked so. * Now check if it is possibly one of OUR sockets. */ if (fp->f_type != DTYPE_SOCKET || - (so = (struct socket *)fp->f_data) == 0) + (so = (struct socket *)fp->f_data) == 0) { + FILE_UNLOCK(fp); continue; + } + FILE_UNLOCK(fp); if (so->so_proto->pr_domain != &localdomain || (so->so_proto->pr_flags&PR_RIGHTS) == 0) continue; #ifdef notdef if (so->so_rcv.sb_flags & SB_LOCK) { /* * This is problematical; it's not clear * we need to wait for the sockbuf to be * unlocked (on a uniprocessor, at least), * and it's also not clear what to do * if sbwait returns an error due to receipt * of a signal. If sbwait does return * an error, we'll go into an infinite * loop. Delete all of this for now. */ (void) sbwait(&so->so_rcv); goto restart; } #endif /* * So, Ok, it's one of our sockets and it IS externally * accessible (or was defered). Now we look * to see if we hold any file descriptors in its * message buffers. Follow those links and mark them * as accessible too. */ unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); + sx_sunlock(&filelist_lock); /* * We grab an extra reference to each of the file table entries * that are not otherwise accessible and then free the rights * that are stored in messages on them. * * The bug in the orginal code is a little tricky, so I'll describe * what's wrong with it here. 
* * It is incorrect to simply unp_discard each entry for f_msgcount * times -- consider the case of sockets A and B that contain * references to each other. On a last close of some other socket, * we trigger a gc since the number of outstanding rights (unp_rights) * is non-zero. If during the sweep phase the gc code un_discards, * we end up doing a (full) closef on the descriptor. A closef on A * results in the following chain. Closef calls soo_close, which * calls soclose. Soclose calls first (through the switch * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply * returns because the previous instance had set unp_gcing, and * we return all the way back to soclose, which marks the socket * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush * to free up the rights that are queued in messages on the socket A, * i.e., the reference on B. The sorflush calls via the dom_dispose * switch unp_dispose, which unp_scans with unp_discard. This second * instance of unp_discard just calls closef on B. * * Well, a similar chain occurs on B, resulting in a sorflush on B, * which results in another closef on A. Unfortunately, A is already * being closed, and the descriptor has already been marked with * SS_NOFDREF, and soclose panics at this point. * * Here, we first take an extra reference to each inaccessible * descriptor. Then, we call sorflush ourself, since we know * it is a Unix domain socket anyhow. After we destroy all the * rights carried in messages, we do a last closef to get rid * of our extra reference. This is the last close, and the * unp_detach etc will shut down the socket. 
* * 91/09/19, bsy@cs.cmu.edu */ extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK); + sx_slock(&filelist_lock); for (nunref = 0, fp = LIST_FIRST(&filehead), fpp = extra_ref; fp != 0; fp = nextfp) { nextfp = LIST_NEXT(fp, f_list); + FILE_LOCK(fp); /* * If it's not open, skip it */ - if (fp->f_count == 0) + if (fp->f_count == 0) { + FILE_UNLOCK(fp); continue; + } /* * If all refs are from msgs, and it's not marked accessible * then it must be referenced from some unreachable cycle * of (shut-down) FDs, so include it in our * list of FDs to remove */ - if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { + if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) { *fpp++ = fp; nunref++; fp->f_count++; } + FILE_UNLOCK(fp); } + sx_sunlock(&filelist_lock); /* * for each FD on our hit list, do the following two things */ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { struct file *tfp = *fpp; - if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) + FILE_LOCK(tfp); + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) { + FILE_UNLOCK(tfp); sorflush((struct socket *)(tfp->f_data)); + } else + FILE_UNLOCK(tfp); } for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) closef(*fpp, (struct thread *) NULL); free((caddr_t)extra_ref, M_FILE); unp_gcing = 0; } void unp_dispose(m) struct mbuf *m; { if (m) unp_scan(m, unp_discard); } static int unp_listen(unp, p) struct unpcb *unp; struct proc *p; { bzero(&unp->unp_peercred, sizeof(unp->unp_peercred)); unp->unp_peercred.cr_uid = p->p_ucred->cr_uid; unp->unp_peercred.cr_ngroups = p->p_ucred->cr_ngroups; bcopy(p->p_ucred->cr_groups, unp->unp_peercred.cr_groups, sizeof(unp->unp_peercred.cr_groups)); unp->unp_flags |= UNP_HAVEPCCACHED; return (0); } static void unp_scan(m0, op) register struct mbuf *m0; void (*op) __P((struct file *)); { struct mbuf *m; struct file **rp; struct cmsghdr *cm; void *data; int i; socklen_t clen, datalen; int qfds; while (m0) { for (m = m0; m; m = m->m_next) { if 
(m->m_type != MT_CONTROL) continue; cm = mtod(m, struct cmsghdr *); clen = m->m_len; while (cm != NULL) { if (sizeof(*cm) > clen || cm->cmsg_len > clen) break; data = CMSG_DATA(cm); datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data; if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { qfds = datalen / sizeof (struct file *); rp = data; for (i = 0; i < qfds; i++) (*op)(*rp++); } if (CMSG_SPACE(datalen) < clen) { clen -= CMSG_SPACE(datalen); cm = (struct cmsghdr *) ((caddr_t)cm + CMSG_SPACE(datalen)); } else { clen = 0; cm = NULL; } } } m0 = m0->m_act; } } static void unp_mark(fp) struct file *fp; { - - if (fp->f_flag & FMARK) + if (fp->f_gcflag & FMARK) return; unp_defer++; - fp->f_flag |= (FMARK|FDEFER); + fp->f_gcflag |= (FMARK|FDEFER); } static void unp_discard(fp) struct file *fp; { - + FILE_LOCK(fp); fp->f_msgcount--; unp_rights--; + FILE_UNLOCK(fp); (void) closef(fp, (struct thread *)NULL); } Index: head/sys/kern/vfs_acl.c =================================================================== --- head/sys/kern/vfs_acl.c (revision 89305) +++ head/sys/kern/vfs_acl.c (revision 89306) @@ -1,817 +1,821 @@ /*- * Copyright (c) 1999-2001 Robert N. M. Watson * All rights reserved. * * This software was developed by Robert Watson for the TrustedBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ /* * Developed by the TrustedBSD Project. * Support for POSIX.1e access control lists. */ #include "opt_cap.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include MALLOC_DEFINE(M_ACL, "acl", "access control list"); static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp); /* * Implement a version of vaccess() that understands POSIX.1e ACL semantics. * Return 0 on success, else an errno value. Should be merged into * vaccess() eventually. */ int vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid, struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused) { struct acl_entry *acl_other, *acl_mask; mode_t dac_granted; mode_t cap_granted; mode_t acl_mask_granted; int group_matched, i; /* * Look for a normal, non-privileged way to access the file/directory * as requested. If it exists, go with that. Otherwise, attempt * to use privileges granted via cap_granted. 
In some cases, * which privileges to use may be ambiguous due to "best match", * in which case fall back on first match for the time being. */ if (privused != NULL) *privused = 0; /* * Determine privileges now, but don't apply until we've found * a DAC entry that matches but has failed to allow access. */ #ifndef CAPABILITIES if (suser_xxx(cred, NULL, PRISON_ROOT) == 0) cap_granted = (VEXEC | VREAD | VWRITE | VADMIN); else cap_granted = 0; #else cap_granted = 0; if (type == VDIR) { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VEXEC; } else { if ((acc_mode & VEXEC) && !cap_check(cred, NULL, CAP_DAC_EXECUTE, PRISON_ROOT)) cap_granted |= VEXEC; } if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, PRISON_ROOT)) cap_granted |= VREAD; if ((acc_mode & VWRITE) && !cap_check(cred, NULL, CAP_DAC_WRITE, PRISON_ROOT)) cap_granted |= VWRITE; if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER, PRISON_ROOT)) cap_granted |= VADMIN; #endif /* CAPABILITIES */ /* * The owner matches if the effective uid associated with the * credential matches that of the ACL_USER_OBJ entry. While we're * doing the first scan, also cache the location of the ACL_MASK * and ACL_OTHER entries, preventing some future iterations. 
*/ acl_mask = acl_other = NULL; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: if (file_uid != cred->cr_uid) break; dac_granted = 0; dac_granted |= VADMIN; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } goto error; case ACL_MASK: acl_mask = &acl->acl_entry[i]; break; case ACL_OTHER: acl_other = &acl->acl_entry[i]; break; default: } } /* * An ACL_OTHER entry should always exist in a valid access * ACL. If it doesn't, then generate a serious failure. For now, * this means a debugging message and EPERM, but in the future * should probably be a panic. */ if (acl_other == NULL) { /* * XXX This should never happen */ printf("vaccess_acl_posix1e: ACL_OTHER missing\n"); return (EPERM); } /* * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields * are masked by an ACL_MASK entry, if any. As such, first identify * the ACL_MASK field, then iterate through identifying potential * user matches, then group matches. If there is no ACL_MASK, * assume that the mask allows all requests to succeed. */ if (acl_mask != NULL) { acl_mask_granted = 0; if (acl_mask->ae_perm & ACL_EXECUTE) acl_mask_granted |= VEXEC; if (acl_mask->ae_perm & ACL_READ) acl_mask_granted |= VREAD; if (acl_mask->ae_perm & ACL_WRITE) acl_mask_granted |= VWRITE; } else acl_mask_granted = VEXEC | VREAD | VWRITE; /* * Iterate through user ACL entries. Do checks twice, first * without privilege, and then if a match is found but failed, * a second time with privilege. */ /* * Check ACL_USER ACL entries. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_USER: if (acl->acl_entry[i].ae_id != cred->cr_uid) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) goto error; if (privused != NULL) *privused = 1; return (0); } } /* * Group match is best-match, not first-match, so find a * "best" match. Iterate across, testing each potential group * match. Make sure we keep track of whether we found a match * or not, so that we know if we should try again with any * available privilege, or if we should move on to ACL_OTHER. */ group_matched = 0; for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & dac_granted) == acc_mode) return (0); group_matched = 1; break; default: } } if (group_matched == 1) { /* * There was a match, but it did not grant rights via * pure DAC. Try again, this time with privilege. 
*/ for (i = 0; i < acl->acl_cnt; i++) { switch (acl->acl_entry[i].ae_tag) { case ACL_GROUP_OBJ: if (!groupmember(file_gid, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); case ACL_GROUP: if (!groupmember(acl->acl_entry[i].ae_id, cred)) break; dac_granted = 0; if (acl->acl_entry[i].ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl->acl_entry[i].ae_perm & ACL_READ) dac_granted |= VREAD; if (acl->acl_entry[i].ae_perm & ACL_WRITE) dac_granted |= VWRITE; dac_granted &= acl_mask_granted; if ((acc_mode & (dac_granted | cap_granted)) != acc_mode) break; if (privused != NULL) *privused = 1; return (0); default: } } /* * Even with privilege, group membership was not sufficient. * Return failure. */ goto error; } /* * Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER. */ dac_granted = 0; if (acl_other->ae_perm & ACL_EXECUTE) dac_granted |= VEXEC; if (acl_other->ae_perm & ACL_READ) dac_granted |= VREAD; if (acl_other->ae_perm & ACL_WRITE) dac_granted |= VWRITE; if ((acc_mode & dac_granted) == acc_mode) return (0); if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) { if (privused != NULL) *privused = 1; return (0); } error: return ((acc_mode & VADMIN) ? EPERM : EACCES); } /* * For the purposes of file systems maintaining the _OBJ entries in an * inode with a mode_t field, this routine converts a mode_t entry * to an acl_perm_t. 
*/ acl_perm_t acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode) { acl_perm_t perm = 0; switch(tag) { case ACL_USER_OBJ: if (mode & S_IXUSR) perm |= ACL_EXECUTE; if (mode & S_IRUSR) perm |= ACL_READ; if (mode & S_IWUSR) perm |= ACL_WRITE; return (perm); case ACL_GROUP_OBJ: if (mode & S_IXGRP) perm |= ACL_EXECUTE; if (mode & S_IRGRP) perm |= ACL_READ; if (mode & S_IWGRP) perm |= ACL_WRITE; return (perm); case ACL_OTHER: if (mode & S_IXOTH) perm |= ACL_EXECUTE; if (mode & S_IROTH) perm |= ACL_READ; if (mode & S_IWOTH) perm |= ACL_WRITE; return (perm); default: printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag); return (0); } } /* * Given inode information (uid, gid, mode), return an acl entry of the * appropriate type. */ struct acl_entry acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode) { struct acl_entry acl_entry; acl_entry.ae_tag = tag; acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode); switch(tag) { case ACL_USER_OBJ: acl_entry.ae_id = uid; break; case ACL_GROUP_OBJ: acl_entry.ae_id = gid; break; case ACL_OTHER: acl_entry.ae_id = ACL_UNDEFINED_ID; break; default: acl_entry.ae_id = ACL_UNDEFINED_ID; printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag); } return (acl_entry); } /* * Utility function to generate a file mode given appropriate ACL entries. 
*/ mode_t acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry, struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry) { mode_t mode; mode = 0; if (acl_user_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXUSR; if (acl_user_obj_entry->ae_perm & ACL_READ) mode |= S_IRUSR; if (acl_user_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWUSR; if (acl_group_obj_entry->ae_perm & ACL_EXECUTE) mode |= S_IXGRP; if (acl_group_obj_entry->ae_perm & ACL_READ) mode |= S_IRGRP; if (acl_group_obj_entry->ae_perm & ACL_WRITE) mode |= S_IWGRP; if (acl_other_entry->ae_perm & ACL_EXECUTE) mode |= S_IXOTH; if (acl_other_entry->ae_perm & ACL_READ) mode |= S_IROTH; if (acl_other_entry->ae_perm & ACL_WRITE) mode |= S_IWOTH; return (mode); } /* * Perform a syntactic check of the ACL, sufficient to allow an * implementing file system to determine if it should accept this and * rely on the POSIX.1e ACL properties. */ int acl_posix1e_check(struct acl *acl) { int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group; int num_acl_mask, num_acl_other, i; /* * Verify that the number of entries does not exceed the maximum * defined for acl_t. * Verify that the correct number of various sorts of ae_tags are * present: * Exactly one ACL_USER_OBJ * Exactly one ACL_GROUP_OBJ * Exactly one ACL_OTHER * If any ACL_USER or ACL_GROUP entries appear, then exactly one * ACL_MASK entry must also appear. * Verify that all ae_perm entries are in ACL_PERM_BITS. * Verify all ae_tag entries are understood by this implementation. * Note: Does not check for uniqueness of qualifier (ae_id) field. */ num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group = num_acl_mask = num_acl_other = 0; if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0) return (EINVAL); for (i = 0; i < acl->acl_cnt; i++) { /* * Check for a valid tag. 
*/ switch(acl->acl_entry[i].ae_tag) { case ACL_USER_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_user_obj++; break; case ACL_GROUP_OBJ: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_group_obj++; break; case ACL_USER: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_user++; break; case ACL_GROUP: if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID) return (EINVAL); num_acl_group++; break; case ACL_OTHER: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_other++; break; case ACL_MASK: acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */ if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID) return (EINVAL); num_acl_mask++; break; default: return (EINVAL); } /* * Check for valid perm entries. */ if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) != ACL_PERM_BITS) return (EINVAL); } if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) || (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1)) return (EINVAL); if (((num_acl_group != 0) || (num_acl_user != 0)) && (num_acl_mask != 1)) return (EINVAL); return (0); } /* * These calls wrap the real vnode operations, and are called by the * syscall code once the syscall has converted the path or file * descriptor to a vnode (unlocked). The aclp pointer is assumed * still to point to userland, so this should not be consumed within * the kernel except by syscall code. Other code should directly * invoke VOP_{SET,GET}ACL. */ /* * Given a vnode, set its ACL. 
*/ static int vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernacl; int error; error = copyin(aclp, &inkernacl, sizeof(struct acl)); if (error) return(error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, type, &inkernacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return(error); } /* * Given a vnode, get its ACL. */ static int vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_GETACL(vp, type, &inkernelacl, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); if (error == 0) error = copyout(&inkernelacl, aclp, sizeof(struct acl)); return (error); } /* * Given a vnode, delete its ACL. */ static int vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type) { int error; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETACL(vp, ACL_TYPE_DEFAULT, 0, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); return (error); } /* * Given a vnode, check whether an ACL is appropriate for it */ static int vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type, struct acl *aclp) { struct acl inkernelacl; int error; error = copyin(aclp, &inkernelacl, sizeof(struct acl)); if (error) return(error); error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_proc->p_ucred, td); return (error); } /* * syscalls -- convert the path/fd to a vnode, and call vacl_whatever. * Don't need to lock, as the vacl_ code will get/release any locks * required. 
 */

/*
 * Given a file path, get an ACL for it
 *
 * MPSAFE
 */
int
__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	/* Resolve the user path (following symlinks) to a vnode. */
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		/* aclp is still a userland pointer; vacl_get_acl copies out. */
		error = vacl_get_acl(td, nd.ni_vp, SCARG(uap, type),
		    SCARG(uap, aclp));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file path, set an ACL for it
 *
 * MPSAFE
 */
int
__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		error = vacl_set_acl(td, nd.ni_vp, SCARG(uap, type),
		    SCARG(uap, aclp));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, get an ACL for it
 *
 * MPSAFE
 */
int
__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	/* getvnode() verifies the fd refers to a vnode and takes a reference. */
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_get_acl(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type), SCARG(uap, aclp));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, set an ACL for it
 *
 * MPSAFE
 */
int
__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_set_acl(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type), SCARG(uap, aclp));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file path, delete an ACL from it.
 *
 * MPSAFE
 */
int
__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		error = vacl_delete(td, nd.ni_vp, SCARG(uap, type));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, delete an ACL from the underlying vnode.
 *
 * MPSAFE
 */
int
__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	/* getvnode() verifies the fd refers to a vnode and takes a reference. */
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_delete(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file path, check an ACL for it
 *
 * MPSAFE
 */
int
__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
{
	struct nameidata nd;
	int error;

	mtx_lock(&Giant);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	error = namei(&nd);
	if (error == 0) {
		error = vacl_aclcheck(td, nd.ni_vp, SCARG(uap, type),
		    SCARG(uap, aclp));
		NDFREE(&nd, 0);
	}
	mtx_unlock(&Giant);
	return (error);
}

/*
 * Given a file descriptor, check an ACL for it
 *
 * MPSAFE
 */
int
__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
{
	struct file *fp;
	int error;

	mtx_lock(&Giant);
	error = getvnode(td->td_proc->p_fd, SCARG(uap, filedes), &fp);
	if (error == 0) {
		error = vacl_aclcheck(td, (struct vnode *)fp->f_data,
		    SCARG(uap, type), SCARG(uap, aclp));
		/* Release the file reference taken by getvnode(). */
		fdrop(fp, td);
	}
	mtx_unlock(&Giant);
	return (error);
}
Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c	(revision 89305)
+++ head/sys/kern/vfs_cache.c	(revision 89306)
@@ -1,868 +1,885 @@
/*
 * Copyright (c) 1989, 1993, 1995
 *	The Regents of the University of California.  All rights reserved.
* * This code is derived from software contributed to Berkeley by * Poul-Henning Kamp of the FreeBSD Project. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * This structure describes the elements in the cache of recent * names looked up by namei. */ struct namecache { LIST_ENTRY(namecache) nc_hash; /* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ struct vnode *nc_vp; /* vnode the name refers to */ u_char nc_flag; /* flag bits */ u_char nc_nlen; /* length of name */ char nc_name[0]; /* segment name */ }; /* * Name caching works as follows: * * Names found by directory scans are retained in a cache * for future reference. It is managed LRU, so frequently * used names will hang around. Cache is indexed by hash value * obtained from (vp, name) where vp refers to the directory * containing name. * * If it is a "negative" entry, (i.e. for a name that is known NOT to * exist) the vnode pointer will be NULL. * * Upon reaching the last segment of a path, if the reference * is for DELETE, or NOCACHE is set (rewrite), and the * name is located in the cache, it will be dropped. */ /* * Structures associated with name cacheing. 
*/ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ static u_long nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); static u_long ncnegfactor = 16; /* ratio of negative entries */ SYSCTL_ULONG(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); static u_long numneg; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); static u_long numcache; /* number of cache entries allocated */ SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); static u_long numcachehv; /* number of cache entries with vnodes held */ SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, ""); #if 0 static u_long numcachepl; /* number of cache purge for leaf entries */ SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, ""); #endif struct nchstats nchstats; /* cache effectiveness statistics */ static int doingcache = 1; /* 1 => enable the cache */ SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), ""); SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), ""); /* * The new name cache statistics */ SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); #define STATNODE(mode, name, var) \ SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); STATNODE(CTLFLAG_RD, numneg, &numneg); STATNODE(CTLFLAG_RD, numcache, &numcache); static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); 
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap);
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps);
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits);
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps);
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits);

SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD, &nchstats,
    sizeof(nchstats), "LU", "VFS cache effectiveness statistics");

static void cache_zap __P((struct namecache *ncp));

static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");

/*
 * Flags in namecache.nc_flag
 */
#define NCF_WHITE	1

/*
 * Grab an atomic snapshot of the name cache hash chain lengths
 */
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");

/*
 * Report the length of every nchash chain as an array of ints, one
 * per bucket.
 */
static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count;

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	/* Size probe: report how much space the caller should allocate. */
	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		error = SYSCTL_OUT(req, (caddr_t)&count, sizeof(count));
		if (error)
			return (error);
	}
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_debug_hashstat_rawnchash, "S,int", "nchash chain lengths");

/*
 * Report summary statistics about the hash table: bucket count, number
 * of buckets in use, longest chain, and percent utilization (x100).
 */
static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
	int error;
	struct nchashhead *ncpp;
	struct namecache *ncp;
	int n_nchash;
	int count, maxlength, used, pct;

	/* Size probe: four ints are returned. */
	if (!req->oldptr)
		return SYSCTL_OUT(req, 0, 4 * sizeof(int));

	n_nchash = nchash + 1;	/* nchash is max index, not count */
	used = 0;
	maxlength = 0;

	/* Scan hash tables for applicable entries */
	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
		count = 0;
		LIST_FOREACH(ncp, ncpp, nc_hash) {
			count++;
		}
		if (count)
			used++;
		if (maxlength < count)
			maxlength = count;
	}
	n_nchash = nchash + 1;	/* restore: consumed as loop counter above */
	/* Utilization in hundredths of a percent. */
	pct = (used * 100 * 100) / n_nchash;
	error = SYSCTL_OUT(req, (caddr_t)&n_nchash, sizeof(n_nchash));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&used, sizeof(used));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&maxlength, sizeof(maxlength));
	if (error)
		return (error);
	error = SYSCTL_OUT(req, (caddr_t)&pct, sizeof(pct));
	if (error)
		return (error);
	return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD,
    0, 0, sysctl_debug_hashstat_nchash, "I", "nchash chain lengths");

/*
 * Delete an entry from its hash list and move it to the front
 * of the LRU list for immediate reuse.
 */
static void
cache_zap(ncp)
	struct namecache *ncp;
{
	LIST_REMOVE(ncp, nc_hash);
	LIST_REMOVE(ncp, nc_src);
	/* Drop the hold on the parent when its last child entry goes away. */
	if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
		vdrop(ncp->nc_dvp);
		numcachehv--;
	}
	/* Negative entries (nc_vp == NULL) live on the ncneg LRU instead. */
	if (ncp->nc_vp) {
		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_REMOVE(&ncneg, ncp, nc_dst);
		numneg--;
	}
	numcache--;
	free(ncp, M_VFSCACHE);
}

/*
 * cache_leaf_test()
 *
 *	Test whether this (directory) vnode's namei cache entry contains
 *	subdirectories or not.  Used to determine whether the directory is
 *	a leaf in the namei cache or not.  Note: the directory may still
 *	contain files in the namei cache.
 *
 *	Returns 0 if the directory is a leaf, -1 if it isn't.
 */
int
cache_leaf_test(struct vnode *vp)
{
	struct namecache *ncpc;

	for (ncpc = LIST_FIRST(&vp->v_cache_src);
	     ncpc != NULL;
	     ncpc = LIST_NEXT(ncpc, nc_src)
	) {
		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR)
			return(-1);
	}
	return(0);
}

/*
 * Lookup an entry in the cache
 *
 * We don't do this if the segment name is long, simply so the cache
 * can avoid holding long names (which would either waste space, or
 * add greatly to the complexity).
 *
 * Lookup is called with dvp pointing to the directory to search,
 * cnp pointing to the name of the entry being sought.
 * If the lookup
 * succeeds, the vnode is returned in *vpp, and a status of -1 is
 * returned.  If the lookup determines that the name does not exist
 * (negative cacheing), a status of ENOENT is returned.  If the lookup
 * fails, a status of zero is returned.
 */
int
cache_lookup(dvp, vpp, cnp)
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	u_int32_t hash;

	if (!doingcache) {
		cnp->cn_flags &= ~MAKEENTRY;
		return (0);
	}

	numcalls++;

	/* "." and ".." are handled specially, without a hash lookup. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			*vpp = dvp;
			dothits++;
			return (-1);
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			dotdothits++;
			/* The v_dd link is stale if the v_id no longer matches. */
			if (dvp->v_dd->v_id != dvp->v_ddid ||
			    (cnp->cn_flags & MAKEENTRY) == 0) {
				dvp->v_ddid = 0;
				return (0);
			}
			*vpp = dvp->v_dd;
			return (-1);
		}
	}

	/* Hash over both the component name and the parent's capability id. */
	hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
		numchecks++;
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}

	/* We failed to find an entry */
	if (ncp == 0) {
		if ((cnp->cn_flags & MAKEENTRY) == 0) {
			nummisszap++;
		} else {
			nummiss++;
		}
		nchstats.ncs_miss++;
		return (0);
	}

	/* We don't want to have an entry, so dump it */
	if ((cnp->cn_flags & MAKEENTRY) == 0) {
		numposzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	/* We found a "positive" match, return the vnode */
	if (ncp->nc_vp) {
		numposhits++;
		nchstats.ncs_goodhits++;
		*vpp = ncp->nc_vp;
		return (-1);
	}

	/* We found a negative match, and want to create it, so purge */
	if (cnp->cn_nameiop == CREATE) {
		numnegzaps++;
		nchstats.ncs_badhits++;
		cache_zap(ncp);
		return (0);
	}

	numneghits++;
	/*
	 * We found a "negative" match, ENOENT notifies client of this match.
	 * The nc_vpid field records whether this is a whiteout.
	 * Move the entry to the tail of the negative LRU so it is retained.
	 */
	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	nchstats.ncs_neghits++;
	if (ncp->nc_flag & NCF_WHITE)
		cnp->cn_flags |= ISWHITEOUT;
	return (ENOENT);
}

/*
 * Add an entry to the cache.
 */
void
cache_enter(dvp, vp, cnp)
	struct vnode *dvp;
	struct vnode *vp;
	struct componentname *cnp;
{
	struct namecache *ncp;
	struct nchashhead *ncpp;
	u_int32_t hash;
	int len;

	if (!doingcache)
		return;

	/* "." is never cached; ".." is recorded in v_dd/v_ddid instead. */
	if (cnp->cn_nameptr[0] == '.') {
		if (cnp->cn_namelen == 1) {
			return;
		}
		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
			if (vp) {
				dvp->v_dd = vp;
				dvp->v_ddid = vp->v_id;
			} else {
				dvp->v_dd = dvp;
				dvp->v_ddid = 0;
			}
			return;
		}
	}

	/* The name is stored inline after the struct (nc_name[0]). */
	ncp = (struct namecache *)
		malloc(sizeof *ncp + cnp->cn_namelen, M_VFSCACHE, M_WAITOK);
	bzero((char *)ncp, sizeof *ncp);
	numcache++;
	if (!vp) {
		numneg++;
		ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0;
	} else if (vp->v_type == VDIR) {
		/* Record the ".." back-link in the child directory. */
		vp->v_dd = dvp;
		vp->v_ddid = dvp->v_id;
	}

	/*
	 * Fill in cache info, if vp is NULL this is a "negative" cache entry.
	 * For negative entries, we have to record whether it is a whiteout.
	 * the whiteout flag is stored in the nc_vpid field which is
	 * otherwise unused.
	 */
	ncp->nc_vp = vp;
	ncp->nc_dvp = dvp;
	len = ncp->nc_nlen = cnp->cn_namelen;
	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
	bcopy(cnp->cn_nameptr, ncp->nc_name, len);
	hash = fnv_32_buf(&dvp->v_id, sizeof(dvp->v_id), hash);
	ncpp = NCHHASH(hash);
	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
	/* First child entry for this parent: hold the parent vnode. */
	if (LIST_EMPTY(&dvp->v_cache_src)) {
		vhold(dvp);
		numcachehv++;
	}
	LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
	if (vp) {
		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
	} else {
		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
	}
	/* Keep negative entries bounded to 1/ncnegfactor of the cache. */
	if (numneg * ncnegfactor > numcache) {
		ncp = TAILQ_FIRST(&ncneg);
		cache_zap(ncp);
	}
}

/*
 * Name cache initialization, from vfs_init() when we are booting
 */
static void
nchinit(void *dummy __unused)
{

	TAILQ_INIT(&ncneg);
	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL)


/*
 * Invalidate all entries to a particular vnode.
 *
 * Remove all entries in the namecache relating to this vnode and
 * change the v_id.  We take the v_id from a global counter, since
 * it becomes a handy sequence number in crash-dumps that way.
 * No valid vnode will ever have (v_id == 0).
 *
 * XXX: Only time and the size of v_id prevents this from failing:
 * XXX: In theory we should hunt down all (struct vnode*, v_id)
 * XXX: soft references and nuke them, at least on the global
 * XXX: v_id wraparound.  The period of resistance can be extended
 * XXX: by incrementing each vnodes v_id individually instead of
 * XXX: using the global v_id.
 */
void
cache_purge(vp)
	struct vnode *vp;
{
	static u_long nextid;

	while (!LIST_EMPTY(&vp->v_cache_src))
		cache_zap(LIST_FIRST(&vp->v_cache_src));
	while (!TAILQ_EMPTY(&vp->v_cache_dst))
		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));

	/* Pick a new capability number, skipping 0 and the current value. */
	do
		nextid++;
	while (nextid == vp->v_id || !nextid);
	vp->v_id = nextid;
	vp->v_dd = vp;
	vp->v_ddid = 0;
}

/*
 * Flush all entries referencing a particular filesystem.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 */
void
cache_purgevfs(mp)
	struct mount *mp;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
		/* nnp is fetched first because cache_zap() frees ncp. */
		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp->v_mount == mp) {
				cache_zap(ncp);
			}
		}
	}
}

#if 0
/*
 * Flush all directory entries with no child directories held in
 * the cache.
 *
 * Since we need to check it anyway, we will flush all the invalid
 * entries at the same time.
 */
void
cache_purgeleafdirs(ndir)
	int ndir;
{
	struct nchashhead *ncpp;
	struct namecache *ncp, *nnp, *ncpc, *nnpc;
	struct vnode *dvp;

	/* Scan hash tables for applicable entries */
	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl && ndir > 0;
	    ncpp--) {
		for (ncp = LIST_FIRST(ncpp); ncp != 0 && ndir > 0; ncp = nnp) {
			nnp = LIST_NEXT(ncp, nc_hash);
			if (ncp->nc_dvp != 0) {
				/*
				 * Skip over if nc_dvp of this cache holds
				 * a child directory, or the hold count of
				 * nc_dvp is greater than 1 (in which case
				 * nc_dvp is likely to be the working
				 * directory of a process).
				 */
				if (ncp->nc_dvp->v_holdcnt > 1)
					continue;
				for (ncpc = LIST_FIRST(&ncp->nc_dvp->v_cache_src);
				    ncpc != 0; ncpc = nnpc) {
					nnpc = LIST_NEXT(ncpc, nc_src);
					if (ncpc->nc_vp != 0 &&
					    ncpc->nc_vp->v_type == VDIR)
						break;
				}
				if (ncpc == 0) {
					/*
					 * Zap all of this directory's children,
					 * held in ncp->nc_dvp->v_cache_src.
					 */
					dvp = ncp->nc_dvp;
					while (!LIST_EMPTY(&dvp->v_cache_src))
						cache_zap(LIST_FIRST(&dvp->v_cache_src));

					ndir--;

					/* Restart in case where nnp is reclaimed. */
					nnp = LIST_FIRST(ncpp);
					continue;
				}
			}
		}
	}
	numcachepl++;
}
#endif

/*
 * Perform canonical checks and cache lookup and pass on to filesystem
 * through the vop_cachedlookup only if needed.
 */

int
vfs_cache_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct vnode *dvp, *vp;
	int lockparent;
	int error;
	struct vnode **vpp = ap->a_vpp;
	struct componentname *cnp = ap->a_cnp;
	struct ucred *cred = cnp->cn_cred;
	int flags = cnp->cn_flags;
	struct thread *td = cnp->cn_thread;
	u_long vpid;	/* capability number of vnode */

	*vpp = NULL;
	dvp = ap->a_dvp;
	lockparent = flags & LOCKPARENT;

	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/* Refuse modifying lookups on read-only filesystems up front. */
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);

	error = VOP_ACCESS(dvp, VEXEC, cred, td);

	if (error)
		return (error);

	error = cache_lookup(dvp, vpp, cnp);

	/* cache_lookup: 0 = miss, ENOENT = negative hit, -1 = positive hit. */
	if (!error)
		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));

	if (error == ENOENT)
		return (error);

	vp = *vpp;
	vpid = vp->v_id;
	cnp->cn_flags &= ~PDIRUNLOCK;
	if (dvp == vp) {   /* lookup on "." */
		VREF(vp);
		error = 0;
	} else if (flags & ISDOTDOT) {
		/* Unlock parent first to observe the locking order for "..". */
		VOP_UNLOCK(dvp, 0, td);
		cnp->cn_flags |= PDIRUNLOCK;
		error = vget(vp, LK_EXCLUSIVE, td);
		if (!error && lockparent && (flags & ISLASTCN)) {
			if ((error = vn_lock(dvp, LK_EXCLUSIVE, td)) == 0)
				cnp->cn_flags &= ~PDIRUNLOCK;
		}
	} else {
		error = vget(vp, LK_EXCLUSIVE, td);
		if (!lockparent || error || !(flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	/*
	 * Check that the capability number did not change
	 * while we were waiting for the lock.
	 */
	if (!error) {
		if (vpid == vp->v_id)
			return (0);
		/* Stale entry: drop the vnode and fall through to re-lookup. */
		vput(vp);
		if (lockparent && dvp != vp && (flags & ISLASTCN)) {
			VOP_UNLOCK(dvp, 0, td);
			cnp->cn_flags |= PDIRUNLOCK;
		}
	}
	if (cnp->cn_flags & PDIRUNLOCK) {
		error = vn_lock(dvp, LK_EXCLUSIVE, td);
		if (error)
			return (error);
		cnp->cn_flags &= ~PDIRUNLOCK;
	}
	return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
}


#ifndef _SYS_SYSPROTO_H_
struct  __getcwd_args {
	u_char	*buf;
	u_int	buflen;
};
#endif

static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, "");

static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls);
static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1);
static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2);
static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3);
static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4);
static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound);

/*
 * Reconstruct the current working directory pathname from the name
 * cache, walking v_dd links from fd_cdir up to fd_rdir/rootvnode and
 * building the path backwards in a temporary buffer.
 */
int
__getcwd(td, uap)
	struct thread *td;
	struct __getcwd_args *uap;
{
	char *bp, *buf;
	int error, i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numcwdcalls++;
	if (disablecwd)
		return (ENODEV);
	if (uap->buflen < 2)
		return (EINVAL);
	if (uap->buflen > MAXPATHLEN)
		uap->buflen = MAXPATHLEN;
	buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK);
	bp += uap->buflen - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	/* Hold the filedesc lock across the walk so fd_cdir/fd_rdir are stable. */
	FILEDESC_LOCK(fdp);
	for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			/* Cross the mount point downwards. */
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		if (vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Prepend this component, then a '/'. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numcwdfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = vp->v_dd;
	}
	FILEDESC_UNLOCK(fdp);
	if (!slash_prefixed) {
		if (bp == buf) {
			numcwdfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	numcwdfound++;
	error = copyout(bp, uap->buf, strlen(bp) + 1);
	free(buf, M_TEMP);
	return (error);
}

/*
 * Thus begins the fullpath magic.
 */

#undef STATNODE
#define STATNODE(name)							\
	static u_int name;						\
	SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, "")

static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW,
    &disablefullpath, 0, "");

STATNODE(numfullpathcalls);
STATNODE(numfullpathfail1);
STATNODE(numfullpathfail2);
STATNODE(numfullpathfail3);
STATNODE(numfullpathfail4);
STATNODE(numfullpathfound);

/*
 * Resolve an arbitrary vnode to a fully qualified path using the name
 * cache.  On success *retbuf points into *freebuf, which the caller
 * must free(M_TEMP).
 */
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
	char *bp, *buf;
	int i, slash_prefixed;
	struct filedesc *fdp;
	struct namecache *ncp;
	struct vnode *vp;

	numfullpathcalls++;
	if (disablefullpath)
		return (ENODEV);
	if (vn == NULL)
		return (EINVAL);
	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
	bp = buf + MAXPATHLEN - 1;
	*bp = '\0';
	fdp = td->td_proc->p_fd;
	slash_prefixed = 0;
	/* Hold the filedesc lock so fd_rdir is stable during the walk. */
	FILEDESC_LOCK(fdp);
	for (vp = vn; vp != fdp->fd_rdir && vp != rootvnode;) {
		if (vp->v_flag & VROOT) {
			if (vp->v_mount == NULL) {	/* forced unmount */
				FILEDESC_UNLOCK(fdp);
				free(buf, M_TEMP);
				return (EBADF);
			}
			vp = vp->v_mount->mnt_vnodecovered;
			continue;
		}
		/* The starting vnode itself need not pass the v_dd check. */
		if (vp != vn && vp->v_dd->v_id != vp->v_ddid) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail1++;
			free(buf, M_TEMP);
			return (ENOTDIR);
		}
		ncp = TAILQ_FIRST(&vp->v_cache_dst);
		if (!ncp) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail2++;
			free(buf, M_TEMP);
			return (ENOENT);
		}
		if (vp != vn && ncp->nc_dvp != vp->v_dd) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail3++;
			free(buf, M_TEMP);
			return (EBADF);
		}
		/* Prepend this component, then a '/'. */
		for (i = ncp->nc_nlen - 1; i >= 0; i--) {
			if (bp == buf) {
				FILEDESC_UNLOCK(fdp);
				numfullpathfail4++;
				free(buf, M_TEMP);
				return (ENOMEM);
			}
			*--bp = ncp->nc_name[i];
		}
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
		slash_prefixed = 1;
		vp = ncp->nc_dvp;
	}
	if (!slash_prefixed) {
		if (bp == buf) {
			FILEDESC_UNLOCK(fdp);
			numfullpathfail4++;
			free(buf, M_TEMP);
			return (ENOMEM);
		}
		*--bp = '/';
	}
	FILEDESC_UNLOCK(fdp);
	numfullpathfound++;
	*retbuf = bp;
	*freebuf = buf;
	return (0);
}
Index: head/sys/kern/vfs_extattr.c
===================================================================
--- head/sys/kern/vfs_extattr.c	(revision 89305)
+++ head/sys/kern/vfs_extattr.c	(revision 89306)
@@ -1,4191 +1,4304 @@
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3.
All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 * $FreeBSD$ */ /* For 4.3 integer FS ID compatibility */ #include "opt_compat.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int change_dir __P((struct nameidata *ndp, struct thread *td)); static void checkdirs __P((struct vnode *olddp, struct vnode *newdp)); static int chroot_refuse_vdir_fds __P((struct filedesc *fdp)); static int getutimes __P((const struct timeval *, struct timespec *)); static int setfown __P((struct thread *td, struct vnode *, uid_t, gid_t)); static int setfmode __P((struct thread *td, struct vnode *, int)); static int setfflags __P((struct thread *td, struct vnode *, int)); static int setutimes __P((struct thread *td, struct vnode *, const struct timespec *, int)); static int vn_access __P((struct vnode *vp, int user_flags, struct ucred *cred, struct thread *td)); static int usermount = 0; /* if 1, non-root can mount fs. */ int (*union_dircheckp) __P((struct thread *td, struct vnode **, struct file *)); SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); /* * Virtual File System System Calls */ #ifndef _SYS_SYSPROTO_H_ struct nmount_args { struct iovec *iovp; unsigned int iovcnt; int flags; }; #endif /* ARGSUSED */ int nmount(td, uap) struct thread *td; struct nmount_args /* { syscallarg(struct iovec *) iovp; syscallarg(unsigned int) iovcnt; syscallarg(int) flags; } */ *uap; { return(EOPNOTSUPP); } /* * Mount a file system. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mount_args { char *type; char *path; int flags; caddr_t data; }; #endif /* ARGSUSED */ int mount(td, uap) struct thread *td; struct mount_args /* { syscallarg(char *) type; syscallarg(char *) path; syscallarg(int) flags; syscallarg(caddr_t) data; } */ *uap; { char *fstype; char *fspath; int error; fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK | M_ZERO); fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK | M_ZERO); /* * vfs_mount() actually takes a kernel string for `type' and * `path' now, so extract them. */ error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL); if (error) goto finish; error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL); if (error) goto finish; error = vfs_mount(td, fstype, fspath, SCARG(uap, flags), SCARG(uap, data)); finish: free(fstype, M_TEMP); free(fspath, M_TEMP); return (error); } /* * vfs_mount(): actually attempt a filesystem mount. * * This routine is designed to be a "generic" entry point for routines * that wish to mount a filesystem. All parameters except `fsdata' are * pointers into kernel space. `fsdata' is currently still a pointer * into userspace. */ int vfs_mount(td, fstype, fspath, fsflags, fsdata) struct thread *td; const char *fstype; char *fspath; int fsflags; void *fsdata; { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; int error, flag = 0, flag2 = 0; struct vattr va; struct nameidata nd; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if ((strlen(fstype) >= MFSNAMELEN - 1) || (strlen(fspath) >= MNAMELEN - 1)) return (ENAMETOOLONG); if (usermount == 0) { error = suser_td(td); if (error) return (error); } /* * Do not allow NFS export by non-root users. 
*/ if (fsflags & MNT_EXPORTED) { error = suser_td(td); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users */ if (suser_xxx(td->td_proc->p_ucred, 0, 0)) fsflags |= MNT_NOSUID | MNT_NODEV; /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; flag = mp->mnt_flag; flag2 = mp->mnt_kern_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only root, or the user that did the original mount is * permitted to update it. */ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); VOP_UNLOCK(vp, 0, td); goto update; } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. 
*/ error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if ((error = vinvalbuf(vp, V_SAVE, td->td_proc->p_ucred, td, 0, 0)) != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { linker_file_t lf; /* Only load modules for root (very important!) */ error = suser_td(td); if (error) { vput(vp); return error; } error = linker_load_file(fstype, &lf); if (error || lf == NULL) { vput(vp); if (lf == NULL) error = ENODEV; return error; } lf->userrefs++; /* lookup again, see if the VFS was loaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { lf->userrefs--; linker_file_unload(lf); vput(vp); return (ENODEV); } } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); /* * Allocate and initialize the filesystem. 
*/ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); TAILQ_INIT(&mp->mnt_nvnodelist); TAILQ_INIT(&mp->mnt_reservedvnlist); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); mp->mnt_stat.f_fstypename[MFSNAMELEN - 1] = '\0'; mp->mnt_vnodecovered = vp; mp->mnt_stat.f_owner = td->td_proc->p_ucred->cr_uid; strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_stat.f_mntonname[MNAMELEN - 1] = '\0'; mp->mnt_iosize_max = DFLTPHYS; VOP_UNLOCK(vp, 0, td); update: /* * Set the mount level flags. */ if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; else if (mp->mnt_flag & MNT_RDONLY) mp->mnt_kern_flag |= MNTK_WANTRDWR; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); if (mp->mnt_flag & MNT_UPDATE) { if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) { mp->mnt_flag = flag; mp->mnt_kern_flag = flag2; } if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); vrele(vp); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Put the new filesystem on the mount list after root. 
*/ cache_purge(vp); if (!error) { struct vnode *newdp; mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; vp->v_mountedhere = mp; mtx_unlock(&vp->v_interlock); mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); if (VFS_ROOT(mp, &newdp)) panic("mount: lost mount"); checkdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if ((error = VFS_START(mp, 0, td)) != 0) vrele(vp); } else { mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_vfc->vfc_refcount--; vfs_unbusy(mp, td); free((caddr_t)mp, M_MOUNT); vput(vp); } return (error); } /* * Scan all active processes to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new * mount point. */ static void checkdirs(olddp, newdp) struct vnode *olddp, *newdp; { struct filedesc *fdp; struct proc *p; if (olddp->v_usecount == 1) return; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { fdp = p->p_fd; if (fdp == NULL) continue; + FILEDESC_LOCK(fdp); if (fdp->fd_cdir == olddp) { - vrele(fdp->fd_cdir); VREF(newdp); fdp->fd_cdir = newdp; + FILEDESC_UNLOCK(fdp); + vrele(olddp); + FILEDESC_LOCK(fdp); } if (fdp->fd_rdir == olddp) { - vrele(fdp->fd_rdir); VREF(newdp); fdp->fd_rdir = newdp; - } + FILEDESC_UNLOCK(fdp); + vrele(olddp); + } else + FILEDESC_UNLOCK(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); VREF(newdp); rootvnode = newdp; } } /* * Unmount a file system. * * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). 
*/ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { register struct vnode *vp; struct mount *mp; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); mp = vp->v_mount; /* * Only root, or the user that did the original mount is * permitted to unmount this filesystem. */ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } /* * Don't allow unmounting the root file system. */ if (mp->mnt_flag & MNT_ROOTFS) { vput(vp); return (EINVAL); } /* * Must be the root of the filesystem */ if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } vput(vp); return (dounmount(mp, SCARG(uap, flags), td)); } /* * Do the actual file system unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; mtx_lock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_UNMOUNT; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); if (error) { mp->mnt_kern_flag &= ~MNTK_UNMOUNT; if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &=~ MNT_ASYNC; cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* Move process cdir/rdir refs on fs root to underlying vnode. 
*/ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); if (error) { /* Undo cdir/rdir and rootvnode changes made above. */ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) (void) vfs_allocate_syncvnode(mp); mtx_lock(&mountlist_mtx); mp->mnt_kern_flag &= ~MNTK_UNMOUNT; mp->mnt_flag |= async_flag; lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); if ((coveredvp = mp->mnt_vnodecovered) != NULL) coveredvp->v_mountedhere = NULL; mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) panic("unmount: dangling vnode"); lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); lockdestroy(&mp->mnt_lock); if (coveredvp != NULL) vrele(coveredvp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); free((caddr_t)mp, M_MOUNT); return (0); } /* * Sync each mounted filesystem. 
*/ #ifndef _SYS_SYSPROTO_H_ struct sync_args { int dummy; }; #endif #ifdef DEBUG static int syncprt = 0; SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* ARGSUSED */ int sync(td, uap) struct thread *td; struct sync_args *uap; { struct mount *mp, *nmp; int asyncflag; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if ((mp->mnt_flag & MNT_RDONLY) == 0 && vn_start_write(NULL, &mp, V_NOWAIT) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; vfs_msync(mp, MNT_NOWAIT); VFS_SYNC(mp, MNT_NOWAIT, ((td != NULL) ? td->td_proc->p_ucred : NOCRED), td); mp->mnt_flag |= asyncflag; vn_finished_write(mp); } mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); #if 0 /* * XXX don't call vfs_bufstats() yet because that routine * was not imported in the Lite2 merge. */ #ifdef DIAGNOSTIC if (syncprt) vfs_bufstats(); #endif /* DIAGNOSTIC */ #endif return (0); } /* XXX PRISON: could be per prison flag */ static int prison_quotas; #if 0 SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, ""); #endif /* * Change filesystem quotas. 
*/ #ifndef _SYS_SYSPROTO_H_ struct quotactl_args { char *path; int cmd; int uid; caddr_t arg; }; #endif /* ARGSUSED */ int quotactl(td, uap) struct thread *td; register struct quotactl_args /* { syscallarg(char *) path; syscallarg(int) cmd; syscallarg(int) uid; syscallarg(caddr_t) arg; } */ *uap; { struct mount *mp; int error; struct nameidata nd; if (jailed(td->td_proc->p_ucred) && !prison_quotas) return (EPERM); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); vrele(nd.ni_vp); if (error) return (error); error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), SCARG(uap, arg), td); vn_finished_write(mp); return (error); } /* * Get filesystem statistics. */ #ifndef _SYS_SYSPROTO_H_ struct statfs_args { char *path; struct statfs *buf; }; #endif /* ARGSUSED */ int statfs(td, uap) struct thread *td; register struct statfs_args /* { syscallarg(char *) path; syscallarg(struct statfs *) buf; } */ *uap; { register struct mount *mp; register struct statfs *sp; int error; struct nameidata nd; struct statfs sb; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); error = VFS_STATFS(mp, sp, td); if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get filesystem statistics. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif /* ARGSUSED */ int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { syscallarg(int) fd; syscallarg(struct statfs *) buf; } */ *uap; { struct file *fp; struct mount *mp; register struct statfs *sp; int error; struct statfs sb; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); if (mp == NULL) return (EBADF); sp = &mp->mnt_stat; error = VFS_STATFS(mp, sp, td); if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { syscallarg(struct statfs *) buf; syscallarg(long) bufsize; syscallarg(int) flags; } */ *uap; { register struct mount *mp, *nmp; register struct statfs *sp; caddr_t sfsp; long count, maxcount, error; maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); sfsp = (caddr_t)SCARG(uap, buf); count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. 
*/ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || (SCARG(uap, flags) & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); return (error); } sfsp += sizeof(*sp); } count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif /* ARGSUSED */ int fchdir(td, uap) struct thread *td; struct fchdir_args /* { syscallarg(int) fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; - struct vnode *vp, *tdp; + struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int error; if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; VREF(vp); + fdrop(fp, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0, 0, td)) continue; error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) break; vput(vp); vp = tdp; } if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); return (0); } /* * Change current working directory (``.''). 
*/ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif /* ARGSUSED */ int chdir(td, uap) struct thread *td; struct chdir_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; + struct thread *td = curthread; int error; int fd; + FILEDESC_LOCK(fdp); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { error = getvnode(fdp, fd, &fp); if (error) continue; vp = (struct vnode *)fp->f_data; + fdrop(fp, td); if (vp->v_type != VDIR) continue; + FILEDESC_UNLOCK(fdp); return(EPERM); } + FILEDESC_UNLOCK(fdp); return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif /* ARGSUSED */ int chroot(td, uap) struct thread *td; struct chroot_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) return (error); + FILEDESC_LOCK(fdp); if (chroot_allow_open_directories == 0 || - (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_rdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; fdp->fd_rdir = nd.ni_vp; if (!fdp->fd_jdir) { fdp->fd_jdir = nd.ni_vp; VREF(fdp->fd_jdir); } + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Common routine for chroot and chdir. */ static int change_dir(ndp, td) register struct nameidata *ndp; struct thread *td; { struct vnode *vp; int error; error = namei(ndp); if (error) return (error); vp = ndp->ni_vp; if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); return (error); } /* * Check permissions, allocate an open file structure, * and call the device open routine if any. 
*/ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ *uap; { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode, flags, oflags; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; oflags = SCARG(uap, flags); if ((oflags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(oflags); error = falloc(td, &nfp, &indx); if (error) return (error); fp = nfp; + FILEDESC_LOCK(fdp); cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); td->td_dupfd = -indx - 1; /* XXX check for fdopen */ /* * Bump the ref count to prevent another process from closing * the descriptor while we are blocked in vn_open() */ fhold(fp); error = vn_open(&nd, &flags, cmode); if (error) { /* * release our own reference */ fdrop(fp, td); /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * There should be 2 references on the file, one from the descriptor * table, and one for us. 
* * Handle the case where someone closed the file (via its file * descriptor) while we were blocked. The end result should look * like opening the file succeeded but it was immediately closed. */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); if (fp->f_count == 1) { KASSERT(fdp->fd_ofiles[indx] != fp, ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); vn_close(vp, flags & FMASK, fp->f_cred, td); fdrop(fp, td); td->td_retval[0] = indx; return 0; } fp->f_data = (caddr_t)vp; fp->f_flag = flags & FMASK; fp->f_ops = &vnops; fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETATTR(vp, &vat, p->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } /* assert that vn_open created a backing object if one is needed */ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, ("open: vmio vnode has no backing object after vn_open")); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } - fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); return (error); } #ifdef COMPAT_43 /* * Create a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, mode) = SCARG(uap, mode); SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; return (open(td, &nuap)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif /* ARGSUSED */ int mknod(td, uap) struct thread *td; register struct mknod_args /* { syscallarg(char *) path; syscallarg(int) mode; syscallarg(int) dev; } */ *uap; { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; switch (SCARG(uap, mode) & S_IFMT) { case S_IFCHR: case S_IFBLK: error = suser_td(td); break; default: error = suser_xxx(0, td->td_proc, PRISON_ROOT); break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { vrele(vp); error = EEXIST; } else { VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); vattr.va_rdev = SCARG(uap, dev); whiteout = 0; switch (SCARG(uap, mode) & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: error = EINVAL; break; } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } if (!error) { 
VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Make a hard file link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif /* ARGSUSED */ int link(td, uap) struct thread *td; register struct link_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct vnode *vp; struct mount *mp; struct nameidata nd; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { vrele(nd.ni_vp); error = EEXIST; } else { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); } vrele(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); } /* * Make a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif /* ARGSUSED */ int symlink(td, uap) struct thread *td; register struct symlink_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct mount *mp; struct vattr vattr; char *path; int error; struct nameidata nd; path = zalloc(namei_zone); if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) goto out; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) vput(nd.ni_vp); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: zfree(namei_zone, path); return (error); } /* * Delete a whiteout from the filesystem. 
*/ /* ARGSUSED */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { syscallarg(char *) path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp) vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif /* ARGSUSED */ int unlink(td, uap) struct thread *td; struct unlink_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? 
*/ if (vp->v_flag & VROOT) error = EBUSY; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vput(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ *uap; { struct ucred *cred = td->td_proc->p_ucred; - register struct filedesc *fdp = td->td_proc->p_fd; register struct file *fp; - struct vattr vattr; struct vnode *vp; + struct vattr vattr; off_t offset; int error, noneg; - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return (EBADF); - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); return (ESPIPE); + } vp = (struct vnode *)fp->f_data; noneg = (vp->v_type != VCHR); offset = SCARG(uap, offset); switch (SCARG(uap, whence)) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) return (EOVERFLOW); offset += fp->f_offset; break; case L_XTND: error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) return (EOVERFLOW); offset += vattr.va_size; break; case L_SET: break; default: + fdrop(fp, td); 
return (EINVAL); } if (noneg && offset < 0) return (EINVAL); fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); return (0); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { syscallarg(int) fd; syscallarg(long) offset; syscallarg(int) whence; } */ *uap; { struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ nuap; int error; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, offset) = SCARG(uap, offset); SCARG(&nuap, whence) = SCARG(uap, whence); error = lseek(td, &nuap); return (error); } #endif /* COMPAT_43 */ /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct ucred *cred, *tmpcred; register struct vnode *vp; int error; struct nameidata nd; cred = td->td_proc->p_ucred; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. 
* * XXX - Depending on how "threads" are finally implemented, it * may be better to explicitly pass the credential to namei() * rather than to modify the potentially shared process structure. */ tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_proc->p_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) goto out1; vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out1: td->td_proc->p_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct nameidata nd; struct vnode *vp; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), td->td_proc->p_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Get file status; this version follows links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int ostat(td, uap) struct thread *td; register struct ostat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int olstat(td, uap) struct thread *td; register struct olstat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct vnode *vp; struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. 
*/ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int stat(td, uap) struct thread *td; register struct stat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { struct stat sb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_stat(nd.ni_vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Get file status; this version does not follow links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int lstat(td, uap) struct thread *td; register struct lstat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Implementation of the NetBSD stat() function. * XXX This should probably be collapsed with the FreeBSD version, * as the differences are only due to vn_stat() clearing spares at * the end of the structures. vn_stat could be split to avoid this, * and thus collapse the following to close to zero code. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_qspare[0] = sb->st_qspare[0]; nsb->st_qspare[1] = sb->st_qspare[1]; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif /* ARGSUSED */ int nstat(td, uap) struct thread *td; register struct nstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, 
NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nstat nsb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif /* ARGSUSED */ int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { syscallarg(char *) path; syscallarg(int) name; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif /* ARGSUSED */ int readlink(td, uap) struct thread *td; register struct readlink_args /* { syscallarg(char *) path; syscallarg(char *) buf; syscallarg(int) count; } */ *uap; { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); error = VOP_READLINK(vp, &auio, td->td_proc->p_ucred); } vput(vp); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = suser_xxx(td->td_proc->p_ucred, td->td_proc, PRISON_ROOT); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif /* ARGSUSED */ int chflags(td, uap) struct thread *td; register struct chflags_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); vrele(nd.ni_vp); return error; } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif /* ARGSUSED */ int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { syscallarg(int) fd; syscallarg(int) flags; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Change mode of a file given path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int chmod(td, uap) struct thread *td; register struct chmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif /* ARGSUSED */ int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { syscallarg(int) fd; syscallarg(int) mode; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int chown(td, uap) struct thread *td; register struct chown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int lchown(td, uap) struct thread *td; register struct lchown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif /* ARGSUSED */ int fchown(td, uap) struct thread *td; register struct fchown_args /* { syscallarg(int) fd; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfown(td, (struct vnode *)fp->f_data, + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tsp) const struct timeval *usrtvp; struct timespec *tsp; { struct timeval tv[2]; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) return (error); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); } return 0; } /* * Common implementation code for utimes(), lutimes(), and futimes(). 
*/ static int setutimes(td, vp, ts, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int nullflag; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int utimes(td, uap) struct thread *td; register struct utimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif /* ARGSUSED */ int futimes(td, uap) struct thread *td; register struct futimes_args /* { syscallarg(int ) fd; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct file *fp; struct timeval *usrtvp; int error; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); } /* * Truncate a file given its path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif /* ARGSUSED */ int truncate(td, uap) struct thread *td; register struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; if (uap->length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_proc->p_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); } vput(vp); vn_finished_write(mp); return (error); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif /* ARGSUSED */ int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int error; if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FWRITE) == 0) + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); return (EINVAL); + } vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif /* ARGSUSED */ int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { syscallarg(char *) path; syscallarg(long) length; } */ *uap; { struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, length) = SCARG(uap, length); return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif /* ARGSUSED */ int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { syscallarg(int) fd; syscallarg(long) length; } */ *uap; { struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, length) = SCARG(uap, length); return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif /* ARGSUSED */ int fsync(td, uap) struct thread *td; struct fsync_args /* { syscallarg(int) fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; vm_object_t obj; int error; GIANT_REQUIRED; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, 0); } error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); #ifdef SOFTUPDATES if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) error = softdep_fsync(vp); #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. 
*/ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif /* ARGSUSED */ int rename(td, uap) struct thread *td; register struct rename_args /* { syscallarg(char *) from; syscallarg(char *) to; } */ *uap; { struct mount *mp; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; bwillwrite(); NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, SCARG(uap, from), td); if ((error = namei(&fromnd)) != 0) return (error); fvp = fromnd.ni_vp; if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, UIO_USERSPACE, SCARG(uap, to), td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If source is the same as the destination (that is the * same inode number with the same name in the same directory), * then there is nothing to do. 
*/ if (fvp == tvp && fromnd.ni_dvp == tdvp && fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen)) error = -1; out: if (!error) { VOP_LEASE(tdvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_proc->p_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); } int vn_mkdir(path, mode, segflg, td) char *path; int mode; enum uio_seg segflg; struct thread *td; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif /* ARGSUSED */ int rmdir(td, uap) struct thread *td; struct rmdir_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. 
*/ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_flag & VROOT) { error = EBUSY; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt; long loff; /* XXX arbitrary sanity limit on `count'. 
*/ if (SCARG(uap, count) > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = SCARG(uap, count); MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = SCARG(uap, count) - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. 
*/ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; long loff; int error, eofflag; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, 
LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } if (SCARG(uap, basep) != NULL) { error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); } td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return getdirentries(td, &ap); } /* * Set the mode mask for creation of filesystem nodes. * * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { syscallarg(int) newmask; } */ *uap; { register struct filedesc *fdp; + FILEDESC_LOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem * away from vnode. 
*/ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif /* ARGSUSED */ int revoke(td, uap) struct thread *td; register struct revoke_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td); if (error) goto out; if (td->td_proc->p_ucred->cr_uid != vattr.va_uid) { error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) goto out; } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { + int error; struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) - return (EINVAL); + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } *fpp = fp; - return (0); + return (error); } /* * Get (NFS) file handle */ #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); NDINIT(&nd, 
LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VFS_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into * an open descriptor. * * warning: do not remove the suser() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { syscallarg(const struct fhandle *) u_fhp; syscallarg(int) flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int indx; /* * Must be super user */ error = suser_td(td); if (error) return (error); fmode = FFLAGS(SCARG(uap, flags)); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) return (error); /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. 
*/ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (mode) { error = VOP_ACCESS(vp, mode, p->p_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, p->p_ucred, td); vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, p->p_ucred, td); if (error) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, p->p_ucred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } fp = nfp; /* * Hold an extra reference to avoid having fp ripped out * from under us while we block in the lock op */ fhold(fp); nfp->f_data = (caddr_t)vp; nfp->f_flag = fmode & FMASK; nfp->f_ops = &vnops; nfp->f_type = DTYPE_VNODE; if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. 
*/ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); /* * release our private reference */ fdrop(fp, td); return(error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); fp->f_flag |= FHASLOCK; } if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) vfs_object_create(vp, td, p->p_ucred); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: vput(vp); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { syscallarg(struct fhandle *) u_fhp; syscallarg(struct stat *) sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { syscallarg(struct fhandle) *u_fhp; syscallarg(struct statfs) *buf; } */ *uap; { struct statfs *sp; struct mount *mp; struct vnode *vp; struct statfs sb; fhandle_t fh; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); mp = vp->v_mount; sp = &mp->mnt_stat; vput(vp); if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); } /* * Syscall to push extended attribute configuration information into the * VFS. Accepts a path, which it converts to a mountpoint, as well as * a command (int cmd), and attribute name and misc data. For now, the * attribute name is left in userspace for consumption by the VFS_op. * It will probably be changed to be copied into sysspace by the * syscall in the future, once issues with various consumers of the * attribute code have raised their hands. * * Currently this is used only by UFS Extended Attributes. */ int extattrctl(td, uap) struct thread *td; struct extattrctl_args *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp; char attrname[EXTATTR_MAXNAMELEN]; int error; /* * SCARG(uap, attrname) not always defined. We check again later * when we invoke the VFS call so as to pass in NULL there if needed. 
*/ if (SCARG(uap, attrname) != NULL) { error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } /* * SCARG(uap, filename) not always defined. If it is, grab * a vnode lock, which VFS_EXTATTRCTL() will later release. */ filename_vp = NULL; if (SCARG(uap, filename) != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, filename), td); if ((error = namei(&nd)) != 0) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); } /* SCARG(uap, path) always defined. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); NDFREE(&nd, 0); if (error) { if (filename_vp) vrele(filename_vp); return (error); } if (SCARG(uap, attrname) != NULL) { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), attrname, td); } else { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), NULL, td); } vn_finished_write(mp); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, * filename_vp, so vrele it if it is defined. 
*/ if (filename_vp != NULL) vrele(filename_vp); return (error); } /* * extattr_set_vp(): Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct mount *mp; struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof(struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if 
(error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_set_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_get_vp(): Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt, * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof (struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; 
auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); return (error); } int extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_get_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 
0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, td); vrele(nd.ni_vp); return(error); } int extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args *uap; { struct file *fp; + struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); + vp = (struct vnode *)fp->f_data; error = extattr_delete_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, td); + fdrop(fp, td); return (error); } Index: head/sys/kern/vfs_lookup.c =================================================================== --- head/sys/kern/vfs_lookup.c (revision 89305) +++ head/sys/kern/vfs_lookup.c (revision 89306) @@ -1,747 +1,749 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 * $FreeBSD$ */ #include "opt_ktrace.h" #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/filedesc.h> #ifdef KTRACE #include <sys/ktrace.h> #endif #include <vm/vm_zone.h> /* * Allocation zone for namei */ struct vm_zone *namei_zone; static void nameiinit(void *dummy __unused) { namei_zone = zinit("NAMEI", MAXPATHLEN, 0, 0, 2); } SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL) /* * Convert a pathname into a pointer to a locked inode. * * The FOLLOW flag is set when symbolic links are to be followed * when they occur at the end of the name translation process. * Symbolic links are always followed for all other pathname * components other than the last. * * The segflg defines whether the name is to be copied from user * space or kernel space. * * Overall outline of namei: * * copy in name * get starting directory * while (!done && !error) { * call lookup to search path. * if symbolic link, massage name in buffer and continue * } */ int namei(ndp) register struct nameidata *ndp; { register struct filedesc *fdp; /* pointer to file descriptor state */ register char *cp; /* pointer into pathname argument */ register struct vnode *dp; /* the directory we are searching */ struct iovec aiov; /* uio for reading symbolic links */ struct uio auio; int error, linklen; struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; struct proc *p = td->td_proc; ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_proc->p_ucred; KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc")); KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, ("namei: nameiop contaminated with flags")); KASSERT((cnp->cn_flags & OPMASK) == 0, ("namei: flags contaminated with nameiops")); fdp = p->p_fd; /* * Get a buffer for the name to be translated, and copy the * name into the buffer.
*/ if ((cnp->cn_flags & HASBUF) == 0) cnp->cn_pnbuf = zalloc(namei_zone); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN, (size_t *)&ndp->ni_pathlen); /* * Don't allow empty pathnames. */ if (!error && *cnp->cn_pnbuf == '\0') error = ENOENT; if (error) { zfree(namei_zone, cnp->cn_pnbuf); ndp->ni_vp = NULL; return (error); } ndp->ni_loopcnt = 0; #ifdef KTRACE if (KTRPOINT(p, KTR_NAMEI)) ktrnamei(p->p_tracep, cnp->cn_pnbuf); #endif /* * Get starting point for the translation. */ + FILEDESC_LOCK(fdp); ndp->ni_rootdir = fdp->fd_rdir; ndp->ni_topdir = fdp->fd_jdir; dp = fdp->fd_cdir; VREF(dp); + FILEDESC_UNLOCK(fdp); for (;;) { /* * Check if root directory should replace current directory. * Done at start of translation and after symbolic link. */ cnp->cn_nameptr = cnp->cn_pnbuf; if (*(cnp->cn_nameptr) == '/') { vrele(dp); while (*(cnp->cn_nameptr) == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } dp = ndp->ni_rootdir; VREF(dp); } ndp->ni_startdir = dp; error = lookup(ndp); if (error) { zfree(namei_zone, cnp->cn_pnbuf); return (error); } /* * Check for symbolic link */ if ((cnp->cn_flags & ISSYMLINK) == 0) { if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) zfree(namei_zone, cnp->cn_pnbuf); else cnp->cn_flags |= HASBUF; if (vn_canvmio(ndp->ni_vp) == TRUE && (cnp->cn_nameiop != DELETE) && ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) vfs_object_create(ndp->ni_vp, td, ndp->ni_cnd.cn_cred); return (0); } if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) VOP_UNLOCK(ndp->ni_dvp, 0, td); if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { error = ELOOP; break; } if (ndp->ni_pathlen > 1) cp = zalloc(namei_zone); else cp = cnp->cn_pnbuf; aiov.iov_base = cp; aiov.iov_len = MAXPATHLEN; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = (struct thread *)0; 
auio.uio_resid = MAXPATHLEN; error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); if (error) { if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); break; } linklen = MAXPATHLEN - auio.uio_resid; if (linklen == 0) { if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); error = ENOENT; break; } if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { if (ndp->ni_pathlen > 1) zfree(namei_zone, cp); error = ENAMETOOLONG; break; } if (ndp->ni_pathlen > 1) { bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); zfree(namei_zone, cnp->cn_pnbuf); cnp->cn_pnbuf = cp; } else cnp->cn_pnbuf[linklen] = '\0'; ndp->ni_pathlen += linklen; vput(ndp->ni_vp); dp = ndp->ni_dvp; } zfree(namei_zone, cnp->cn_pnbuf); vrele(ndp->ni_dvp); vput(ndp->ni_vp); ndp->ni_vp = NULL; return (error); } /* * Search a pathname. * This is a very central and rather complicated routine. * * The pathname is pointed to by ni_ptr and is of length ni_pathlen. * The starting directory is taken from ni_startdir. The pathname is * descended until done, or a symbolic link is encountered. The variable * ni_more is clear if the path is completed; it is set to one if a * symbolic link needing interpretation is encountered. * * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on * whether the name is to be looked up, created, renamed, or deleted. * When CREATE, RENAME, or DELETE is specified, information usable in * creating, renaming, or deleting a directory entry may be calculated. * If flag has LOCKPARENT or'ed into it, the parent directory is returned * locked. If flag has WANTPARENT or'ed into it, the parent directory is * returned unlocked. Otherwise the parent directory is not returned. If * the target of the pathname exists and LOCKLEAF is or'ed into the flag * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". 
* * Overall outline of lookup: * * dirloop: * identify next component of name at ndp->ni_ptr * handle degenerate case where name is null string * if .. and crossing mount points and on mounted filesys, find parent * call VOP_LOOKUP routine for next component name * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set * component vnode returned in ni_vp (if it exists), locked. * if result vnode is mounted on and crossing mount points, * find mounted on vnode * if more components of name, do next level at dirloop * return the answer in ni_vp, locked if LOCKLEAF set * if LOCKPARENT set, return locked parent in ni_dvp * if WANTPARENT set, return unlocked parent in ni_dvp */ int lookup(ndp) register struct nameidata *ndp; { register char *cp; /* pointer into pathname argument */ register struct vnode *dp = 0; /* the directory we are searching */ struct vnode *tdp; /* saved dp */ struct mount *mp; /* mount table entry */ int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int trailing_slash; int error = 0; int dpunlocked = 0; /* dp has already been unlocked */ struct componentname *cnp = &ndp->ni_cnd; struct thread *td = cnp->cn_thread; /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE && cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; ndp->ni_dvp = NULL; cnp->cn_flags &= ~ISSYMLINK; dp = ndp->ni_startdir; ndp->ni_startdir = NULLVP; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); dirloop: /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. 
*/ cnp->cn_consume = 0; for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) continue; cnp->cn_namelen = cp - cnp->cn_nameptr; if (cnp->cn_namelen > NAME_MAX) { error = ENAMETOOLONG; goto bad; } #ifdef NAMEI_DIAGNOSTIC { char c = *cp; *cp = '\0'; printf("{%s}: ", cnp->cn_nameptr); *cp = c; } #endif ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; /* * Replace multiple slashes by a single slash and trailing slashes * by a null. This must be done before VOP_LOOKUP() because some * fs's don't know about trailing slashes. Remember if there were * trailing slashes to handle symlinks, existing non-directories * and non-existing files that won't be directories specially later. */ trailing_slash = 0; while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { cp++; ndp->ni_pathlen--; if (*cp == '\0') { trailing_slash = 1; *ndp->ni_next = '\0'; /* XXX for direnter() ... */ } } ndp->ni_next = cp; cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') cnp->cn_flags |= ISDOTDOT; else cnp->cn_flags &= ~ISDOTDOT; if (*ndp->ni_next == 0) cnp->cn_flags |= ISLASTCN; else cnp->cn_flags &= ~ISLASTCN; /* * Check for degenerate name (e.g. / or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (cnp->cn_nameiop != LOOKUP) { error = EISDIR; goto bad; } if (wantparent) { ndp->ni_dvp = dp; VREF(dp); } ndp->ni_vp = dp; if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) VOP_UNLOCK(dp, 0, td); /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } /* * Handle "..": two special cases. * 1. If at root directory (e.g. after chroot) * or at absolute root directory * then ignore it so can't get out. * 2. 
If this vnode is the root of a mounted * filesystem, then replace it with the * vnode which was mounted on so we take the * .. in the other file system. * 3. If the vnode is the top directory of * the jail or chroot, don't let them out. */ if (cnp->cn_flags & ISDOTDOT) { for (;;) { if (dp == ndp->ni_rootdir || dp == ndp->ni_topdir || dp == rootvnode) { ndp->ni_dvp = dp; ndp->ni_vp = dp; VREF(dp); goto nextname; } if ((dp->v_flag & VROOT) == 0 || (cnp->cn_flags & NOCROSSMOUNT)) break; if (dp->v_mount == NULL) { /* forced unmount */ error = EBADF; goto bad; } tdp = dp; dp = dp->v_mount->mnt_vnodecovered; vput(tdp); VREF(dp); vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); } } /* * We now have a segment name to search for, and a directory to search. */ unionlookup: ndp->ni_dvp = dp; ndp->ni_vp = NULL; cnp->cn_flags &= ~PDIRUNLOCK; ASSERT_VOP_LOCKED(dp, "lookup"); if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) { KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); #ifdef NAMEI_DIAGNOSTIC printf("not found\n"); #endif if ((error == ENOENT) && (dp->v_flag & VROOT) && (dp->v_mount != NULL) && (dp->v_mount->mnt_flag & MNT_UNION)) { tdp = dp; dp = dp->v_mount->mnt_vnodecovered; if (cnp->cn_flags & PDIRUNLOCK) vrele(tdp); else vput(tdp); VREF(dp); vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); goto unionlookup; } if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } if (*cp == '\0' && trailing_slash && !(cnp->cn_flags & WILLBEDIR)) { error = ENOENT; goto bad; } /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory inode in ndp->ni_dvp. 
*/ if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } return (0); } #ifdef NAMEI_DIAGNOSTIC printf("found\n"); #endif ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup"); /* * Take into account any additional components consumed by * the underlying filesystem. */ if (cnp->cn_consume > 0) { cnp->cn_nameptr += cnp->cn_consume; ndp->ni_next += cnp->cn_consume; ndp->ni_pathlen -= cnp->cn_consume; cnp->cn_consume = 0; } dp = ndp->ni_vp; /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted file system. */ while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && (cnp->cn_flags & NOCROSSMOUNT) == 0) { if (vfs_busy(mp, 0, 0, td)) continue; VOP_UNLOCK(dp, 0, td); error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) { dpunlocked = 1; goto bad2; } vrele(dp); ndp->ni_vp = dp = tdp; } /* * Check for symbolic link */ if ((dp->v_type == VLNK) && ((cnp->cn_flags & FOLLOW) || trailing_slash || *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; if (dp->v_mount == NULL) { /* We can't know whether the directory was mounted with * NOSYMFOLLOW, so we can't follow safely. */ error = EBADF; goto bad2; } if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { error = EACCES; goto bad2; } return (0); } /* * Check for bogus trailing slashes. */ if (trailing_slash && dp->v_type != VDIR) { error = ENOTDIR; goto bad2; } nextname: /* * Not a symbolic link. If more pathname, * continue at next component, else return. */ if (*ndp->ni_next == '/') { cnp->cn_nameptr = ndp->ni_next; while (*cnp->cn_nameptr == '/') { cnp->cn_nameptr++; ndp->ni_pathlen--; } if (ndp->ni_dvp != ndp->ni_vp) ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup"); vrele(ndp->ni_dvp); goto dirloop; } /* * Disallow directory write attempts on read-only file systems. 
*/ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } if (cnp->cn_flags & SAVESTART) { ndp->ni_startdir = ndp->ni_dvp; VREF(ndp->ni_startdir); } if (!wantparent) vrele(ndp->ni_dvp); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, td); return (0); bad2: if ((cnp->cn_flags & (LOCKPARENT | PDIRUNLOCK)) == LOCKPARENT && *ndp->ni_next == '\0') VOP_UNLOCK(ndp->ni_dvp, 0, td); vrele(ndp->ni_dvp); bad: if (dpunlocked) vrele(dp); else vput(dp); ndp->ni_vp = NULL; return (error); } /* * relookup - lookup a path name component * Used by lookup to re-aquire things. */ int relookup(dvp, vpp, cnp) struct vnode *dvp, **vpp; struct componentname *cnp; { struct thread *td = cnp->cn_thread; struct vnode *dp = 0; /* the directory we are searching */ int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ int error = 0; #ifdef NAMEI_DIAGNOSTIC int newhash; /* DEBUG: check name hash */ char *cp; /* DEBUG: check name ptr/len */ #endif /* * Setup: break out flag bits into variables. */ wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || (wantparent && cnp->cn_nameiop != CREATE)) docache = 0; rdonly = cnp->cn_flags & RDONLY; cnp->cn_flags &= ~ISSYMLINK; dp = dvp; vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, td); /* dirloop: */ /* * Search a new directory. * * The last component of the filename is left accessible via * cnp->cn_nameptr for callers that need the name. Callers needing * the name set the SAVENAME flag. When done, they assume * responsibility for freeing the pathname buffer. */ #ifdef NAMEI_DIAGNOSTIC if (cnp->cn_namelen != cp - cnp->cn_nameptr) panic ("relookup: bad len"); if (*cp != 0) panic("relookup: not last component"); printf("{%s}: ", cnp->cn_nameptr); #endif /* * Check for degenerate name (e.g. 
/ or "") * which is a way of talking about a directory, * e.g. like "/." or ".". */ if (cnp->cn_nameptr[0] == '\0') { if (cnp->cn_nameiop != LOOKUP || wantparent) { error = EISDIR; goto bad; } if (dp->v_type != VDIR) { error = ENOTDIR; goto bad; } if (!(cnp->cn_flags & LOCKLEAF)) VOP_UNLOCK(dp, 0, td); *vpp = dp; /* XXX This should probably move to the top of function. */ if (cnp->cn_flags & SAVESTART) panic("lookup: SAVESTART"); return (0); } if (cnp->cn_flags & ISDOTDOT) panic ("relookup: lookup on dot-dot"); /* * We now have a segment name to search for, and a directory to search. */ if ((error = VOP_LOOKUP(dp, vpp, cnp)) != 0) { KASSERT(*vpp == NULL, ("leaf should be empty")); if (error != EJUSTRETURN) goto bad; /* * If creating and at end of pathname, then can consider * allowing file to be created. */ if (rdonly) { error = EROFS; goto bad; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the * (possibly locked) directory inode in ndp->ni_dvp. */ return (0); } dp = *vpp; /* * Check for symbolic link */ KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), ("relookup: symlink found.\n")); /* * Disallow directory write attempts on read-only file systems. 
*/ if (rdonly && (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { error = EROFS; goto bad2; } /* ASSERT(dvp == ndp->ni_startdir) */ if (cnp->cn_flags & SAVESTART) VREF(dvp); if (!wantparent) vrele(dvp); if (vn_canvmio(dp) == TRUE && ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) vfs_object_create(dp, td, cnp->cn_cred); if ((cnp->cn_flags & LOCKLEAF) == 0) VOP_UNLOCK(dp, 0, td); return (0); bad2: if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) VOP_UNLOCK(dvp, 0, td); vrele(dvp); bad: vput(dp); *vpp = NULL; return (error); } Index: head/sys/kern/vfs_syscalls.c =================================================================== --- head/sys/kern/vfs_syscalls.c (revision 89305) +++ head/sys/kern/vfs_syscalls.c (revision 89306) @@ -1,4191 +1,4304 @@ /* * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 * $FreeBSD$ */ /* For 4.3 integer FS ID compatibility */ #include "opt_compat.h" #include "opt_ffs.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int change_dir __P((struct nameidata *ndp, struct thread *td)); static void checkdirs __P((struct vnode *olddp, struct vnode *newdp)); static int chroot_refuse_vdir_fds __P((struct filedesc *fdp)); static int getutimes __P((const struct timeval *, struct timespec *)); static int setfown __P((struct thread *td, struct vnode *, uid_t, gid_t)); static int setfmode __P((struct thread *td, struct vnode *, int)); static int setfflags __P((struct thread *td, struct vnode *, int)); static int setutimes __P((struct thread *td, struct vnode *, const struct timespec *, int)); static int vn_access __P((struct vnode *vp, int user_flags, 
struct ucred *cred, struct thread *td));

static int usermount = 0;	/* if 1, non-root can mount fs. */

int (*union_dircheckp) __P((struct thread *td, struct vnode **, struct file *));

SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, "");

/*
 * Virtual File System System Calls
 */

#ifndef _SYS_SYSPROTO_H_
struct nmount_args {
	struct iovec *iovp;
	unsigned int iovcnt;
	int flags;
};
#endif
/* ARGSUSED */
/*
 * nmount(2): new-style mount system call.
 * Not yet implemented in this revision; always fails with EOPNOTSUPP.
 */
int
nmount(td, uap)
	struct thread *td;
	struct nmount_args /* {
		syscallarg(struct iovec *) iovp;
		syscallarg(unsigned int) iovcnt;
		syscallarg(int) flags;
	} */ *uap;
{
	return(EOPNOTSUPP);
}

/*
 * Mount a file system.
 */
#ifndef _SYS_SYSPROTO_H_
struct mount_args {
	char *type;
	char *path;
	int flags;
	caddr_t data;
};
#endif
/* ARGSUSED */
/*
 * mount(2): thin user-space shim around vfs_mount().
 * Copies the `type' and `path' strings from user space into
 * kernel buffers, then hands everything to vfs_mount(); `data'
 * is passed through still as a user-space pointer.
 * Returns 0 on success or an errno from copyinstr()/vfs_mount().
 */
int
mount(td, uap)
	struct thread *td;
	struct mount_args /* {
		syscallarg(char *) type;
		syscallarg(char *) path;
		syscallarg(int) flags;
		syscallarg(caddr_t) data;
	} */ *uap;
{
	char *fstype;
	char *fspath;
	int error;

	/* Zeroed so the buffers are NUL-terminated even on short copies. */
	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK | M_ZERO);
	fspath = malloc(MNAMELEN, M_TEMP, M_WAITOK | M_ZERO);

	/*
	 * vfs_mount() actually takes a kernel string for `type' and
	 * `path' now, so extract them.
	 */
	error = copyinstr(SCARG(uap, type), fstype, MFSNAMELEN, NULL);
	if (error)
		goto finish;
	error = copyinstr(SCARG(uap, path), fspath, MNAMELEN, NULL);
	if (error)
		goto finish;
	error = vfs_mount(td, fstype, fspath, SCARG(uap, flags),
	    SCARG(uap, data));
finish:
	/* Temporary kernel copies are always released, success or failure. */
	free(fstype, M_TEMP);
	free(fspath, M_TEMP);
	return (error);
}

/*
 * vfs_mount(): actually attempt a filesystem mount.
 *
 * This routine is designed to be a "generic" entry point for routines
 * that wish to mount a filesystem.  All parameters except `fsdata' are
 * pointers into kernel space.  `fsdata' is currently still a pointer
 * into userspace.
*/ int vfs_mount(td, fstype, fspath, fsflags, fsdata) struct thread *td; const char *fstype; char *fspath; int fsflags; void *fsdata; { struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; int error, flag = 0, flag2 = 0; struct vattr va; struct nameidata nd; /* * Be ultra-paranoid about making sure the type and fspath * variables will fit in our mp buffers, including the * terminating NUL. */ if ((strlen(fstype) >= MFSNAMELEN - 1) || (strlen(fspath) >= MNAMELEN - 1)) return (ENAMETOOLONG); if (usermount == 0) { error = suser_td(td); if (error) return (error); } /* * Do not allow NFS export by non-root users. */ if (fsflags & MNT_EXPORTED) { error = suser_td(td); if (error) return (error); } /* * Silently enforce MNT_NOSUID and MNT_NODEV for non-root users */ if (suser_xxx(td->td_proc->p_ucred, 0, 0)) fsflags |= MNT_NOSUID | MNT_NODEV; /* * Get vnode to be covered */ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (fsflags & MNT_UPDATE) { if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } mp = vp->v_mount; flag = mp->mnt_flag; flag2 = mp->mnt_kern_flag; /* * We only allow the filesystem to be reloaded if it * is currently mounted read-only. */ if ((fsflags & MNT_RELOAD) && ((mp->mnt_flag & MNT_RDONLY) == 0)) { vput(vp); return (EOPNOTSUPP); /* Needs translation */ } /* * Only root, or the user that did the original mount is * permitted to update it. 
*/ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if (vfs_busy(mp, LK_NOWAIT, 0, td)) { vput(vp); return (EBUSY); } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vfs_unbusy(mp, td); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT); VOP_UNLOCK(vp, 0, td); goto update; } /* * If the user is not root, ensure that they own the directory * onto which we are attempting to mount. */ error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td); if (error) { vput(vp); return (error); } if (va.va_uid != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } if ((error = vinvalbuf(vp, V_SAVE, td->td_proc->p_ucred, td, 0, 0)) != 0) { vput(vp); return (error); } if (vp->v_type != VDIR) { vput(vp); return (ENOTDIR); } for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { linker_file_t lf; /* Only load modules for root (very important!) */ error = suser_td(td); if (error) { vput(vp); return error; } error = linker_load_file(fstype, &lf); if (error || lf == NULL) { vput(vp); if (lf == NULL) error = ENODEV; return error; } lf->userrefs++; /* lookup again, see if the VFS was loaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (!strcmp(vfsp->vfc_name, fstype)) break; if (vfsp == NULL) { lf->userrefs--; linker_file_unload(lf); vput(vp); return (ENODEV); } } mtx_lock(&vp->v_interlock); if ((vp->v_flag & VMOUNT) != 0 || vp->v_mountedhere != NULL) { mtx_unlock(&vp->v_interlock); vput(vp); return (EBUSY); } vp->v_flag |= VMOUNT; mtx_unlock(&vp->v_interlock); /* * Allocate and initialize the filesystem. 
*/ mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO); TAILQ_INIT(&mp->mnt_nvnodelist); TAILQ_INIT(&mp->mnt_reservedvnlist); lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); (void)vfs_busy(mp, LK_NOWAIT, 0, td); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, fstype, MFSNAMELEN); mp->mnt_stat.f_fstypename[MFSNAMELEN - 1] = '\0'; mp->mnt_vnodecovered = vp; mp->mnt_stat.f_owner = td->td_proc->p_ucred->cr_uid; strncpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN); mp->mnt_stat.f_mntonname[MNAMELEN - 1] = '\0'; mp->mnt_iosize_max = DFLTPHYS; VOP_UNLOCK(vp, 0, td); update: /* * Set the mount level flags. */ if (fsflags & MNT_RDONLY) mp->mnt_flag |= MNT_RDONLY; else if (mp->mnt_flag & MNT_RDONLY) mp->mnt_kern_flag |= MNTK_WANTRDWR; mp->mnt_flag &=~ MNT_UPDATEMASK; mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE); /* * Mount the filesystem. * XXX The final recipients of VFS_MOUNT just overwrite the ndp they * get. No freeing of cn_pnbuf. */ error = VFS_MOUNT(mp, fspath, fsdata, &nd, td); if (mp->mnt_flag & MNT_UPDATE) { if (mp->mnt_kern_flag & MNTK_WANTRDWR) mp->mnt_flag &= ~MNT_RDONLY; mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT); mp->mnt_kern_flag &=~ MNTK_WANTRDWR; if (error) { mp->mnt_flag = flag; mp->mnt_kern_flag = flag2; } if ((mp->mnt_flag & MNT_RDONLY) == 0) { if (mp->mnt_syncer == NULL) error = vfs_allocate_syncvnode(mp); } else { if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); mp->mnt_syncer = NULL; } vfs_unbusy(mp, td); mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); vrele(vp); return (error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* * Put the new filesystem on the mount list after root. 
*/ cache_purge(vp); if (!error) { struct vnode *newdp; mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; vp->v_mountedhere = mp; mtx_unlock(&vp->v_interlock); mtx_lock(&mountlist_mtx); TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); mtx_unlock(&mountlist_mtx); if (VFS_ROOT(mp, &newdp)) panic("mount: lost mount"); checkdirs(vp, newdp); vput(newdp); VOP_UNLOCK(vp, 0, td); if ((mp->mnt_flag & MNT_RDONLY) == 0) error = vfs_allocate_syncvnode(mp); vfs_unbusy(mp, td); if ((error = VFS_START(mp, 0, td)) != 0) vrele(vp); } else { mtx_lock(&vp->v_interlock); vp->v_flag &= ~VMOUNT; mtx_unlock(&vp->v_interlock); mp->mnt_vfc->vfc_refcount--; vfs_unbusy(mp, td); free((caddr_t)mp, M_MOUNT); vput(vp); } return (error); } /* * Scan all active processes to see if any of them have a current * or root directory of `olddp'. If so, replace them with the new * mount point. */ static void checkdirs(olddp, newdp) struct vnode *olddp, *newdp; { struct filedesc *fdp; struct proc *p; if (olddp->v_usecount == 1) return; sx_slock(&allproc_lock); LIST_FOREACH(p, &allproc, p_list) { fdp = p->p_fd; if (fdp == NULL) continue; + FILEDESC_LOCK(fdp); if (fdp->fd_cdir == olddp) { - vrele(fdp->fd_cdir); VREF(newdp); fdp->fd_cdir = newdp; + FILEDESC_UNLOCK(fdp); + vrele(olddp); + FILEDESC_LOCK(fdp); } if (fdp->fd_rdir == olddp) { - vrele(fdp->fd_rdir); VREF(newdp); fdp->fd_rdir = newdp; - } + FILEDESC_UNLOCK(fdp); + vrele(olddp); + } else + FILEDESC_UNLOCK(fdp); } sx_sunlock(&allproc_lock); if (rootvnode == olddp) { vrele(rootvnode); VREF(newdp); rootvnode = newdp; } } /* * Unmount a file system. * * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). 
*/ #ifndef _SYS_SYSPROTO_H_ struct unmount_args { char *path; int flags; }; #endif /* ARGSUSED */ int unmount(td, uap) struct thread *td; register struct unmount_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { register struct vnode *vp; struct mount *mp; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); mp = vp->v_mount; /* * Only root, or the user that did the original mount is * permitted to unmount this filesystem. */ if (mp->mnt_stat.f_owner != td->td_proc->p_ucred->cr_uid) { error = suser_td(td); if (error) { vput(vp); return (error); } } /* * Don't allow unmounting the root file system. */ if (mp->mnt_flag & MNT_ROOTFS) { vput(vp); return (EINVAL); } /* * Must be the root of the filesystem */ if ((vp->v_flag & VROOT) == 0) { vput(vp); return (EINVAL); } vput(vp); return (dounmount(mp, SCARG(uap, flags), td)); } /* * Do the actual file system unmount. */ int dounmount(mp, flags, td) struct mount *mp; int flags; struct thread *td; { struct vnode *coveredvp, *fsrootvp; int error; int async_flag; mtx_lock(&mountlist_mtx); mp->mnt_kern_flag |= MNTK_UNMOUNT; error = lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK | ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), &mountlist_mtx, td); if (error) { mp->mnt_kern_flag &= ~MNTK_UNMOUNT; if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } vn_start_write(NULL, &mp, V_WAIT); if (mp->mnt_flag & MNT_EXPUBLIC) vfs_setpublicfs(NULL, NULL, NULL); vfs_msync(mp, MNT_WAIT); async_flag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &=~ MNT_ASYNC; cache_purgevfs(mp); /* remove cache entries for this file sys */ if (mp->mnt_syncer != NULL) vrele(mp->mnt_syncer); /* Move process cdir/rdir refs on fs root to underlying vnode. 
*/ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(fsrootvp, mp->mnt_vnodecovered); if (fsrootvp == rootvnode) { vrele(rootvnode); rootvnode = NULL; } vput(fsrootvp); } if (((mp->mnt_flag & MNT_RDONLY) || (error = VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td)) == 0) || (flags & MNT_FORCE)) { error = VFS_UNMOUNT(mp, flags, td); } vn_finished_write(mp); if (error) { /* Undo cdir/rdir and rootvnode changes made above. */ if (VFS_ROOT(mp, &fsrootvp) == 0) { if (mp->mnt_vnodecovered != NULL) checkdirs(mp->mnt_vnodecovered, fsrootvp); if (rootvnode == NULL) { rootvnode = fsrootvp; vref(rootvnode); } vput(fsrootvp); } if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) (void) vfs_allocate_syncvnode(mp); mtx_lock(&mountlist_mtx); mp->mnt_kern_flag &= ~MNTK_UNMOUNT; mp->mnt_flag |= async_flag; lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); return (error); } mtx_lock(&mountlist_mtx); TAILQ_REMOVE(&mountlist, mp, mnt_list); if ((coveredvp = mp->mnt_vnodecovered) != NULL) coveredvp->v_mountedhere = NULL; mp->mnt_vfc->vfc_refcount--; if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) panic("unmount: dangling vnode"); lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_mtx, td); lockdestroy(&mp->mnt_lock); if (coveredvp != NULL) vrele(coveredvp); if (mp->mnt_kern_flag & MNTK_MWAIT) wakeup((caddr_t)mp); free((caddr_t)mp, M_MOUNT); return (0); } /* * Sync each mounted filesystem. 
 */
#ifndef _SYS_SYSPROTO_H_
struct sync_args {
	int dummy;
};
#endif

#ifdef DEBUG
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif

/* ARGSUSED */
/*
 * sync(2): flush dirty data on every mounted filesystem.
 *
 * Walks the global mountlist, and for each mount that can be busied
 * without sleeping, msyncs and VFS_SYNCs it with MNT_NOWAIT (i.e. this
 * is a best-effort, asynchronous flush).  Read-only mounts and mounts
 * with write access suspended are skipped.  Always returns 0.
 */
int
sync(td, uap)
	struct thread *td;
	struct sync_args *uap;
{
	struct mount *mp, *nmp;
	int asyncflag;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		/*
		 * vfs_busy() drops mountlist_mtx (passed as interlock);
		 * on failure just step to the next mount.
		 */
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
			/*
			 * Temporarily clear MNT_ASYNC so the sync is not
			 * itself deferred, then restore the caller-visible
			 * flag afterwards.
			 */
			asyncflag = mp->mnt_flag & MNT_ASYNC;
			mp->mnt_flag &= ~MNT_ASYNC;
			vfs_msync(mp, MNT_NOWAIT);
			VFS_SYNC(mp, MNT_NOWAIT,
			    ((td != NULL) ? td->td_proc->p_ucred : NOCRED), td);
			mp->mnt_flag |= asyncflag;
			vn_finished_write(mp);
		}
		/* Re-take the list lock before advancing and unbusying. */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp, td);
	}
	mtx_unlock(&mountlist_mtx);
#if 0
/*
 * XXX don't call vfs_bufstats() yet because that routine
 * was not imported in the Lite2 merge.
 */
#ifdef DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
#endif
	return (0);
}

/* XXX PRISON: could be per prison flag */
static int prison_quotas;
#if 0
SYSCTL_INT(_kern_prison, OID_AUTO, quotas, CTLFLAG_RW, &prison_quotas, 0, "");
#endif

/*
 * Change filesystem quotas.
 */
#ifndef _SYS_SYSPROTO_H_
struct quotactl_args {
	char *path;
	int cmd;
	int uid;
	caddr_t arg;
};
#endif
/* ARGSUSED */
/*
 * quotactl(2): manipulate filesystem quotas.
 *
 * Looks up `path' to find the mount point, suspends writes on it,
 * and forwards cmd/uid/arg to the filesystem via VFS_QUOTACTL().
 * Jailed processes are refused (EPERM) unless prison_quotas is set.
 */
int
quotactl(td, uap)
	struct thread *td;
	register struct quotactl_args /* {
		syscallarg(char *) path;
		syscallarg(int) cmd;
		syscallarg(int) uid;
		syscallarg(caddr_t) arg;
	} */ *uap;
{
	struct mount *mp;
	int error;
	struct nameidata nd;

	if (jailed(td->td_proc->p_ucred) && !prison_quotas)
		return (EPERM);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	if ((error = namei(&nd)) != 0)
		return (error);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	/*
	 * vn_start_write() records the vnode's mount in `mp'; the vnode
	 * reference itself is no longer needed once that is done.
	 */
	error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
	vrele(nd.ni_vp);
	if (error)
		return (error);
	error = VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
	    SCARG(uap, arg), td);
	vn_finished_write(mp);
	return (error);
}

/*
 * Get filesystem statistics.
 */
#ifndef _SYS_SYSPROTO_H_
struct statfs_args {
	char *path;
	struct statfs *buf;
};
#endif
/* ARGSUSED */
/*
 * statfs(2): return statistics for the filesystem containing `path'.
 *
 * The statfs data is refreshed via VFS_STATFS() and copied out to the
 * user buffer.  For unprivileged callers the filesystem id is zeroed
 * in a local copy before copyout, so the real fsid is never leaked.
 */
int
statfs(td, uap)
	struct thread *td;
	register struct statfs_args /* {
		syscallarg(char *) path;
		syscallarg(struct statfs *) buf;
	} */ *uap;
{
	register struct mount *mp;
	register struct statfs *sp;
	int error;
	struct nameidata nd;
	struct statfs sb;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td);
	if ((error = namei(&nd)) != 0)
		return (error);
	mp = nd.ni_vp->v_mount;
	sp = &mp->mnt_stat;
	NDFREE(&nd, NDF_ONLY_PNBUF);
	vrele(nd.ni_vp);
	error = VFS_STATFS(mp, sp, td);
	if (error)
		return (error);
	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
	/* Non-superuser: hide the fsid by copying out a sanitized copy. */
	if (suser_xxx(td->td_proc->p_ucred, 0, 0)) {
		bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
		sp = &sb;
	}
	return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
}

/*
 * Get filesystem statistics.
*/ #ifndef _SYS_SYSPROTO_H_ struct fstatfs_args { int fd; struct statfs *buf; }; #endif /* ARGSUSED */ int fstatfs(td, uap) struct thread *td; register struct fstatfs_args /* { syscallarg(int) fd; syscallarg(struct statfs *) buf; } */ *uap; { struct file *fp; struct mount *mp; register struct statfs *sp; int error; struct statfs sb; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; + fdrop(fp, td); if (mp == NULL) return (EBADF); sp = &mp->mnt_stat; error = VFS_STATFS(mp, sp, td); if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get statistics on all filesystems. */ #ifndef _SYS_SYSPROTO_H_ struct getfsstat_args { struct statfs *buf; long bufsize; int flags; }; #endif int getfsstat(td, uap) struct thread *td; register struct getfsstat_args /* { syscallarg(struct statfs *) buf; syscallarg(long) bufsize; syscallarg(int) flags; } */ *uap; { register struct mount *mp, *nmp; register struct statfs *sp; caddr_t sfsp; long count, maxcount, error; maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); sfsp = (caddr_t)SCARG(uap, buf); count = 0; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } if (sfsp && count < maxcount) { sp = &mp->mnt_stat; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY * overrides MNT_WAIT. 
*/ if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || (SCARG(uap, flags) & MNT_WAIT)) && (error = VFS_STATFS(mp, sp, td))) { mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); continue; } sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); if (error) { vfs_unbusy(mp, td); return (error); } sfsp += sizeof(*sp); } count++; mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp, td); } mtx_unlock(&mountlist_mtx); if (sfsp && count > maxcount) td->td_retval[0] = maxcount; else td->td_retval[0] = count; return (0); } /* * Change current working directory to a given file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchdir_args { int fd; }; #endif /* ARGSUSED */ int fchdir(td, uap) struct thread *td; struct fchdir_args /* { syscallarg(int) fd; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; - struct vnode *vp, *tdp; + struct vnode *vp, *tdp, *vpold; struct mount *mp; struct file *fp; int error; if ((error = getvnode(fdp, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; VREF(vp); + fdrop(fp, td); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); while (!error && (mp = vp->v_mountedhere) != NULL) { if (vfs_busy(mp, 0, 0, td)) continue; error = VFS_ROOT(mp, &tdp); vfs_unbusy(mp, td); if (error) break; vput(vp); vp = tdp; } if (error) { vput(vp); return (error); } VOP_UNLOCK(vp, 0, td); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vpold = fdp->fd_cdir; fdp->fd_cdir = vp; + FILEDESC_UNLOCK(fdp); + vrele(vpold); return (0); } /* * Change current working directory (``.''). 
*/ #ifndef _SYS_SYSPROTO_H_ struct chdir_args { char *path; }; #endif /* ARGSUSED */ int chdir(td, uap) struct thread *td; struct chdir_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_cdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_cdir; fdp->fd_cdir = nd.ni_vp; + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Helper function for raised chroot(2) security function: Refuse if * any filedescriptors are open directories. */ static int chroot_refuse_vdir_fds(fdp) struct filedesc *fdp; { struct vnode *vp; struct file *fp; + struct thread *td = curthread; int error; int fd; + FILEDESC_LOCK(fdp); for (fd = 0; fd < fdp->fd_nfiles ; fd++) { error = getvnode(fdp, fd, &fp); if (error) continue; vp = (struct vnode *)fp->f_data; + fdrop(fp, td); if (vp->v_type != VDIR) continue; + FILEDESC_UNLOCK(fdp); return(EPERM); } + FILEDESC_UNLOCK(fdp); return (0); } /* * This sysctl determines if we will allow a process to chroot(2) if it * has a directory open: * 0: disallowed for all processes. * 1: allowed for processes that were not already chroot(2)'ed. * 2: allowed for all processes. */ static int chroot_allow_open_directories = 1; SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW, &chroot_allow_open_directories, 0, ""); /* * Change notion of root (``/'') directory. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chroot_args { char *path; }; #endif /* ARGSUSED */ int chroot(td, uap) struct thread *td; struct chroot_args /* { syscallarg(char *) path; } */ *uap; { register struct filedesc *fdp = td->td_proc->p_fd; int error; struct nameidata nd; + struct vnode *vp; error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) return (error); + FILEDESC_LOCK(fdp); if (chroot_allow_open_directories == 0 || - (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) + (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) { + FILEDESC_UNLOCK(fdp); error = chroot_refuse_vdir_fds(fdp); + } else + FILEDESC_UNLOCK(fdp); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = change_dir(&nd, td)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); - vrele(fdp->fd_rdir); + FILEDESC_LOCK(fdp); + vp = fdp->fd_rdir; fdp->fd_rdir = nd.ni_vp; if (!fdp->fd_jdir) { fdp->fd_jdir = nd.ni_vp; VREF(fdp->fd_jdir); } + FILEDESC_UNLOCK(fdp); + vrele(vp); return (0); } /* * Common routine for chroot and chdir. */ static int change_dir(ndp, td) register struct nameidata *ndp; struct thread *td; { struct vnode *vp; int error; error = namei(ndp); if (error) return (error); vp = ndp->ni_vp; if (vp->v_type != VDIR) error = ENOTDIR; else error = VOP_ACCESS(vp, VEXEC, td->td_proc->p_ucred, td); if (error) vput(vp); else VOP_UNLOCK(vp, 0, td); return (error); } /* * Check permissions, allocate an open file structure, * and call the device open routine if any. 
*/ #ifndef _SYS_SYSPROTO_H_ struct open_args { char *path; int flags; int mode; }; #endif int open(td, uap) struct thread *td; register struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ *uap; { struct proc *p = td->td_proc; struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; struct vattr vat; struct mount *mp; int cmode, flags, oflags; struct file *nfp; int type, indx, error; struct flock lf; struct nameidata nd; oflags = SCARG(uap, flags); if ((oflags & O_ACCMODE) == O_ACCMODE) return (EINVAL); flags = FFLAGS(oflags); error = falloc(td, &nfp, &indx); if (error) return (error); fp = nfp; + FILEDESC_LOCK(fdp); cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + FILEDESC_UNLOCK(fdp); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); td->td_dupfd = -indx - 1; /* XXX check for fdopen */ /* * Bump the ref count to prevent another process from closing * the descriptor while we are blocked in vn_open() */ fhold(fp); error = vn_open(&nd, &flags, cmode); if (error) { /* * release our own reference */ fdrop(fp, td); /* * handle special fdopen() case. bleh. dupfdopen() is * responsible for dropping the old contents of ofiles[indx] * if it succeeds. */ if ((error == ENODEV || error == ENXIO) && td->td_dupfd >= 0 && /* XXX from fdopen */ (error = dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) { td->td_retval[0] = indx; return (0); } /* * Clean up the descriptor, but only if another thread hadn't * replaced or closed it. */ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); if (error == ERESTART) error = EINTR; return (error); } td->td_dupfd = 0; NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; /* * There should be 2 references on the file, one from the descriptor * table, and one for us. 
* * Handle the case where someone closed the file (via its file * descriptor) while we were blocked. The end result should look * like opening the file succeeded but it was immediately closed. */ + FILEDESC_LOCK(fdp); + FILE_LOCK(fp); if (fp->f_count == 1) { KASSERT(fdp->fd_ofiles[indx] != fp, ("Open file descriptor lost all refs")); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); vn_close(vp, flags & FMASK, fp->f_cred, td); fdrop(fp, td); td->td_retval[0] = indx; return 0; } fp->f_data = (caddr_t)vp; fp->f_flag = flags & FMASK; fp->f_ops = &vnops; fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + FILEDESC_UNLOCK(fdp); + FILE_UNLOCK(fp); VOP_UNLOCK(vp, 0, td); if (flags & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (flags & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((flags & FNONBLOCK) == 0) type |= F_WAIT; if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) goto bad; fp->f_flag |= FHASLOCK; } if (flags & O_TRUNC) { if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto bad; VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); VATTR_NULL(&vat); vat.va_size = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETATTR(vp, &vat, p->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); if (error) goto bad; } /* assert that vn_open created a backing object if one is needed */ KASSERT(!vn_canvmio(vp) || VOP_GETVOBJECT(vp, NULL) == 0, ("open: vmio vnode has no backing object after vn_open")); /* * Release our private reference, leaving the one associated with * the descriptor table intact. */ fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } - fdrop(fp, td); + } else + FILEDESC_UNLOCK(fdp); return (error); } #ifdef COMPAT_43 /* * Create a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ocreat_args { char *path; int mode; }; #endif int ocreat(td, uap) struct thread *td; register struct ocreat_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct open_args /* { syscallarg(char *) path; syscallarg(int) flags; syscallarg(int) mode; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, mode) = SCARG(uap, mode); SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; return (open(td, &nuap)); } #endif /* COMPAT_43 */ /* * Create a special file. */ #ifndef _SYS_SYSPROTO_H_ struct mknod_args { char *path; int mode; int dev; }; #endif /* ARGSUSED */ int mknod(td, uap) struct thread *td; register struct mknod_args /* { syscallarg(char *) path; syscallarg(int) mode; syscallarg(int) dev; } */ *uap; { struct vnode *vp; struct mount *mp; struct vattr vattr; int error; int whiteout = 0; struct nameidata nd; switch (SCARG(uap, mode) & S_IFMT) { case S_IFCHR: case S_IFBLK: error = suser_td(td); break; default: error = suser_xxx(0, td->td_proc, PRISON_ROOT); break; } if (error) return (error); restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { vrele(vp); error = EEXIST; } else { VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); vattr.va_rdev = SCARG(uap, dev); whiteout = 0; switch (SCARG(uap, mode) & S_IFMT) { case S_IFMT: /* used by badsect to flag bad sectors */ vattr.va_type = VBAD; break; case S_IFCHR: vattr.va_type = VCHR; break; case S_IFBLK: vattr.va_type = VBLK; break; case S_IFWHT: whiteout = 1; break; default: error = EINVAL; break; } } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } if (!error) { 
VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (whiteout) error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); else { error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); } } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); return (error); } /* * Create a named pipe. */ #ifndef _SYS_SYSPROTO_H_ struct mkfifo_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkfifo(td, uap) struct thread *td; register struct mkfifo_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { struct mount *mp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); if (nd.ni_vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VFIFO; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); if (error == 0) vput(nd.ni_vp); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); return (error); } /* * Make a hard file link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct link_args { char *path; char *link; }; #endif /* ARGSUSED */ int link(td, uap) struct thread *td; register struct link_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct vnode *vp; struct mount *mp; struct nameidata nd; int error; bwillwrite(); NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type == VDIR) { vrele(vp); return (EPERM); /* POSIX */ } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) == 0) { if (nd.ni_vp != NULL) { vrele(nd.ni_vp); error = EEXIST; } else { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); } vrele(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); return (error); } /* * Make a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct symlink_args { char *path; char *link; }; #endif /* ARGSUSED */ int symlink(td, uap) struct thread *td; register struct symlink_args /* { syscallarg(char *) path; syscallarg(char *) link; } */ *uap; { struct mount *mp; struct vattr vattr; char *path; int error; struct nameidata nd; path = zalloc(namei_zone); if ((error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) != 0) goto out; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), td); if ((error = namei(&nd)) != 0) goto out; if (nd.ni_vp) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(nd.ni_vp); vput(nd.ni_dvp); error = EEXIST; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == 0) vput(nd.ni_vp); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); out: zfree(namei_zone, path); return (error); } /* * Delete a whiteout from the filesystem. 
*/ /* ARGSUSED */ int undelete(td, uap) struct thread *td; register struct undelete_args /* { syscallarg(char *) path; } */ *uap; { int error; struct mount *mp; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, SCARG(uap, path), td); error = namei(&nd); if (error) return (error); if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_vp) vrele(nd.ni_vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); return (error); } /* * Delete a name from the filesystem. */ #ifndef _SYS_SYSPROTO_H_ struct unlink_args { char *path; }; #endif /* ARGSUSED */ int unlink(td, uap) struct thread *td; struct unlink_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type == VDIR) error = EPERM; /* POSIX */ else { /* * The root of a mounted filesystem cannot be deleted. * * XXX: can this only be a VDIR case? 
*/ if (vp->v_flag & VROOT) error = EBUSY; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (!error) { VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); } NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vput(vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); return (error); } /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct lseek_args { int fd; int pad; off_t offset; int whence; }; #endif int lseek(td, uap) struct thread *td; register struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ *uap; { struct ucred *cred = td->td_proc->p_ucred; - register struct filedesc *fdp = td->td_proc->p_fd; register struct file *fp; - struct vattr vattr; struct vnode *vp; + struct vattr vattr; off_t offset; int error, noneg; - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + fp = ffind_hold(td, uap->fd); + if (fp == NULL) return (EBADF); - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE) { + fdrop(fp, td); return (ESPIPE); + } vp = (struct vnode *)fp->f_data; noneg = (vp->v_type != VCHR); offset = SCARG(uap, offset); switch (SCARG(uap, whence)) { case L_INCR: if (noneg && (fp->f_offset < 0 || (offset > 0 && fp->f_offset > OFF_MAX - offset))) return (EOVERFLOW); offset += fp->f_offset; break; case L_XTND: error = VOP_GETATTR(vp, &vattr, cred, td); if (error) return (error); if (noneg && (vattr.va_size > OFF_MAX || (offset > 0 && vattr.va_size > OFF_MAX - offset))) return (EOVERFLOW); offset += vattr.va_size; break; case L_SET: break; default: + fdrop(fp, td); 
return (EINVAL); } if (noneg && offset < 0) return (EINVAL); fp->f_offset = offset; *(off_t *)(td->td_retval) = fp->f_offset; + fdrop(fp, td); return (0); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Reposition read/write file offset. */ #ifndef _SYS_SYSPROTO_H_ struct olseek_args { int fd; long offset; int whence; }; #endif int olseek(td, uap) struct thread *td; register struct olseek_args /* { syscallarg(int) fd; syscallarg(long) offset; syscallarg(int) whence; } */ *uap; { struct lseek_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) offset; syscallarg(int) whence; } */ nuap; int error; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, offset) = SCARG(uap, offset); SCARG(&nuap, whence) = SCARG(uap, whence); error = lseek(td, &nuap); return (error); } #endif /* COMPAT_43 */ /* * Check access permissions using passed credentials. */ static int vn_access(vp, user_flags, cred, td) struct vnode *vp; int user_flags; struct ucred *cred; struct thread *td; { int error, flags; /* Flags == 0 means only check for existence. */ error = 0; if (user_flags) { flags = 0; if (user_flags & R_OK) flags |= VREAD; if (user_flags & W_OK) flags |= VWRITE; if (user_flags & X_OK) flags |= VEXEC; if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) error = VOP_ACCESS(vp, flags, cred, td); } return (error); } /* * Check access permissions using "real" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct access_args { char *path; int flags; }; #endif int access(td, uap) struct thread *td; register struct access_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct ucred *cred, *tmpcred; register struct vnode *vp; int error; struct nameidata nd; cred = td->td_proc->p_ucred; /* * Create and modify a temporary credential instead of one that * is potentially shared. This could also mess up socket * buffer accounting which can run in an interrupt context. 
* * XXX - Depending on how "threads" are finally implemented, it * may be better to explicitly pass the credential to namei() * rather than to modify the potentially shared process structure. */ tmpcred = crdup(cred); tmpcred->cr_uid = cred->cr_ruid; tmpcred->cr_groups[0] = cred->cr_rgid; td->td_proc->p_ucred = tmpcred; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) goto out1; vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), tmpcred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); out1: td->td_proc->p_ucred = cred; crfree(tmpcred); return (error); } /* * Check access permissions using "effective" credentials. */ #ifndef _SYS_SYSPROTO_H_ struct eaccess_args { char *path; int flags; }; #endif int eaccess(td, uap) struct thread *td; register struct eaccess_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { struct nameidata nd; struct vnode *vp; int error; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_access(vp, SCARG(uap, flags), td->td_proc->p_ucred, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Get file status; this version follows links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ostat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int ostat(td, uap) struct thread *td; register struct ostat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct olstat_args { char *path; struct ostat *ub; }; #endif /* ARGSUSED */ int olstat(td, uap) struct thread *td; register struct olstat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; { struct vnode *vp; struct stat sb; struct ostat osb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); cvtstat(&sb, &osb); error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); return (error); } /* * Convert from an old to a new stat structure. 
*/ void cvtstat(st, ost) struct stat *st; struct ostat *ost; { ost->st_dev = st->st_dev; ost->st_ino = st->st_ino; ost->st_mode = st->st_mode; ost->st_nlink = st->st_nlink; ost->st_uid = st->st_uid; ost->st_gid = st->st_gid; ost->st_rdev = st->st_rdev; if (st->st_size < (quad_t)1 << 32) ost->st_size = st->st_size; else ost->st_size = -2; ost->st_atime = st->st_atime; ost->st_mtime = st->st_mtime; ost->st_ctime = st->st_ctime; ost->st_blksize = st->st_blksize; ost->st_blocks = st->st_blocks; ost->st_flags = st->st_flags; ost->st_gen = st->st_gen; } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Get file status; this version follows links. */ #ifndef _SYS_SYSPROTO_H_ struct stat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int stat(td, uap) struct thread *td; register struct stat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { struct stat sb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_stat(nd.ni_vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Get file status; this version does not follow links. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int lstat(td, uap) struct thread *td; register struct lstat_args /* { syscallarg(char *) path; syscallarg(struct stat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; error = vn_stat(vp, &sb, td); NDFREE(&nd, NDF_ONLY_PNBUF); vput(vp); if (error) return (error); error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); return (error); } /* * Implementation of the NetBSD stat() function. * XXX This should probably be collapsed with the FreeBSD version, * as the differences are only due to vn_stat() clearing spares at * the end of the structures. vn_stat could be split to avoid this, * and thus collapse the following to close to zero code. */ void cvtnstat(sb, nsb) struct stat *sb; struct nstat *nsb; { nsb->st_dev = sb->st_dev; nsb->st_ino = sb->st_ino; nsb->st_mode = sb->st_mode; nsb->st_nlink = sb->st_nlink; nsb->st_uid = sb->st_uid; nsb->st_gid = sb->st_gid; nsb->st_rdev = sb->st_rdev; nsb->st_atimespec = sb->st_atimespec; nsb->st_mtimespec = sb->st_mtimespec; nsb->st_ctimespec = sb->st_ctimespec; nsb->st_size = sb->st_size; nsb->st_blocks = sb->st_blocks; nsb->st_blksize = sb->st_blksize; nsb->st_flags = sb->st_flags; nsb->st_gen = sb->st_gen; nsb->st_qspare[0] = sb->st_qspare[0]; nsb->st_qspare[1] = sb->st_qspare[1]; } #ifndef _SYS_SYSPROTO_H_ struct nstat_args { char *path; struct nstat *ub; }; #endif /* ARGSUSED */ int nstat(td, uap) struct thread *td; register struct nstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { struct stat sb; struct nstat nsb; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, 
NDF_ONLY_PNBUF); error = vn_stat(nd.ni_vp, &sb, td); vput(nd.ni_vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * NetBSD lstat. Get file status; this version does not follow links. */ #ifndef _SYS_SYSPROTO_H_ struct lstat_args { char *path; struct stat *ub; }; #endif /* ARGSUSED */ int nlstat(td, uap) struct thread *td; register struct nlstat_args /* { syscallarg(char *) path; syscallarg(struct nstat *) ub; } */ *uap; { int error; struct vnode *vp; struct stat sb; struct nstat nsb; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); cvtnstat(&sb, &nsb); error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); return (error); } /* * Get configurable pathname variables. */ #ifndef _SYS_SYSPROTO_H_ struct pathconf_args { char *path; int name; }; #endif /* ARGSUSED */ int pathconf(td, uap) struct thread *td; register struct pathconf_args /* { syscallarg(char *) path; syscallarg(int) name; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), td->td_retval); vput(nd.ni_vp); return (error); } /* * Return target name of a symbolic link. 
*/ #ifndef _SYS_SYSPROTO_H_ struct readlink_args { char *path; char *buf; int count; }; #endif /* ARGSUSED */ int readlink(td, uap) struct thread *td; register struct readlink_args /* { syscallarg(char *) path; syscallarg(char *) buf; syscallarg(int) count; } */ *uap; { register struct vnode *vp; struct iovec aiov; struct uio auio; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; if (vp->v_type != VLNK) error = EINVAL; else { aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_offset = 0; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); error = VOP_READLINK(vp, &auio, td->td_proc->p_ucred); } vput(vp); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } /* * Common implementation code for chflags() and fchflags(). */ static int setfflags(td, vp, flags) struct thread *td; struct vnode *vp; int flags; { int error; struct mount *mp; struct vattr vattr; /* * Prevent non-root users from setting flags on devices. When * a device is reused, users can retain ownership of the device * if they are allowed to set flags and programs assume that * chown can't fail when done as root. */ if (vp->v_type == VCHR || vp->v_type == VBLK) { error = suser_xxx(td->td_proc->p_ucred, td->td_proc, PRISON_ROOT); if (error) return (error); } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_flags = flags; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } /* * Change flags of a file given a path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chflags_args { char *path; int flags; }; #endif /* ARGSUSED */ int chflags(td, uap) struct thread *td; register struct chflags_args /* { syscallarg(char *) path; syscallarg(int) flags; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfflags(td, nd.ni_vp, SCARG(uap, flags)); vrele(nd.ni_vp); return error; } /* * Change flags of a file given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchflags_args { int fd; int flags; }; #endif /* ARGSUSED */ int fchflags(td, uap) struct thread *td; register struct fchflags_args /* { syscallarg(int) fd; syscallarg(int) flags; } */ *uap; { struct file *fp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + error = setfflags(td, (struct vnode *) fp->f_data, SCARG(uap, flags)); + fdrop(fp, td); + return (error); } /* * Common implementation code for chmod(), lchmod() and fchmod(). */ static int setfmode(td, vp, mode) struct thread *td; struct vnode *vp; int mode; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_mode = mode & ALLPERMS; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Change mode of a file given path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct chmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int chmod(td, uap) struct thread *td; register struct chmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given path name (don't follow links.) */ #ifndef _SYS_SYSPROTO_H_ struct lchmod_args { char *path; int mode; }; #endif /* ARGSUSED */ int lchmod(td, uap) struct thread *td; register struct lchmod_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfmode(td, nd.ni_vp, SCARG(uap, mode)); vrele(nd.ni_vp); return error; } /* * Change mode of a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fchmod_args { int fd; int mode; }; #endif /* ARGSUSED */ int fchmod(td, uap) struct thread *td; register struct fchmod_args /* { syscallarg(int) fd; syscallarg(int) mode; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + vp = (struct vnode *)fp->f_data; + error = setfmode(td, (struct vnode *)fp->f_data, SCARG(uap, mode)); + fdrop(fp, td); + return (error); } /* * Common implementation for chown(), lchown(), and fchown() */ static int setfown(td, vp, uid, gid) struct thread *td; struct vnode *vp; uid_t uid; gid_t gid; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_uid = uid; vattr.va_gid = gid; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set ownership given a path name. */ #ifndef _SYS_SYSPROTO_H_ struct chown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int chown(td, uap) struct thread *td; register struct chown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a path name, do not cross symlinks. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lchown_args { char *path; int uid; int gid; }; #endif /* ARGSUSED */ int lchown(td, uap) struct thread *td; register struct lchown_args /* { syscallarg(char *) path; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { int error; struct nameidata nd; NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setfown(td, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); vrele(nd.ni_vp); return (error); } /* * Set ownership given a file descriptor. */ #ifndef _SYS_SYSPROTO_H_ struct fchown_args { int fd; int uid; int gid; }; #endif /* ARGSUSED */ int fchown(td, uap) struct thread *td; register struct fchown_args /* { syscallarg(int) fd; syscallarg(int) uid; syscallarg(int) gid; } */ *uap; { struct file *fp; + struct vnode *vp; int error; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setfown(td, (struct vnode *)fp->f_data, + vp = (struct vnode *)fp->f_data; + error = setfown(td, (struct vnode *)fp->f_data, SCARG(uap, uid), SCARG(uap, gid)); + fdrop(fp, td); + return (error); } /* * Common implementation code for utimes(), lutimes(), and futimes(). */ static int getutimes(usrtvp, tsp) const struct timeval *usrtvp; struct timespec *tsp; { struct timeval tv[2]; int error; if (usrtvp == NULL) { microtime(&tv[0]); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); tsp[1] = tsp[0]; } else { if ((error = copyin(usrtvp, tv, sizeof (tv))) != 0) return (error); TIMEVAL_TO_TIMESPEC(&tv[0], &tsp[0]); TIMEVAL_TO_TIMESPEC(&tv[1], &tsp[1]); } return 0; } /* * Common implementation code for utimes(), lutimes(), and futimes(). 
*/ static int setutimes(td, vp, ts, nullflag) struct thread *td; struct vnode *vp; const struct timespec *ts; int nullflag; { int error; struct mount *mp; struct vattr vattr; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); VATTR_NULL(&vattr); vattr.va_atime = ts[0]; vattr.va_mtime = ts[1]; if (nullflag) vattr.va_vaflags |= VA_UTIMES_NULL; error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return error; } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct utimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int utimes(td, uap) struct thread *td; register struct utimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct lutimes_args { char *path; struct timeval *tptr; }; #endif /* ARGSUSED */ int lutimes(td, uap) struct thread *td; register struct lutimes_args /* { syscallarg(char *) path; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct timeval *usrtvp; int error; struct nameidata nd; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = setutimes(td, nd.ni_vp, ts, usrtvp == NULL); vrele(nd.ni_vp); return (error); } /* * Set the access and modification times of a file. */ #ifndef _SYS_SYSPROTO_H_ struct futimes_args { int fd; struct timeval *tptr; }; #endif /* ARGSUSED */ int futimes(td, uap) struct thread *td; register struct futimes_args /* { syscallarg(int ) fd; syscallarg(struct timeval *) tptr; } */ *uap; { struct timespec ts[2]; struct file *fp; struct timeval *usrtvp; int error; usrtvp = SCARG(uap, tptr); if ((error = getutimes(usrtvp, ts)) != 0) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - return setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + error = setutimes(td, (struct vnode *)fp->f_data, ts, usrtvp == NULL); + fdrop(fp, td); + return (error); } /* * Truncate a file given its path name. 
*/ #ifndef _SYS_SYSPROTO_H_ struct truncate_args { char *path; int pad; off_t length; }; #endif /* ARGSUSED */ int truncate(td, uap) struct thread *td; register struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; if (uap->length < 0) return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0 && (error = VOP_ACCESS(vp, VWRITE, td->td_proc->p_ucred, td)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, td->td_proc->p_ucred, td); } vput(vp); vn_finished_write(mp); return (error); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct ftruncate_args { int fd; int pad; off_t length; }; #endif /* ARGSUSED */ int ftruncate(td, uap) struct thread *td; register struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ *uap; { struct mount *mp; struct vattr vattr; struct vnode *vp; struct file *fp; int error; if (uap->length < 0) return(EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FWRITE) == 0) + if ((fp->f_flag & FWRITE) == 0) { + fdrop(fp, td); return (EINVAL); + } vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (vp->v_type == VDIR) error = EISDIR; else if ((error = vn_writechk(vp)) == 0) { VATTR_NULL(&vattr); vattr.va_size = SCARG(uap, length); error = VOP_SETATTR(vp, &vattr, fp->f_cred, td); } VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) /* * Truncate a file given its path name. */ #ifndef _SYS_SYSPROTO_H_ struct otruncate_args { char *path; long length; }; #endif /* ARGSUSED */ int otruncate(td, uap) struct thread *td; register struct otruncate_args /* { syscallarg(char *) path; syscallarg(long) length; } */ *uap; { struct truncate_args /* { syscallarg(char *) path; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, path) = SCARG(uap, path); SCARG(&nuap, length) = SCARG(uap, length); return (truncate(td, &nuap)); } /* * Truncate a file given a file descriptor. 
*/ #ifndef _SYS_SYSPROTO_H_ struct oftruncate_args { int fd; long length; }; #endif /* ARGSUSED */ int oftruncate(td, uap) struct thread *td; register struct oftruncate_args /* { syscallarg(int) fd; syscallarg(long) length; } */ *uap; { struct ftruncate_args /* { syscallarg(int) fd; syscallarg(int) pad; syscallarg(off_t) length; } */ nuap; SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, length) = SCARG(uap, length); return (ftruncate(td, &nuap)); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Sync an open file. */ #ifndef _SYS_SYSPROTO_H_ struct fsync_args { int fd; }; #endif /* ARGSUSED */ int fsync(td, uap) struct thread *td; struct fsync_args /* { syscallarg(int) fd; } */ *uap; { struct vnode *vp; struct mount *mp; struct file *fp; vm_object_t obj; int error; GIANT_REQUIRED; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); vp = (struct vnode *)fp->f_data; - if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) { + fdrop(fp, td); return (error); + } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); if (VOP_GETVOBJECT(vp, &obj) == 0) { vm_object_page_clean(obj, 0, 0, 0); } error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, td); #ifdef SOFTUPDATES if (error == 0 && vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)) error = softdep_fsync(vp); #endif VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); + fdrop(fp, td); return (error); } /* * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. 
*/ #ifndef _SYS_SYSPROTO_H_ struct rename_args { char *from; char *to; }; #endif /* ARGSUSED */ int rename(td, uap) struct thread *td; register struct rename_args /* { syscallarg(char *) from; syscallarg(char *) to; } */ *uap; { struct mount *mp; struct vnode *tvp, *fvp, *tdvp; struct nameidata fromnd, tond; int error; bwillwrite(); NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, SCARG(uap, from), td); if ((error = namei(&fromnd)) != 0) return (error); fvp = fromnd.ni_vp; if ((error = vn_start_write(fvp, &mp, V_WAIT | PCATCH)) != 0) { NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, UIO_USERSPACE, SCARG(uap, to), td); if (fromnd.ni_vp->v_type == VDIR) tond.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&tond)) != 0) { /* Translate error code for rename("dir1", "dir2/."). */ if (error == EISDIR && fvp->v_type == VDIR) error = EINVAL; NDFREE(&fromnd, NDF_ONLY_PNBUF); vrele(fromnd.ni_dvp); vrele(fvp); goto out1; } tdvp = tond.ni_dvp; tvp = tond.ni_vp; if (tvp != NULL) { if (fvp->v_type == VDIR && tvp->v_type != VDIR) { error = ENOTDIR; goto out; } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { error = EISDIR; goto out; } } if (fvp == tdvp) error = EINVAL; /* * If source is the same as the destination (that is the * same inode number with the same name in the same directory), * then there is nothing to do. 
*/ if (fvp == tvp && fromnd.ni_dvp == tdvp && fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, fromnd.ni_cnd.cn_namelen)) error = -1; out: if (!error) { VOP_LEASE(tdvp, td, td->td_proc->p_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) { VOP_LEASE(fromnd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); } if (tvp) { VOP_LEASE(tvp, td, td->td_proc->p_ucred, LEASE_WRITE); } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); } else { NDFREE(&fromnd, NDF_ONLY_PNBUF); NDFREE(&tond, NDF_ONLY_PNBUF); if (tdvp == tvp) vrele(tdvp); else vput(tdvp); if (tvp) vput(tvp); vrele(fromnd.ni_dvp); vrele(fvp); } vrele(tond.ni_startdir); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); if (error == -1) return (0); return (error); } /* * Make a directory file. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mkdir_args { char *path; int mode; }; #endif /* ARGSUSED */ int mkdir(td, uap) struct thread *td; register struct mkdir_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; { return vn_mkdir(uap->path, uap->mode, UIO_USERSPACE, td); } int vn_mkdir(path, mode, segflg, td) char *path; int mode; enum uio_seg segflg; struct thread *td; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, CREATE, LOCKPARENT, segflg, path, td); nd.ni_cnd.cn_flags |= WILLBEDIR; if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp != NULL) { NDFREE(&nd, NDF_ONLY_PNBUF); vrele(vp); vput(nd.ni_dvp); return (EEXIST); } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VATTR_NULL(&vattr); vattr.va_type = VDIR; + FILEDESC_LOCK(td->td_proc->p_fd); vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask; + FILEDESC_UNLOCK(td->td_proc->p_fd); VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (!error) vput(nd.ni_vp); vn_finished_write(mp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); return (error); } /* * Remove a directory file. */ #ifndef _SYS_SYSPROTO_H_ struct rmdir_args { char *path; }; #endif /* ARGSUSED */ int rmdir(td, uap) struct thread *td; struct rmdir_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; int error; struct nameidata nd; restart: bwillwrite(); NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; if (vp->v_type != VDIR) { error = ENOTDIR; goto out; } /* * No rmdir "." please. 
*/ if (nd.ni_dvp == vp) { error = EINVAL; goto out; } /* * The root of a mounted filesystem cannot be deleted. */ if (vp->v_flag & VROOT) { error = EBUSY; goto out; } if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(nd.ni_dvp, td, td->td_proc->p_ucred, LEASE_WRITE); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); vn_finished_write(mp); out: NDFREE(&nd, NDF_ONLY_PNBUF); if (nd.ni_dvp == vp) vrele(nd.ni_dvp); else vput(nd.ni_dvp); vput(vp); ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); return (error); } #ifdef COMPAT_43 /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct ogetdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int ogetdirentries(td, uap) struct thread *td; register struct ogetdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio, kuio; struct iovec aiov, kiov; struct dirent *dp, *edp; caddr_t dirbuf; int error, eofflag, readcnt; long loff; /* XXX arbitrary sanity limit on `count'. 
*/ if (SCARG(uap, count) > 64 * 1024) return (EINVAL); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif { kuio = auio; kuio.uio_iov = &kiov; kuio.uio_segflg = UIO_SYSSPACE; kiov.iov_len = SCARG(uap, count); MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = SCARG(uap, count) - kuio.uio_resid; edp = (struct dirent *)&dirbuf[readcnt]; for (dp = (struct dirent *)dirbuf; dp < edp; ) { # if (BYTE_ORDER == LITTLE_ENDIAN) /* * The expected low byte of * dp->d_namlen is our dp->d_type. * The high MBZ byte of dp->d_namlen * is our dp->d_namlen. */ dp->d_type = dp->d_namlen; dp->d_namlen = 0; # else /* * The dp->d_type is the high byte * of the expected dp->d_namlen, * so must be zero'ed. 
*/ dp->d_type = 0; # endif if (dp->d_reclen > 0) { dp = (struct dirent *) ((char *)dp + dp->d_reclen); } else { error = EIO; break; } } if (dp >= edp) error = uiomove(dirbuf, readcnt, &auio); } FREE(dirbuf, M_TEMP); } VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); + fdrop(fp, td); td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; return (error); } #endif /* COMPAT_43 */ /* * Read a block of directory entries in a file system independent format. */ #ifndef _SYS_SYSPROTO_H_ struct getdirentries_args { int fd; char *buf; u_int count; long *basep; }; #endif int getdirentries(td, uap) struct thread *td; register struct getdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; syscallarg(long *) basep; } */ *uap; { struct vnode *vp; struct file *fp; struct uio auio; struct iovec aiov; long loff; int error, eofflag; if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); - if ((fp->f_flag & FREAD) == 0) + if ((fp->f_flag & FREAD) == 0) { + fdrop(fp, td); return (EBADF); + } vp = (struct vnode *)fp->f_data; unionread: - if (vp->v_type != VDIR) + if (vp->v_type != VDIR) { + fdrop(fp, td); return (EINVAL); + } aiov.iov_base = SCARG(uap, buf); aiov.iov_len = SCARG(uap, count); auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_resid = SCARG(uap, count); /* vn_lock(vp, LK_SHARED | LK_RETRY, td); */ vn_lock(vp, 
LK_EXCLUSIVE | LK_RETRY, td); loff = auio.uio_offset = fp->f_offset; error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, td); - if (error) + if (error) { + fdrop(fp, td); return (error); + } if (SCARG(uap, count) == auio.uio_resid) { if (union_dircheckp) { error = union_dircheckp(td, &vp, fp); if (error == -1) goto unionread; - if (error) + if (error) { + fdrop(fp, td); return (error); + } } if ((vp->v_flag & VROOT) && (vp->v_mount->mnt_flag & MNT_UNION)) { struct vnode *tvp = vp; vp = vp->v_mount->mnt_vnodecovered; VREF(vp); fp->f_data = (caddr_t) vp; fp->f_offset = 0; vrele(tvp); goto unionread; } } if (SCARG(uap, basep) != NULL) { error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), sizeof(long)); } td->td_retval[0] = SCARG(uap, count) - auio.uio_resid; + fdrop(fp, td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct getdents_args { int fd; char *buf; size_t count; }; #endif int getdents(td, uap) struct thread *td; register struct getdents_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; } */ *uap; { struct getdirentries_args ap; ap.fd = uap->fd; ap.buf = uap->buf; ap.count = uap->count; ap.basep = NULL; return getdirentries(td, &ap); } /* * Set the mode mask for creation of filesystem nodes. * * MP SAFE */ #ifndef _SYS_SYSPROTO_H_ struct umask_args { int newmask; }; #endif int umask(td, uap) struct thread *td; struct umask_args /* { syscallarg(int) newmask; } */ *uap; { register struct filedesc *fdp; + FILEDESC_LOCK(td->td_proc->p_fd); fdp = td->td_proc->p_fd; td->td_retval[0] = fdp->fd_cmask; fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + FILEDESC_UNLOCK(td->td_proc->p_fd); return (0); } /* * Void all references to file by ripping underlying filesystem * away from vnode. 
*/ #ifndef _SYS_SYSPROTO_H_ struct revoke_args { char *path; }; #endif /* ARGSUSED */ int revoke(td, uap) struct thread *td; register struct revoke_args /* { syscallarg(char *) path; } */ *uap; { struct mount *mp; struct vnode *vp; struct vattr vattr; int error; struct nameidata nd; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (vp->v_type != VCHR) { error = EINVAL; goto out; } error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td); if (error) goto out; if (td->td_proc->p_ucred->cr_uid != vattr.va_uid) { error = suser_xxx(0, td->td_proc, PRISON_ROOT); if (error) goto out; } if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) goto out; if (vcount(vp) > 1) VOP_REVOKE(vp, REVOKEALL); vn_finished_write(mp); out: vrele(vp); return (error); } /* * Convert a user file descriptor to a kernel file entry. + * The file entry is locked upon returning. */ int getvnode(fdp, fd, fpp) struct filedesc *fdp; int fd; struct file **fpp; { + int error; struct file *fp; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) - return (EBADF); - if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) - return (EINVAL); + fp = NULL; + if (fdp == NULL) + error = EBADF; + else { + FILEDESC_LOCK(fdp); + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + error = EBADF; + else if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) { + fp = NULL; + error = EINVAL; + } else { + fhold(fp); + error = 0; + } + FILEDESC_UNLOCK(fdp); + } *fpp = fp; - return (0); + return (error); } /* * Get (NFS) file handle */ #ifndef _SYS_SYSPROTO_H_ struct getfh_args { char *fname; fhandle_t *fhp; }; #endif int getfh(td, uap) struct thread *td; register struct getfh_args *uap; { struct nameidata nd; fhandle_t fh; register struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); NDINIT(&nd, 
LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, uap->fname, td); error = namei(&nd); if (error) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); vp = nd.ni_vp; bzero(&fh, sizeof(fh)); fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid; error = VFS_VPTOFH(vp, &fh.fh_fid); vput(vp); if (error) return (error); error = copyout(&fh, uap->fhp, sizeof (fh)); return (error); } /* * syscall for the rpc.lockd to use to translate a NFS file handle into * an open descriptor. * * warning: do not remove the suser() call or this becomes one giant * security hole. */ #ifndef _SYS_SYSPROTO_H_ struct fhopen_args { const struct fhandle *u_fhp; int flags; }; #endif int fhopen(td, uap) struct thread *td; struct fhopen_args /* { syscallarg(const struct fhandle *) u_fhp; syscallarg(int) flags; } */ *uap; { struct proc *p = td->td_proc; struct mount *mp; struct vnode *vp; struct fhandle fhp; struct vattr vat; struct vattr *vap = &vat; struct flock lf; struct file *fp; register struct filedesc *fdp = p->p_fd; int fmode, mode, error, type; struct file *nfp; int indx; /* * Must be super user */ error = suser_td(td); if (error) return (error); fmode = FFLAGS(SCARG(uap, flags)); /* why not allow a non-read/write open for our lockd? */ if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT)) return (EINVAL); error = copyin(SCARG(uap,u_fhp), &fhp, sizeof(fhp)); if (error) return(error); /* find the mount point */ mp = vfs_getvfs(&fhp.fh_fsid); if (mp == NULL) return (ESTALE); /* now give me my vnode, it gets returned to me locked */ error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp); if (error) return (error); /* * from now on we have to make sure not * to forget about the vnode * any error that causes an abort must vput(vp) * just set error = err and 'goto bad;'. 
*/ /* * from vn_open */ if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (mode) { error = VOP_ACCESS(vp, mode, p->p_ucred, td); if (error) goto bad; } if (fmode & O_TRUNC) { VOP_UNLOCK(vp, 0, td); /* XXX */ if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) { vrele(vp); return (error); } VOP_LEASE(vp, td, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; error = VOP_SETATTR(vp, vap, p->p_ucred, td); vn_finished_write(mp); if (error) goto bad; } error = VOP_OPEN(vp, fmode, p->p_ucred, td); if (error) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, p->p_ucred)) != 0) goto bad; } if (fmode & FWRITE) vp->v_writecount++; /* * end of vn_open code */ if ((error = falloc(td, &nfp, &indx)) != 0) { if (fmode & FWRITE) vp->v_writecount--; goto bad; } fp = nfp; /* * Hold an extra reference to avoid having fp ripped out * from under us while we block in the lock op */ fhold(fp); nfp->f_data = (caddr_t)vp; nfp->f_flag = fmode & FMASK; nfp->f_ops = &vnops; nfp->f_type = DTYPE_VNODE; if (fmode & (O_EXLOCK | O_SHLOCK)) { lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; if (fmode & O_EXLOCK) lf.l_type = F_WRLCK; else lf.l_type = F_RDLCK; type = F_FLOCK; if ((fmode & FNONBLOCK) == 0) type |= F_WAIT; VOP_UNLOCK(vp, 0, td); if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) != 0) { /* * The lock request failed. Normally close the * descriptor but handle the case where someone might * have dup()d or close()d it when we weren't looking. 
*/ + FILEDESC_LOCK(fdp); if (fdp->fd_ofiles[indx] == fp) { fdp->fd_ofiles[indx] = NULL; + FILEDESC_UNLOCK(fdp); fdrop(fp, td); - } + } else + FILEDESC_UNLOCK(fdp); /* * release our private reference */ fdrop(fp, td); return(error); } vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); fp->f_flag |= FHASLOCK; } if ((vp->v_type == VREG) && (VOP_GETVOBJECT(vp, NULL) != 0)) vfs_object_create(vp, td, p->p_ucred); VOP_UNLOCK(vp, 0, td); fdrop(fp, td); td->td_retval[0] = indx; return (0); bad: vput(vp); return (error); } /* * Stat an (NFS) file handle. */ #ifndef _SYS_SYSPROTO_H_ struct fhstat_args { struct fhandle *u_fhp; struct stat *sb; }; #endif int fhstat(td, uap) struct thread *td; register struct fhstat_args /* { syscallarg(struct fhandle *) u_fhp; syscallarg(struct stat *) sb; } */ *uap; { struct stat sb; fhandle_t fh; struct mount *mp; struct vnode *vp; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t)); if (error) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); error = vn_stat(vp, &sb, td); vput(vp); if (error) return (error); error = copyout(&sb, SCARG(uap, sb), sizeof(sb)); return (error); } /* * Implement fstatfs() for (NFS) file handles. 
*/ #ifndef _SYS_SYSPROTO_H_ struct fhstatfs_args { struct fhandle *u_fhp; struct statfs *buf; }; #endif int fhstatfs(td, uap) struct thread *td; struct fhstatfs_args /* { syscallarg(struct fhandle) *u_fhp; syscallarg(struct statfs) *buf; } */ *uap; { struct statfs *sp; struct mount *mp; struct vnode *vp; struct statfs sb; fhandle_t fh; int error; /* * Must be super user */ error = suser_td(td); if (error) return (error); if ((error = copyin(SCARG(uap, u_fhp), &fh, sizeof(fhandle_t))) != 0) return (error); if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) return (ESTALE); if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) return (error); mp = vp->v_mount; sp = &mp->mnt_stat; vput(vp); if ((error = VFS_STATFS(mp, sp, td)) != 0) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) { bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; sp = &sb; } return (copyout(sp, SCARG(uap, buf), sizeof(*sp))); } /* * Syscall to push extended attribute configuration information into the * VFS. Accepts a path, which it converts to a mountpoint, as well as * a command (int cmd), and attribute name and misc data. For now, the * attribute name is left in userspace for consumption by the VFS_op. * It will probably be changed to be copied into sysspace by the * syscall in the future, once issues with various consumers of the * attribute code have raised their hands. * * Currently this is used only by UFS Extended Attributes. */ int extattrctl(td, uap) struct thread *td; struct extattrctl_args *uap; { struct vnode *filename_vp; struct nameidata nd; struct mount *mp; char attrname[EXTATTR_MAXNAMELEN]; int error; /* * SCARG(uap, attrname) not always defined. We check again later * when we invoke the VFS call so as to pass in NULL there if needed. 
*/ if (SCARG(uap, attrname) != NULL) { error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); } /* * SCARG(uap, filename) not always defined. If it is, grab * a vnode lock, which VFS_EXTATTRCTL() will later release. */ filename_vp = NULL; if (SCARG(uap, filename) != NULL) { NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, filename), td); if ((error = namei(&nd)) != 0) return (error); filename_vp = nd.ni_vp; NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK); } /* SCARG(uap, path) always defined. */ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH); NDFREE(&nd, 0); if (error) { if (filename_vp) vrele(filename_vp); return (error); } if (SCARG(uap, attrname) != NULL) { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), attrname, td); } else { error = VFS_EXTATTRCTL(mp, SCARG(uap, cmd), filename_vp, SCARG(uap, attrnamespace), NULL, td); } vn_finished_write(mp); /* * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, * filename_vp, so vrele it if it is defined. 
*/ if (filename_vp != NULL) vrele(filename_vp); return (error); } /* * extattr_set_vp(): Set a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct mount *mp; struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof(struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_set_file(td, uap) struct thread *td; struct extattr_set_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if 
(error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_set_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_set_fd(td, uap) struct thread *td; struct extattr_set_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_set_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_get_vp(): Get a named extended attribute on a file or directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", * userspace iovec array pointer "iovp", unsigned int iovcnt, * proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct iovec *iovp, unsigned iovcnt, struct thread *td) { struct uio auio; struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV]; u_int iovlen, cnt; int error, i; VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); iovlen = iovcnt * sizeof (struct iovec); if (iovcnt > UIO_SMALLIOV) { if (iovcnt > UIO_MAXIOV) { error = EINVAL; goto done; } MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; } else iov = aiov; auio.uio_iov = iov; auio.uio_iovcnt = iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_td = td; auio.uio_offset = 0; if ((error = copyin((caddr_t)iovp, (caddr_t)iov, iovlen))) goto done; 
auio.uio_resid = 0; for (i = 0; i < iovcnt; i++) { if (iov->iov_len > INT_MAX - auio.uio_resid) { error = EINVAL; goto done; } auio.uio_resid += iov->iov_len; iov++; } cnt = auio.uio_resid; error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, td->td_proc->p_ucred, td); cnt -= auio.uio_resid; td->td_retval[0] = cnt; done: if (needfree) FREE(needfree, M_IOV); VOP_UNLOCK(vp, 0, td); return (error); } int extattr_get_file(td, uap) struct thread *td; struct extattr_get_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_get_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); vrele(nd.ni_vp); return (error); } int extattr_get_fd(td, uap) struct thread *td; struct extattr_get_fd_args *uap; { struct file *fp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); error = extattr_get_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, SCARG(uap, iovp), SCARG(uap, iovcnt), td); + fdrop(fp, td); return (error); } /* * extattr_delete_vp(): Delete a named extended attribute on a file or * directory * * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace", * kernelspace string pointer "attrname", proc "p" * Returns: 0 on success, an error number otherwise * Locks: none * References: vp must be a valid reference for the duration of the call */ static int extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 
0) return (error); VOP_LEASE(vp, td, td->td_proc->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, td->td_proc->p_ucred, td); VOP_UNLOCK(vp, 0, td); vn_finished_write(mp); return (error); } int extattr_delete_file(td, uap) struct thread *td; struct extattr_delete_file_args *uap; { struct nameidata nd; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return(error); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), td); if ((error = namei(&nd)) != 0) return(error); NDFREE(&nd, NDF_ONLY_PNBUF); error = extattr_delete_vp(nd.ni_vp, SCARG(uap, attrnamespace), attrname, td); vrele(nd.ni_vp); return(error); } int extattr_delete_fd(td, uap) struct thread *td; struct extattr_delete_fd_args *uap; { struct file *fp; + struct vnode *vp; char attrname[EXTATTR_MAXNAMELEN]; int error; error = copyinstr(SCARG(uap, attrname), attrname, EXTATTR_MAXNAMELEN, NULL); if (error) return (error); if ((error = getvnode(td->td_proc->p_fd, SCARG(uap, fd), &fp)) != 0) return (error); + vp = (struct vnode *)fp->f_data; error = extattr_delete_vp((struct vnode *)fp->f_data, SCARG(uap, attrnamespace), attrname, td); + fdrop(fp, td); return (error); } Index: head/sys/kern/vfs_vnops.c =================================================================== --- head/sys/kern/vfs_vnops.c (revision 89305) +++ head/sys/kern/vfs_vnops.c (revision 89306) @@ -1,979 +1,979 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 * $FreeBSD$ */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int vn_closefile __P((struct file *fp, struct thread *td)); static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); static int vn_read __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static int vn_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static int vn_kqfilter __P((struct file *fp, struct knote *kn)); static int vn_statfile __P((struct file *fp, struct stat *sb, struct thread *td)); static int vn_write __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); struct fileops vnops = { vn_read, vn_write, vn_ioctl, vn_poll, vn_kqfilter, vn_statfile, vn_closefile }; int vn_open(ndp, flagp, cmode) register struct nameidata *ndp; int *flagp, cmode; { struct thread *td = ndp->ni_cnd.cn_thread; return (vn_open_cred(ndp, flagp, cmode, td->td_proc->p_ucred)); } /* * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. * * Note that this does NOT free nameidata for the successful case, * due to the NDINIT being done elsewhere. 
*/ int vn_open_cred(ndp, flagp, cmode, cred) register struct nameidata *ndp; int *flagp, cmode; struct ucred *cred; { struct vnode *vp; struct mount *mp; struct thread *td = ndp->ni_cnd.cn_thread; struct vattr vat; struct vattr *vap = &vat; int mode, fmode, error; restart: fmode = *flagp; if (fmode & O_CREAT) { ndp->ni_cnd.cn_nameiop = CREATE; ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; bwillwrite(); if ((error = namei(ndp)) != 0) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); vap->va_type = VREG; vap->va_mode = cmode; if (fmode & O_EXCL) vap->va_vaflags |= VA_EXCLUSIVE; if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) { NDFREE(ndp, NDF_ONLY_PNBUF); vput(ndp->ni_dvp); if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0) return (error); goto restart; } VOP_LEASE(ndp->ni_dvp, td, cred, LEASE_WRITE); error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd, vap); vput(ndp->ni_dvp); vn_finished_write(mp); if (error) { NDFREE(ndp, NDF_ONLY_PNBUF); return (error); } ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create"); ASSERT_VOP_LOCKED(ndp->ni_vp, "create"); fmode &= ~O_TRUNC; vp = ndp->ni_vp; } else { if (ndp->ni_dvp == ndp->ni_vp) vrele(ndp->ni_dvp); else vput(ndp->ni_dvp); ndp->ni_dvp = NULL; vp = ndp->ni_vp; if (fmode & O_EXCL) { error = EEXIST; goto bad; } fmode &= ~O_CREAT; } } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = ((fmode & O_NOFOLLOW) ? 
NOFOLLOW : FOLLOW) | LOCKLEAF; if ((error = namei(ndp)) != 0) return (error); vp = ndp->ni_vp; } if (vp->v_type == VLNK) { error = EMLINK; goto bad; } if (vp->v_type == VSOCK) { error = EOPNOTSUPP; goto bad; } if ((fmode & O_CREAT) == 0) { mode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) { error = EISDIR; goto bad; } error = vn_writechk(vp); if (error) goto bad; mode |= VWRITE; } if (fmode & FREAD) mode |= VREAD; if (mode) { error = VOP_ACCESS(vp, mode, cred, td); if (error) goto bad; } } if ((error = VOP_OPEN(vp, fmode, cred, td)) != 0) goto bad; /* * Make sure that a VM object is created for VMIO support. */ if (vn_canvmio(vp) == TRUE) { if ((error = vfs_object_create(vp, td, cred)) != 0) /* XXX: Should VOP_CLOSE() again here. */ goto bad; } if (fmode & FWRITE) vp->v_writecount++; *flagp = fmode; return (0); bad: NDFREE(ndp, NDF_ONLY_PNBUF); vput(vp); *flagp = fmode; return (error); } /* * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ int vn_writechk(vp) register struct vnode *vp; { /* * If there's shared text associated with * the vnode, try to free it up once. If * we fail, we can't allow writing. */ if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); } /* * Vnode close call */ int vn_close(vp, flags, cred, td) register struct vnode *vp; int flags; struct ucred *cred; struct thread *td; { int error; if (flags & FWRITE) vp->v_writecount--; error = VOP_CLOSE(vp, flags, cred, td); /* * XXX - In certain instances VOP_CLOSE has to do the vrele * itself. If the vrele has been done, it will return EAGAIN * to indicate that the vrele should not be done again. When * this happens, we just return success. The correct thing to * do would be to have all VOP_CLOSE instances do the vrele. 
*/ if (error == EAGAIN) return (0); vrele(vp); return (error); } static __inline int sequential_heuristic(struct uio *uio, struct file *fp) { + /* * Sequential heuristic - detect sequential operation */ if ((uio->uio_offset == 0 && fp->f_seqcount > 0) || uio->uio_offset == fp->f_nextoff) { /* * XXX we assume that the filesystem block size is * the default. Not true, but still gives us a pretty * good indicator of how sequential the read operations * are. */ fp->f_seqcount += (uio->uio_resid + BKVASIZE - 1) / BKVASIZE; if (fp->f_seqcount >= 127) fp->f_seqcount = 127; return(fp->f_seqcount << 16); } /* * Not sequential, quick draw-down of seqcount */ if (fp->f_seqcount > 1) fp->f_seqcount = 1; else fp->f_seqcount = 0; return(0); } /* * Package up an I/O request on a vnode into a uio and do it. */ int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) enum uio_rw rw; struct vnode *vp; caddr_t base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *cred; int *aresid; struct thread *td; { struct uio auio; struct iovec aiov; struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { mp = NULL; if (rw == UIO_WRITE && vp->v_type != VCHR && (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; aiov.iov_base = base; aiov.iov_len = len; auio.uio_resid = len; auio.uio_offset = offset; auio.uio_segflg = segflg; auio.uio_rw = rw; auio.uio_td = td; if (rw == UIO_READ) { error = VOP_READ(vp, &auio, ioflg, cred); } else { error = VOP_WRITE(vp, &auio, ioflg, cred); } if (aresid) *aresid = auio.uio_resid; else if (auio.uio_resid && error == 0) error = EIO; if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } /* * Package up an I/O request on a vnode into a uio and do it. 
The I/O * request is split up into smaller chunks and we try to avoid saturating * the buffer cache while potentially holding a vnode locked, so we * check bwillwrite() before calling vn_rdwr(). We also call uio_yield() * to give other processes a chance to lock the vnode (either other processes * core'ing the same binary, or unrelated processes scanning the directory). */ int vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, td) enum uio_rw rw; struct vnode *vp; caddr_t base; int len; off_t offset; enum uio_seg segflg; int ioflg; struct ucred *cred; int *aresid; struct thread *td; { int error = 0; do { int chunk = (len > MAXBSIZE) ? MAXBSIZE : len; if (rw != UIO_READ && vp->v_type == VREG) bwillwrite(); error = vn_rdwr(rw, vp, base, chunk, offset, segflg, ioflg, cred, aresid, td); len -= chunk; /* aresid calc already includes length */ if (error) break; offset += chunk; base += chunk; uio_yield(); } while (len); if (aresid) *aresid += len; return (error); } /* * File table vnode read routine. */ static int vn_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { struct vnode *vp; int error, ioflag; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); vp = (struct vnode *)fp->f_data; ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; VOP_LEASE(vp, td, cred, LEASE_READ); vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, td); if ((flags & FOF_OFFSET) == 0) uio->uio_offset = fp->f_offset; ioflag |= sequential_heuristic(uio, fp); error = VOP_READ(vp, uio, ioflag, cred); if ((flags & FOF_OFFSET) == 0) fp->f_offset = uio->uio_offset; fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0, td); return (error); } /* * File table vnode write routine. 
*/
static int
vn_write(fp, uio, cred, flags, td)
	struct file *fp;
	struct uio *uio;	/* write request; uio_resid consumed on success */
	struct ucred *cred;	/* credential used for lease + VOP_WRITE */
	struct thread *td;
	int flags;		/* FOF_OFFSET => caller supplies uio_offset */
{
	struct vnode *vp;
	struct mount *mp;
	int error, ioflag;

	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
	    uio->uio_td, td));
	vp = (struct vnode *)fp->f_data;
	/*
	 * For regular files, throttle before locking anything so a heavy
	 * writer drains dirty buffers first (see bwillwrite() callers
	 * elsewhere in this file).
	 */
	if (vp->v_type == VREG)
		bwillwrite();
	/*
	 * NOTE(review): the leading '-' below is the svn diff's deletion
	 * marker — this duplicate assignment was removed in r89306.
	 */
-	vp = (struct vnode *)fp->f_data;	/* XXX needed? */
	/* Translate fp->f_flag into VOP_WRITE ioflag bits. */
	ioflag = IO_UNIT;
	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
		ioflag |= IO_APPEND;
	if (fp->f_flag & FNONBLOCK)
		ioflag |= IO_NDELAY;
	if (fp->f_flag & O_DIRECT)
		ioflag |= IO_DIRECT;
	if ((fp->f_flag & O_FSYNC) ||
	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
		ioflag |= IO_SYNC;
	/*
	 * Suspension gate must be entered BEFORE taking the vnode lock;
	 * VCHR is exempt (no filesystem write accounting for devices).
	 */
	mp = NULL;
	if (vp->v_type != VCHR &&
	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	VOP_LEASE(vp, td, cred, LEASE_WRITE);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	/* Without FOF_OFFSET the implicit file offset is used and updated. */
	if ((flags & FOF_OFFSET) == 0)
		uio->uio_offset = fp->f_offset;
	ioflag |= sequential_heuristic(uio, fp);
	error = VOP_WRITE(vp, uio, ioflag, cred);
	if ((flags & FOF_OFFSET) == 0)
		fp->f_offset = uio->uio_offset;
	/* Remember the end offset for the sequential-access heuristic. */
	fp->f_nextoff = uio->uio_offset;
	/* Unlock before vn_finished_write(): mirror of the entry ordering. */
	VOP_UNLOCK(vp, 0, td);
	vn_finished_write(mp);
	return (error);
}

/*
 * File table vnode stat routine.
*/ static int vn_statfile(fp, sb, td) struct file *fp; struct stat *sb; struct thread *td; { struct vnode *vp = (struct vnode *)fp->f_data; return vn_stat(vp, sb, td); } int vn_stat(vp, sb, td) struct vnode *vp; register struct stat *sb; struct thread *td; { struct vattr vattr; register struct vattr *vap; int error; u_short mode; vap = &vattr; error = VOP_GETATTR(vp, vap, td->td_proc->p_ucred, td); if (error) return (error); /* * Zero the spare stat fields */ sb->st_lspare = 0; sb->st_qspare[0] = 0; sb->st_qspare[1] = 0; /* * Copy from vattr table */ if (vap->va_fsid != VNOVAL) sb->st_dev = vap->va_fsid; else sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0]; sb->st_ino = vap->va_fileid; mode = vap->va_mode; switch (vap->va_type) { case VREG: mode |= S_IFREG; break; case VDIR: mode |= S_IFDIR; break; case VBLK: mode |= S_IFBLK; break; case VCHR: mode |= S_IFCHR; break; case VLNK: mode |= S_IFLNK; /* This is a cosmetic change, symlinks do not have a mode. */ if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) sb->st_mode &= ~ACCESSPERMS; /* 0000 */ else sb->st_mode |= ACCESSPERMS; /* 0777 */ break; case VSOCK: mode |= S_IFSOCK; break; case VFIFO: mode |= S_IFIFO; break; default: return (EBADF); }; sb->st_mode = mode; sb->st_nlink = vap->va_nlink; sb->st_uid = vap->va_uid; sb->st_gid = vap->va_gid; sb->st_rdev = vap->va_rdev; if (vap->va_size > OFF_MAX) return (EOVERFLOW); sb->st_size = vap->va_size; sb->st_atimespec = vap->va_atime; sb->st_mtimespec = vap->va_mtime; sb->st_ctimespec = vap->va_ctime; /* * According to www.opengroup.org, the meaning of st_blksize is * "a filesystem-specific preferred I/O block size for this * object. In some filesystem types, this may vary from file * to file" * Default to zero to catch bogus uses of this field. 
*/ if (vap->va_type == VREG) { sb->st_blksize = vap->va_blocksize; } else if (vn_isdisk(vp, NULL)) { sb->st_blksize = vp->v_rdev->si_bsize_best; if (sb->st_blksize < vp->v_rdev->si_bsize_phys) sb->st_blksize = vp->v_rdev->si_bsize_phys; if (sb->st_blksize < BLKDEV_IOSIZE) sb->st_blksize = BLKDEV_IOSIZE; } else { sb->st_blksize = 0; } sb->st_flags = vap->va_flags; if (suser_xxx(td->td_proc->p_ucred, 0, 0)) sb->st_gen = 0; else sb->st_gen = vap->va_gen; #if (S_BLKSIZE == 512) /* Optimize this case */ sb->st_blocks = vap->va_bytes >> 9; #else sb->st_blocks = vap->va_bytes / S_BLKSIZE; #endif return (0); } /* * File table vnode ioctl routine. */ static int vn_ioctl(fp, com, data, td) struct file *fp; u_long com; caddr_t data; struct thread *td; { register struct vnode *vp = ((struct vnode *)fp->f_data); struct vattr vattr; int error; switch (vp->v_type) { case VREG: case VDIR: if (com == FIONREAD) { error = VOP_GETATTR(vp, &vattr, td->td_proc->p_ucred, td); if (error) return (error); *(int *)data = vattr.va_size - fp->f_offset; return (0); } if (com == FIONBIO || com == FIOASYNC) /* XXX */ return (0); /* XXX */ /* fall into ... */ default: #if 0 return (ENOTTY); #endif case VFIFO: case VCHR: case VBLK: if (com == FIODTYPE) { if (vp->v_type != VCHR && vp->v_type != VBLK) return (ENOTTY); *(int *)data = devsw(vp->v_rdev)->d_flags & D_TYPEMASK; return (0); } error = VOP_IOCTL(vp, com, data, fp->f_flag, td->td_proc->p_ucred, td); if (error == 0 && com == TIOCSCTTY) { /* Do nothing if reassigning same control tty */ if (td->td_proc->p_session->s_ttyvp == vp) return (0); /* Get rid of reference to old control tty */ if (td->td_proc->p_session->s_ttyvp) vrele(td->td_proc->p_session->s_ttyvp); td->td_proc->p_session->s_ttyvp = vp; VREF(vp); } return (error); } } /* * File table vnode poll routine. 
*/
static int
vn_poll(fp, events, cred, td)
	struct file *fp;
	int events;
	struct ucred *cred;
	struct thread *td;
{
	/* Thin f_ops wrapper: forward the poll request to the vnode. */
	return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, td));
}

/*
 * Check that the vnode is still valid, and if so
 * acquire requested lock.
 */
int
#ifndef DEBUG_LOCKS
vn_lock(vp, flags, td)
#else
debug_vn_lock(vp, flags, td, filename, line)
#endif
	struct vnode *vp;
	int flags;		/* LK_* flags; LK_INTERLOCK if v_interlock held */
	struct thread *td;
#ifdef DEBUG_LOCKS
	const char *filename;
	int line;
#endif
{
	int error;

	do {
		/* Take the interlock unless the caller already holds it. */
		if ((flags & LK_INTERLOCK) == 0)
			mtx_lock(&vp->v_interlock);
		if ((vp->v_flag & VXLOCK) && vp->v_vxproc != curthread) {
			/*
			 * Vnode is being reclaimed by another thread: sleep
			 * until the reclaim is over (msleep with PDROP
			 * releases the interlock) and report ENOENT.
			 */
			vp->v_flag |= VXWANT;
			msleep(vp, &vp->v_interlock, PINOD | PDROP,
			    "vn_lock", 0);
			error = ENOENT;
		} else {
			if (vp->v_vxproc != NULL)
				log(LOG_INFO, "VXLOCK interlock avoided in vn_lock\n");
#ifdef DEBUG_LOCKS
			vp->filename = filename;
			vp->line = line;
#endif
			/* VOP_LOCK consumes the interlock (LK_INTERLOCK). */
			error = VOP_LOCK(vp,
			    flags | LK_NOPAUSE | LK_INTERLOCK, td);
			if (error == 0)
				return (error);
		}
		/* Interlock was dropped above; re-acquire on next pass. */
		flags &= ~LK_INTERLOCK;
	} while (flags & LK_RETRY);
	return (error);
}

/*
 * File table vnode close routine.
 */
static int
vn_closefile(fp, td)
	struct file *fp;
	struct thread *td;
{

	/* Neutralize further f_ops calls, then close the vnode. */
	fp->f_ops = &badfileops;
	return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
		fp->f_cred, td));
}

/*
 * Preparing to start a filesystem write operation. If the operation is
 * permitted, then we bump the count of operations in progress and
 * proceed. If a suspend request is in progress, we wait until the
 * suspension is over, and then proceed.
 */
int
vn_start_write(vp, mpp, flags)
	struct vnode *vp;	/* may be NULL if *mpp is supplied directly */
	struct mount **mpp;	/* out: mount point charged with the write */
	int flags;		/* V_WAIT/V_NOWAIT, V_XSLEEP, PCATCH */
{
	struct mount *mp;
	int error;

	/*
	 * If a vnode is provided, get and return the mount point that
	 * to which it will write.
	 */
	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
			*mpp = NULL;
			/* EOPNOTSUPP => fs does no write accounting: allow. */
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	if ((mp = *mpp) == NULL)
		return (0);
	/*
	 * Check on status of suspension.
	 */
	while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
		if (flags & V_NOWAIT)
			return (EWOULDBLOCK);
		error = tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
		    "suspfs", 0);
		if (error)
			return (error);
	}
	/* V_XSLEEP: caller only wanted to wait out a suspension. */
	if (flags & V_XSLEEP)
		return (0);
	mp->mnt_writeopcount++;
	return (0);
}

/*
 * Secondary suspension. Used by operations such as vop_inactive
 * routines that are needed by the higher level functions. These
 * are allowed to proceed until all the higher level functions have
 * completed (indicated by mnt_writeopcount dropping to zero). At that
 * time, these operations are halted until the suspension is over.
 */
int
vn_write_suspend_wait(vp, mp, flags)
	struct vnode *vp;	/* may be NULL if mp is supplied directly */
	struct mount *mp;
	int flags;		/* V_NOWAIT, PCATCH */
{
	int error;

	if (vp != NULL) {
		if ((error = VOP_GETWRITEMOUNT(vp, &mp)) != 0) {
			if (error != EOPNOTSUPP)
				return (error);
			return (0);
		}
	}
	/*
	 * If we are not suspended or have not yet reached suspended
	 * mode, then let the operation proceed.
	 */
	if (mp == NULL || (mp->mnt_kern_flag & MNTK_SUSPENDED) == 0)
		return (0);
	if (flags & V_NOWAIT)
		return (EWOULDBLOCK);
	/*
	 * Wait for the suspension to finish.
	 */
	return (tsleep(&mp->mnt_flag, (PUSER - 1) | (flags & PCATCH),
	    "suspfs", 0));
}

/*
 * Filesystem write operation has completed. If we are suspending and this
 * operation is the last one, notify the suspender that the suspension is
 * now in effect.
 */
void
vn_finished_write(mp)
	struct mount *mp;	/* NULL is a no-op (see vn_start_write) */
{

	if (mp == NULL)
		return;
	mp->mnt_writeopcount--;
	if (mp->mnt_writeopcount < 0)
		panic("vn_finished_write: neg cnt");
	/* Last writer out wakes the suspender in vfs_write_suspend(). */
	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
	    mp->mnt_writeopcount <= 0)
		wakeup(&mp->mnt_writeopcount);
}

/*
 * Request a filesystem to suspend write operations.
*/ void vfs_write_suspend(mp) struct mount *mp; { struct thread *td = curthread; if (mp->mnt_kern_flag & MNTK_SUSPEND) return; mp->mnt_kern_flag |= MNTK_SUSPEND; if (mp->mnt_writeopcount > 0) (void) tsleep(&mp->mnt_writeopcount, PUSER - 1, "suspwt", 0); VFS_SYNC(mp, MNT_WAIT, td->td_proc->p_ucred, td); mp->mnt_kern_flag |= MNTK_SUSPENDED; } /* * Request a filesystem to resume write operations. */ void vfs_write_resume(mp) struct mount *mp; { if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) return; mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPENDED); wakeup(&mp->mnt_writeopcount); wakeup(&mp->mnt_flag); } static int vn_kqfilter(struct file *fp, struct knote *kn) { return (VOP_KQFILTER(((struct vnode *)fp->f_data), kn)); } /* * Simplified in-kernel wrapper calls for extended attribute access. * Both calls pass in a NULL credential, authorizing as "kernel" access. * Set IO_NODELOCKED in ioflg if the vnode is already locked. */ int vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int *buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; int error; iov.iov_len = *buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = *buflen; if ((ioflg & IO_NODELOCKED) == 0) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); /* authorize attribute retrieval as kernel */ error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) VOP_UNLOCK(vp, 0, td); if (error == 0) { *buflen = *buflen - auio.uio_resid; } return (error); } /* * XXX failure mode if partially written? 
*/ int vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, int buflen, char *buf, struct thread *td) { struct uio auio; struct iovec iov; struct mount *mp; int error; iov.iov_len = buflen; iov.iov_base = buf; auio.uio_iov = &iov; auio.uio_iovcnt = 1; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_offset = 0; auio.uio_resid = buflen; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } /* authorize attribute setting as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } int vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace, const char *attrname, struct thread *td) { struct mount *mp; int error; if ((ioflg & IO_NODELOCKED) == 0) { if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0) return (error); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td); } /* authorize attribute removal as kernel */ error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL, NULL, td); if ((ioflg & IO_NODELOCKED) == 0) { vn_finished_write(mp); VOP_UNLOCK(vp, 0, td); } return (error); } Index: head/sys/netgraph/ng_socket.c =================================================================== --- head/sys/netgraph/ng_socket.c (revision 89305) +++ head/sys/netgraph/ng_socket.c (revision 89306) @@ -1,1092 +1,1093 @@ /* * ng_socket.c * * Copyright (c) 1996-1999 Whistle Communications, Inc. * All rights reserved. * * Subject to the following obligations and disclaimer of warranty, use and * redistribution of this software, in source or object code forms, with or * without modifications are expressly permitted by Whistle Communications; * provided, however, that: * 1. 
Any and all reproductions of the source or object code must include the * copyright notice above and the following disclaimer of warranties; and * 2. No rights are granted, in any manner or form, to use Whistle * Communications, Inc. trademarks, including the mark "WHISTLE * COMMUNICATIONS" on advertising, endorsements, or otherwise except as * such appears in the above copyright notice or in the software. * * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE, * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT. * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE. * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY * OF SUCH DAMAGE. * * Author: Julian Elischer * * $FreeBSD$ * $Whistle: ng_socket.c,v 1.28 1999/11/01 09:24:52 julian Exp $ */ /* * Netgraph socket nodes * * There are two types of netgraph sockets, control and data. * Control sockets have a netgraph node, but data sockets are * parasitic on control sockets, and have no node of their own. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NOTYET #include #endif #include #include #include #include #ifdef NG_SEPARATE_MALLOC MALLOC_DEFINE(M_NETGRAPH_PATH, "netgraph_path", "netgraph path info "); MALLOC_DEFINE(M_NETGRAPH_SOCK, "netgraph_sock", "netgraph socket info "); #else #define M_NETGRAPH_PATH M_NETGRAPH #define M_NETGRAPH_SOCK M_NETGRAPH #endif /* * It's Ascii-art time! * +-------------+ +-------------+ * |socket (ctl)| |socket (data)| * +-------------+ +-------------+ * ^ ^ * | | * v v * +-----------+ +-----------+ * |pcb (ctl)| |pcb (data)| * +-----------+ +-----------+ * ^ ^ * | | * v v * +--------------------------+ * | Socket type private | * | data | * +--------------------------+ * ^ * | * v * +----------------+ * | struct ng_node | * +----------------+ */ /* Netgraph node methods */ static ng_constructor_t ngs_constructor; static ng_rcvmsg_t ngs_rcvmsg; static ng_shutdown_t ngs_shutdown; static ng_newhook_t ngs_newhook; static ng_connect_t ngs_connect; static ng_rcvdata_t ngs_rcvdata; static ng_disconnect_t ngs_disconnect; /* Internal methods */ static int ng_attach_data(struct socket *so); static int ng_attach_cntl(struct socket *so); static int ng_attach_common(struct socket *so, int type); static void ng_detach_common(struct ngpcb *pcbp, int type); /*static int ng_internalize(struct mbuf *m, struct thread *p); */ static int ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp); static int ng_bind(struct sockaddr *nam, struct ngpcb *pcbp); static int ngs_mod_event(module_t mod, int event, void *data); static int ship_msg(struct ngpcb *pcbp, struct ng_mesg *msg, struct sockaddr_ng *addr); /* Netgraph type descriptor */ static struct ng_type typestruct = { NG_ABI_VERSION, NG_SOCKET_NODE_TYPE, ngs_mod_event, ngs_constructor, ngs_rcvmsg, ngs_shutdown, ngs_newhook, NULL, ngs_connect, ngs_rcvdata, ngs_disconnect, NULL }; NETGRAPH_INIT(socket, &typestruct); /* 
Buffer space */ static u_long ngpdg_sendspace = 20 * 1024; /* really max datagram size */ static u_long ngpdg_recvspace = 20 * 1024; /* List of all sockets */ static LIST_HEAD(, ngpcb) ngsocklist; #define sotongpcb(so) ((struct ngpcb *)(so)->so_pcb) /* If getting unexplained errors returned, set this to "Debugger("X"); */ #ifndef TRAP_ERROR #define TRAP_ERROR #endif /*************************************************************** Control sockets ***************************************************************/ static int ngc_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (suser_td(td)) return (EPERM); if (pcbp != NULL) return (EISCONN); return (ng_attach_cntl(so)); } static int ngc_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == NULL) return (EINVAL); ng_detach_common(pcbp, NG_CONTROL); return (0); } static int ngc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; struct ng_mesg *msg; struct mbuf *m0; char *path = NULL; int len, error = 0; if (pcbp == NULL) { error = EINVAL; goto release; } #ifdef NOTYET if (control && (error = ng_internalize(control, td))) { if (pcbp->sockdata == NULL) { error = ENOTCONN; goto release; } } #else /* NOTYET */ if (control) { error = EINVAL; goto release; } #endif /* NOTYET */ /* Require destination as there may be >= 1 hooks on this node */ if (addr == NULL) { error = EDESTADDRREQ; goto release; } /* Allocate an expendable buffer for the path, chop off * the sockaddr header, and make sure it's NUL terminated */ len = sap->sg_len - 2; MALLOC(path, char *, len + 1, M_NETGRAPH_PATH, M_WAITOK); if (path == NULL) { error = ENOMEM; goto release; } bcopy(sap->sg_data, path, len); path[len] = '\0'; /* Move the actual message out of mbufs into a linear buffer. 
* Start by adding up the size of the data. (could use mh_len?) */ for (len = 0, m0 = m; m0 != NULL; m0 = m0->m_next) len += m0->m_len; /* Move the data into a linear buffer as well. Messages are not * delivered in mbufs. */ MALLOC(msg, struct ng_mesg *, len + 1, M_NETGRAPH_MSG, M_WAITOK); if (msg == NULL) { error = ENOMEM; goto release; } m_copydata(m, 0, len, (char *)msg); #ifdef TRACE_MESSAGES do { item_p item; if ((item = ng_package_msg(msg)) == NULL) { (msg) = NULL; (error) = ENOMEM; printf("err=%d\n",error); break; } if (((error) = ng_address_path((pcbp->sockdata->node), (item), (path), (NULL))) == 0) { printf("[%x]:<---------[socket]: c=<%d>cmd=%x(%s) f=%x #%d (%s)\n", item->el_dest->nd_ID, msg->header.typecookie, msg->header.cmd, msg->header.cmdstr, msg->header.flags, msg->header.token, item->el_dest->nd_type->name); SAVE_LINE(item); (error) = ng_snd_item((item), 0); } else { printf("errx=%d\n",error); } (msg) = NULL; } while (0); #else /* The callee will free the msg when done. The path is our business. */ NG_SEND_MSG_PATH(error, pcbp->sockdata->node, msg, path, NULL); #endif release: if (path != NULL) FREE(path, M_NETGRAPH_PATH); if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngc_bind(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == 0) return (EINVAL); return (ng_bind(nam, pcbp)); } static int ngc_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { printf(" program tried to connect control socket to remote node\n "); /* * At this time refuse to do this.. it used to * do something but it was undocumented and not used. 
*/ return (EINVAL); } /*************************************************************** Data sockets ***************************************************************/ static int ngd_attach(struct socket *so, int proto, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp != NULL) return (EISCONN); return (ng_attach_data(so)); } static int ngd_detach(struct socket *so) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == NULL) return (EINVAL); ng_detach_common(pcbp, NG_DATA); return (0); } static int ngd_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr; int len, error; hook_p hook = NULL; char hookname[NG_HOOKLEN + 1]; if ((pcbp == NULL) || (control != NULL)) { error = EINVAL; goto release; } if (pcbp->sockdata == NULL) { error = ENOTCONN; goto release; } /* * If the user used any of these ways to not specify an address * then handle specially. */ if ((sap == NULL) || ((len = sap->sg_len - 2) <= 0) || (*sap->sg_data == '\0')) { if (NG_NODE_NUMHOOKS(pcbp->sockdata->node) != 1) { error = EDESTADDRREQ; goto release; } /* * if exactly one hook exists, just use it. * Special case to allow write(2) to work on an ng_socket. 
*/ hook = LIST_FIRST(&pcbp->sockdata->node->nd_hooks); } else { if (len > NG_HOOKLEN) { error = EINVAL; goto release; } /* * chop off the sockaddr header, and make sure it's NUL * terminated */ bcopy(sap->sg_data, hookname, len); hookname[len] = '\0'; /* Find the correct hook from 'hookname' */ LIST_FOREACH(hook, &pcbp->sockdata->node->nd_hooks, hk_hooks) { if (strcmp(hookname, NG_HOOK_NAME(hook)) == 0) { break; } } if (hook == NULL) { error = EHOSTUNREACH; } } /* Send data (OK if hook is NULL) */ NG_SEND_DATA_ONLY(error, hook, m); /* makes m NULL */ release: if (control != NULL) m_freem(control); if (m != NULL) m_freem(m); return (error); } static int ngd_connect(struct socket *so, struct sockaddr *nam, struct thread *td) { struct ngpcb *const pcbp = sotongpcb(so); if (pcbp == 0) return (EINVAL); return (ng_connect_data(nam, pcbp)); } /* * Used for both data and control sockets */ static int ng_setsockaddr(struct socket *so, struct sockaddr **addr) { struct ngpcb *pcbp; struct sockaddr_ng *sg; int sg_len, namelen, s; /* Why isn't sg_data a `char[1]' ? :-( */ sg_len = sizeof(struct sockaddr_ng) - sizeof(sg->sg_data) + 1; s = splnet(); pcbp = sotongpcb(so); if ((pcbp == NULL) || (pcbp->sockdata == NULL)) { splx(s); return (EINVAL); } namelen = 0; /* silence compiler ! */ if ( NG_NODE_HAS_NAME(pcbp->sockdata->node)) sg_len += namelen = strlen(NG_NODE_NAME(pcbp->sockdata->node)); MALLOC(sg, struct sockaddr_ng *, sg_len, M_SONAME, M_WAITOK | M_ZERO); if (NG_NODE_HAS_NAME(pcbp->sockdata->node)) bcopy(NG_NODE_NAME(pcbp->sockdata->node), sg->sg_data, namelen); splx(s); sg->sg_len = sg_len; sg->sg_family = AF_NETGRAPH; *addr = (struct sockaddr *)sg; return (0); } /* * Attach a socket to it's protocol specific partner. * For a control socket, actually create a netgraph node and attach * to it as well. 
 */
static int
ng_attach_cntl(struct socket *so)
{
	struct ngsock *privdata;
	struct ngpcb *pcbp;
	int error;

	/* Setup protocol control block */
	if ((error = ng_attach_common(so, NG_CONTROL)) != 0)
		return (error);
	pcbp = sotongpcb(so);

	/* Allocate node private info */
	MALLOC(privdata, struct ngsock *,
	    sizeof(*privdata), M_NETGRAPH_SOCK, M_WAITOK | M_ZERO);
	if (privdata == NULL) {
		/* undo the common attachment on failure */
		ng_detach_common(pcbp, NG_CONTROL);
		return (ENOMEM);
	}

	/* Make the generic node components */
	if ((error = ng_make_node_common(&typestruct, &privdata->node)) != 0) {
		FREE(privdata, M_NETGRAPH_SOCK);
		ng_detach_common(pcbp, NG_CONTROL);
		return (error);
	}
	NG_NODE_SET_PRIVATE(privdata->node, privdata);

	/* Link the pcb and the node private data */
	privdata->ctlsock = pcbp;
	pcbp->sockdata = privdata;
	privdata->refs++;
	return (0);
}

/*
 * Attach a data socket: only the shared pcb setup is needed, the
 * node association happens later at connect time.
 */
static int
ng_attach_data(struct socket *so)
{
	return(ng_attach_common(so, NG_DATA));
}

/*
 * Set up a socket protocol control block.
 * This code is shared between control and data sockets.
 */
static int
ng_attach_common(struct socket *so, int type)
{
	struct ngpcb *pcbp;
	int error;

	/* Standard socket setup stuff */
	error = soreserve(so, ngpdg_sendspace, ngpdg_recvspace);
	if (error)
		return (error);

	/* Allocate the pcb */
	MALLOC(pcbp, struct ngpcb *, sizeof(*pcbp), M_PCB, M_WAITOK | M_ZERO);
	if (pcbp == NULL)
		return (ENOMEM);
	pcbp->type = type;

	/* Link the pcb and the socket */
	so->so_pcb = (caddr_t) pcbp;
	pcbp->ng_socket = so;

	/* Add the socket to linked list */
	LIST_INSERT_HEAD(&ngsocklist, pcbp, socks);
	return (0);
}

/*
 * Disassociate the socket from it's protocol specific
 * partner. If it's attached to a node's private data structure,
 * then unlink from that too. If we were the last socket attached to it,
 * then shut down the entire node. Shared code for control and data sockets.
 */
static void
ng_detach_common(struct ngpcb *pcbp, int which)
{
	struct ngsock *priv;

	if (pcbp->sockdata) {
		priv = pcbp->sockdata;
		pcbp->sockdata = NULL;
		switch (which) {
		case NG_CONTROL:
			priv->ctlsock = NULL;
			break;
		case NG_DATA:
			priv->datasock = NULL;
			break;
		default:
			panic(__func__);
		}
		/* last socket gone: ask the node to remove itself */
		if ((--priv->refs == 0) && (priv->node != NULL))
			ng_rmnode_self(priv->node);
	}
	pcbp->ng_socket->so_pcb = NULL;
	pcbp->ng_socket = NULL;
	LIST_REMOVE(pcbp, socks);
	FREE(pcbp, M_PCB);
}

#ifdef NOTYET
/*
 * File descriptors can be passed into a AF_NETGRAPH socket.
 * Note, that file descriptors cannot be passed OUT.
 * Only character device descriptors are accepted.
 * Character devices are useful to connect a graph to a device,
 * which after all is the purpose of this whole system.
 *
 * NOTE(review): this region is a unified-diff hunk; the '-' lines are
 * the pre-89306 fdp->fd_ofiles lookup, the '+' lines the new
 * ffind_hold()/fdrop() based lookup.  Markers preserved verbatim.
 */
static int
ng_internalize(struct mbuf *control, struct thread *td)
{
-	struct filedesc *fdp = td->td_proc->p_fd;
	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
	struct file *fp;
	struct vnode *vn;
	int oldfds;
	int fd;

	if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
	    cm->cmsg_len != control->m_len) {
		TRAP_ERROR;
		return (EINVAL);
	}

	/* Check there is only one FD. XXX what would more than one signify? */
	oldfds = ((caddr_t)cm + cm->cmsg_len - (caddr_t)data) / sizeof (int);
	if (oldfds != 1) {
		TRAP_ERROR;
		return (EINVAL);
	}

	/* Check that the FD given is legit. and change it to a pointer to a
	 * struct file. */
	fd = CMSG_DATA(cm);
-	if ((unsigned) fd >= fdp->fd_nfiles
-	    || (fp = fdp->fd_ofiles[fd]) == NULL) {
+	fp = ffind_hold(td, fd);
+	if (fp == NULL)
		return (EBADF);
-	}

	/* Depending on what kind of resource it is, act differently. For
	 * devices, we treat it as a file. For a AF_NETGRAPH socket,
	 * shortcut straight to the node. */
	switch (fp->f_type) {
	case DTYPE_VNODE:
		vn = (struct vnode *) fp->f_data;
		if (vn && (vn->v_type == VCHR)) {
			/* for a VCHR, actually reference the FILE */
			fp->f_count++;
			/* XXX then what :) */
			/* how to pass on to other modules? */
		} else {
+			fdrop(fp, td);
			TRAP_ERROR;
			return (EINVAL);
		}
		break;
	default:
+		fdrop(fp, td);
		TRAP_ERROR;
		return (EINVAL);
	}
+	fdrop(fp, td);
	return (0);
}
#endif /* NOTYET */

/*
 * Connect the data socket to a named control socket node.
 */
static int
ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp)
{
	struct sockaddr_ng *sap;
	node_p farnode;
	struct ngsock *priv;
	int error;
	item_p item;

	/* If we are already connected, don't do it again */
	if (pcbp->sockdata != NULL)
		return (EISCONN);

	/* Find the target (victim) and check it doesn't already have a data
	 * socket. Also check it is a 'socket' type node.
	 * Use ng_package_data() and address_path() to do this.
	 */
	sap = (struct sockaddr_ng *) nam;
	/* The item will hold the node reference */
	item = ng_package_data(NULL, NULL);
	if (item == NULL) {
		return (ENOMEM);
	}
	if ((error = ng_address_path(NULL, item, sap->sg_data, NULL)))
		return (error); /* item is freed on failure */

	/*
	 * Extract node from item and free item. Remember we now have
	 * a reference on the node. The item holds it for us.
	 * when we free the item we release the reference.
	 */
	farnode = item->el_dest; /* shortcut */
	if (strcmp(farnode->nd_type->name, NG_SOCKET_NODE_TYPE) != 0) {
		NG_FREE_ITEM(item); /* drop the reference to the node */
		return (EINVAL);
	}
	priv = NG_NODE_PRIVATE(farnode);
	if (priv->datasock != NULL) {
		NG_FREE_ITEM(item); /* drop the reference to the node */
		return (EADDRINUSE);
	}

	/*
	 * Link the PCB and the private data struct. and note the extra
	 * reference. Drop the extra reference on the node.
	 */
	priv->datasock = pcbp;
	pcbp->sockdata = priv;
	priv->refs++; /* XXX possible race if it's being freed */
	NG_FREE_ITEM(item); /* drop the reference to the node */
	return (0);
}

/*
 * Binding a socket means giving the corresponding node a name
 */
static int
ng_bind(struct sockaddr *nam, struct ngpcb *pcbp)
{
	struct ngsock *const priv = pcbp->sockdata;
	struct sockaddr_ng *const sap = (struct sockaddr_ng *) nam;

	if (priv == NULL) {
		TRAP_ERROR;
		return (EINVAL);
	}
	/* sanity-check the sockaddr: non-empty, NUL-terminated name */
	if ((sap->sg_len < 4)
	    || (sap->sg_len > (NG_NODELEN + 3))
	    || (sap->sg_data[0] == '\0')
	    || (sap->sg_data[sap->sg_len - 3] != '\0')) {
		TRAP_ERROR;
		return (EINVAL);
	}
	return (ng_name_node(priv->node, sap->sg_data));
}

/*
 * Take a message and pass it up to the control socket associated
 * with the node.
 */
static int
ship_msg(struct ngpcb *pcbp, struct ng_mesg *msg, struct sockaddr_ng *addr)
{
	struct socket *const so = pcbp->ng_socket;
	struct mbuf *mdata;
	int msglen;

	/* Copy the message itself into an mbuf chain */
	msglen = sizeof(struct ng_mesg) + msg->header.arglen;
	mdata = m_devget((caddr_t) msg, msglen, 0, NULL, NULL);

	/* Here we free the message, as we are the end of the line.
	 * We need to do that regardless of whether we got mbufs. */
	NG_FREE_MSG(msg);

	if (mdata == NULL) {
		TRAP_ERROR;
		return (ENOBUFS);
	}

	/* Send it up to the socket */
	if (sbappendaddr(&so->so_rcv,
	    (struct sockaddr *) addr, mdata, NULL) == 0) {
		TRAP_ERROR;
		m_freem(mdata);
		return (ENOBUFS);
	}
	sorwakeup(so);
	return (0);
}

/*
 * You can only create new nodes from the socket end of things.
 */
static int
ngs_constructor(node_p nodep)
{
	return (EINVAL);
}

/*
 * We allow any hook to be connected to the node.
 * There is no per-hook private information though.
 */
static int
ngs_newhook(node_p node, hook_p hook, const char *name)
{
	NG_HOOK_SET_PRIVATE(hook, NG_NODE_PRIVATE(node));
	return (0);
}

/*
 * if only one hook, allow read(2) and write(2) to work.
 */
static int
ngs_connect(hook_p hook)
{
	node_p node = NG_HOOK_NODE(hook);
	struct ngsock *priv = NG_NODE_PRIVATE(node);

	/* reflect single-hook status into the data socket's state */
	if ((priv->datasock)
	&&  (priv->datasock->ng_socket)) {
		if (NG_NODE_NUMHOOKS(node) == 1) {
			priv->datasock->ng_socket->so_state |= SS_ISCONNECTED;
		} else {
			priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED;
		}
	}
	return (0);
}

/*
 * Incoming messages get passed up to the control socket.
 * Unless they are for us specifically (socket_type)
 */
static int
ngs_rcvmsg(node_p node, item_p item, hook_p lasthook)
{
	struct ngsock *const priv = NG_NODE_PRIVATE(node);
	struct ngpcb *const pcbp = priv->ctlsock;
	struct sockaddr_ng *addr;
	int addrlen;
	int error = 0;
	struct ng_mesg *msg;
	ng_ID_t retaddr = NGI_RETADDR(item);
	char retabuf[32];

	NGI_GET_MSG(item, msg);
	NG_FREE_ITEM(item); /* we have all we need */

	/* Only allow mesgs to be passed if we have the control socket.
	 * Data sockets can only support the generic messages. */
	if (pcbp == NULL) {
		TRAP_ERROR;
		return (EINVAL);
	}
#ifdef TRACE_MESSAGES
	printf("[%x]:---------->[socket]: c=<%d>cmd=%x(%s) f=%x #%d\n",
		retaddr, msg->header.typecookie,
		msg->header.cmd, msg->header.cmdstr,
		msg->header.flags, msg->header.token);
#endif

	/* Messages aimed at the socket node itself: linger control */
	if (msg->header.typecookie == NGM_SOCKET_COOKIE) {
		switch (msg->header.cmd) {
		case NGM_SOCK_CMD_NOLINGER:
			priv->flags |= NGS_FLAG_NOLINGER;
			break;
		case NGM_SOCK_CMD_LINGER:
			priv->flags &= ~NGS_FLAG_NOLINGER;
			break;
		default:
			error = EINVAL;	/* unknown command */
		}
		/* Free the message and return */
		NG_FREE_MSG(msg);
		return(error);
	}
	/* Get the return address into a sockaddr */
	sprintf(retabuf,"[%x]:", retaddr);
	addrlen = strlen(retabuf);
	/*
	 * NOTE(review): if this M_NOWAIT allocation fails, msg (already
	 * taken from the item above) appears to be leaked — confirm.
	 */
	MALLOC(addr, struct sockaddr_ng *, addrlen + 4,
	    M_NETGRAPH_PATH, M_NOWAIT);
	if (addr == NULL) {
		TRAP_ERROR;
		return (ENOMEM);
	}
	addr->sg_len = addrlen + 3;
	addr->sg_family = AF_NETGRAPH;
	bcopy(retabuf, addr->sg_data, addrlen);
	addr->sg_data[addrlen] = '\0';

	/* Send it up */
	error = ship_msg(pcbp, msg, addr);
	FREE(addr, M_NETGRAPH_PATH);
	return (error);
}

/*
 * Receive data on a hook
 */
static int
ngs_rcvdata(hook_p hook, item_p item)
{
	struct ngsock *const priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
	struct ngpcb *const pcbp = priv->datasock;
	struct socket *so;
	struct sockaddr_ng *addr;
	/*
	 * NOTE(review): declared as an array of char POINTERS, not chars;
	 * presumably a typo that merely over-sizes the on-stack buffer.
	 */
	char *addrbuf[NG_HOOKLEN + 1 + 4];
	int addrlen;
	struct mbuf *m;

	NGI_GET_M(item, m);
	NG_FREE_ITEM(item);
	/* If there is no data socket, black-hole it */
	if (pcbp == NULL) {
		NG_FREE_M(m);
		return (0);
	}
	so = pcbp->ng_socket;

	/* Get the return address into a sockaddr. */
	addrlen = strlen(NG_HOOK_NAME(hook));	/* <= NG_HOOKLEN */
	addr = (struct sockaddr_ng *) addrbuf;
	addr->sg_len = addrlen + 3;
	addr->sg_family = AF_NETGRAPH;
	bcopy(NG_HOOK_NAME(hook), addr->sg_data, addrlen);
	addr->sg_data[addrlen] = '\0';

	/* Try to tell the socket which hook it came in on */
	if (sbappendaddr(&so->so_rcv, (struct sockaddr *) addr, m, NULL) == 0) {
		m_freem(m);
		TRAP_ERROR;
		return (ENOBUFS);
	}
	sorwakeup(so);
	return (0);
}

/*
 * Hook disconnection
 *
 * For this type, removal of the last link destroys the node
 * if the NOLINGER flag is set.
 */
static int
ngs_disconnect(hook_p hook)
{
	node_p node = NG_HOOK_NODE(hook);
	struct ngsock *const priv = NG_NODE_PRIVATE(node);

	/* keep the data socket's connected state in sync (as in connect) */
	if ((priv->datasock)
	&&  (priv->datasock->ng_socket)) {
		if (NG_NODE_NUMHOOKS(node) == 1) {
			priv->datasock->ng_socket->so_state |= SS_ISCONNECTED;
		} else {
			priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED;
		}
	}

	if ((priv->flags & NGS_FLAG_NOLINGER )
	&& (NG_NODE_NUMHOOKS(node) == 0)
	&& (NG_NODE_IS_VALID(node))) {
		ng_rmnode_self(node);
	}
	return (0);
}

/*
 * Do local shutdown processing.
 * In this case, that involves making sure the socket
 * knows we should be shutting down.
 */
static int
ngs_shutdown(node_p node)
{
	struct ngsock *const priv = NG_NODE_PRIVATE(node);
	struct ngpcb *const dpcbp = priv->datasock;
	struct ngpcb *const pcbp = priv->ctlsock;

	/* detach any remaining data and control sockets */
	if (dpcbp != NULL) {
		soisdisconnected(dpcbp->ng_socket);
		dpcbp->sockdata = NULL;
		priv->datasock = NULL;
		priv->refs--;
	}
	if (pcbp != NULL) {
		soisdisconnected(pcbp->ng_socket);
		pcbp->sockdata = NULL;
		priv->ctlsock = NULL;
		priv->refs--;
	}
	NG_NODE_SET_PRIVATE(node, NULL);
	NG_NODE_UNREF(node);
	FREE(priv, M_NETGRAPH_SOCK);
	return (0);
}

/* No-op disconnect handler shared by both socket types. */
static int
dummy_disconnect(struct socket *so)
{
	return (0);
}

/*
 * Control and data socket type descriptors
 */
static struct pr_usrreqs ngc_usrreqs = {
	NULL,			/* abort */
	pru_accept_notsupp,
	ngc_attach,
	ngc_bind,
	ngc_connect,
	pru_connect2_notsupp,
	pru_control_notsupp,
	ngc_detach,
	dummy_disconnect,	/* disconnect */
	pru_listen_notsupp,
	NULL,			/* setpeeraddr */
	pru_rcvd_notsupp,
	pru_rcvoob_notsupp,
	ngc_send,
	pru_sense_null,
	NULL,			/* shutdown */
	ng_setsockaddr,
	sosend,
	soreceive,
	sopoll
};

static struct pr_usrreqs ngd_usrreqs = {
	NULL,			/* abort */
	pru_accept_notsupp,
	ngd_attach,
	NULL,			/* bind */
	ngd_connect,
	pru_connect2_notsupp,
	pru_control_notsupp,
	ngd_detach,
	dummy_disconnect,	/* disconnect */
	pru_listen_notsupp,
	NULL,			/* setpeeraddr */
	pru_rcvd_notsupp,
	pru_rcvoob_notsupp,
	ngd_send,
	pru_sense_null,
	NULL,			/* shutdown */
	ng_setsockaddr,
	sosend,
	soreceive,
	sopoll
};

/*
 * Definitions of protocols supported in the NETGRAPH domain.
 */
extern struct domain ngdomain;	/* stop compiler warnings */

static struct protosw ngsw[] = {
	{
		SOCK_DGRAM,	/* protocol type */
		&ngdomain,	/* backpointer to domain */
		NG_CONTROL,
		PR_ATOMIC | PR_ADDR /* | PR_RIGHTS */,	/* flags */
		0, 0, 0, 0,	/* input, output, ctlinput, ctloutput */
		NULL,		/* ousrreq */
		0, 0, 0, 0,	/* init, fasttimeo, slowtimo, drain */
		&ngc_usrreqs,	/* usrreq table (above) */
		/*{NULL}*/	/* pffh (protocol filter head?) */
	},
	{
		SOCK_DGRAM,	/* protocol type */
		&ngdomain,	/* backpointer to domain */
		NG_DATA,
		PR_ATOMIC | PR_ADDR,	/* flags */
		0, 0, 0, 0,	/* input, output, ctlinput, ctloutput */
		NULL,		/* ousrreq() */
		0, 0, 0, 0,	/* init, fasttimeo, slowtimo, drain */
		&ngd_usrreqs,	/* usrreq table (above) */
		/*{NULL}*/	/* pffh (protocol filter head?) */
	}
};

struct domain ngdomain = {
	AF_NETGRAPH,
	"netgraph",
	NULL,					/* init() */
	NULL,					/* externalise() */
	NULL,					/* dispose() */
	ngsw,					/* protosw entry */
	&ngsw[sizeof(ngsw) / sizeof(ngsw[0])],	/* Number of protosw entries */
	NULL,					/* next domain in list */
	NULL,					/* rtattach() */
	0,					/* arg to rtattach in bits */
	0					/* maxrtkey */
};

/*
 * Handle loading and unloading for this node type
 * This is to handle auxiliary linkages (e.g protocol domain addition).
 */
static int
ngs_mod_event(module_t mod, int event, void *data)
{
	int error = 0;

	switch (event) {
	case MOD_LOAD:
		/* Register protocol domain */
		net_add_domain(&ngdomain);
		break;
	case MOD_UNLOAD:
		/* Insure there are no open netgraph sockets */
		if (!LIST_EMPTY(&ngsocklist)) {
			error = EBUSY;
			break;
		}
#ifdef NOTYET
		if ((LIST_EMPTY(&ngsocklist)) && (typestruct.refs == 0)) {
			/* Unregister protocol domain XXX can't do this yet.. */
			if ((error = net_rm_domain(&ngdomain)) != 0)
				break;
		} else
#endif
			error = EBUSY;
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return (error);
}

SYSCTL_INT(_net_graph, OID_AUTO, family, CTLFLAG_RD, 0, AF_NETGRAPH, "");
SYSCTL_NODE(_net_graph, OID_AUTO, data, CTLFLAG_RW, 0, "DATA");
SYSCTL_INT(_net_graph_data, OID_AUTO, proto, CTLFLAG_RD, 0, NG_DATA, "");
SYSCTL_NODE(_net_graph, OID_AUTO, control, CTLFLAG_RW, 0, "CONTROL");
SYSCTL_INT(_net_graph_control, OID_AUTO, proto, CTLFLAG_RD, 0, NG_CONTROL, "");

Index: head/sys/netsmb/smb_dev.c
===================================================================
--- head/sys/netsmb/smb_dev.c	(revision 89305)
+++ head/sys/netsmb/smb_dev.c	(revision 89306)
@@ -1,434 +1,445 @@
/*
 * Copyright (c) 2000-2001 Boris Popov
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    This product includes software developed by Boris Popov.
 * 4. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
   IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * NOTE(review): the targets of the following #include directives were
 * lost when this file was extracted (bare "#include" tokens remain);
 * recover them from the original revision before compiling.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include	/* Must come after sys/malloc.h */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* fetch the per-device softc hung off the dev_t */
#define SMB_GETDEV(dev)		((struct smb_dev*)(dev)->si_drv1)
/* look up the softc into local 'sdp', failing with ENXIO if absent */
#define	SMB_CHECKMINOR(dev)	do { \
				    sdp = SMB_GETDEV(dev); \
				    if (sdp == NULL) return ENXIO; \
				} while(0)

static d_open_t	 nsmb_dev_open;
static d_close_t nsmb_dev_close;
static d_read_t	 nsmb_dev_read;
static d_write_t nsmb_dev_write;
static d_ioctl_t nsmb_dev_ioctl;
static d_poll_t	 nsmb_dev_poll;

MODULE_DEPEND(netsmb, libiconv, 1, 1, 1);
MODULE_VERSION(netsmb, NSMB_VERSION);

static int smb_version = NSMB_VERSION;

SYSCTL_DECL(_net_smb);
SYSCTL_INT(_net_smb, OID_AUTO, version, CTLFLAG_RD, &smb_version, 0, "");

static MALLOC_DEFINE(M_NSMBDEV, "NETSMBDEV", "NET/SMB device");

/*
int smb_dev_queue(struct smb_dev *ndp, struct smb_rq *rqp, int prio);
*/

static struct cdevsw nsmb_cdevsw = {
	/* open */	nsmb_dev_open,
	/* close */	nsmb_dev_close,
	/* read */	nsmb_dev_read,
	/* write */	nsmb_dev_write,
	/* ioctl */	nsmb_dev_ioctl,
	/* poll */	nsmb_dev_poll,
	/* mmap */	nommap,
	/* strategy */	nostrategy,
	/* name */	NSMB_NAME,
	/* maj */	NSMB_MAJOR,
	/* dump */	nodump,
	/* psize */	nopsize,
	/* flags */	0,
#ifndef FB_CURRENT
	/* bmaj */	-1
#endif
};

static eventhandler_tag	 nsmb_dev_tag;

/*
 * dev_clone hook: create /dev/nsmbN on first open of a cloned unit.
 */
static void
nsmb_dev_clone(void *arg, char *name, int namelen, dev_t *dev)
{
	int min;

	if (*dev != NODEV)
		return;
	if (dev_stdclone(name, NULL, NSMB_NAME, &min) != 1)
		return;
	*dev = make_dev(&nsmb_cdevsw, min, 0, 0, 0600, NSMB_NAME"%d", min);
}

static int
nsmb_dev_open(dev_t dev, int oflags, int devtype, struct thread *td)
{
	struct smb_dev *sdp;
	struct proc *p = td->td_proc;
	struct ucred *cred = p->p_ucred;
	int s;

	sdp = SMB_GETDEV(dev);
	if (sdp && (sdp->sd_flags & NSMBFL_OPEN))
		return EBUSY;
	if (sdp == NULL) {
		sdp = malloc(sizeof(*sdp), M_NSMBDEV, M_WAITOK);
		dev->si_drv1 = (void*)sdp;
	}
	/*
	 * XXX: this is just crazy - make a device for an already passed device...
	 * someone should take care of it.
	 */
	if ((dev->si_flags & SI_NAMED) == 0)
		make_dev(&nsmb_cdevsw, minor(dev), cred->cr_uid, cred->cr_gid, 0700,
		    NSMB_NAME"%d", dev2unit(dev));
	bzero(sdp, sizeof(*sdp));
/*
	STAILQ_INIT(&sdp->sd_rqlist);
	STAILQ_INIT(&sdp->sd_rplist);
	bzero(&sdp->sd_pollinfo, sizeof(struct selinfo));
*/
	s = splimp();
	sdp->sd_level = -1;
	sdp->sd_flags |= NSMBFL_OPEN;
	splx(s);
	return 0;
}

static int
nsmb_dev_close(dev_t dev, int flag, int fmt, struct thread *td)
{
	struct smb_dev *sdp;
	struct smb_vc *vcp;
	struct smb_share *ssp;
	struct smb_cred scred;
	int s;

	SMB_CHECKMINOR(dev);
	s = splimp();
	if ((sdp->sd_flags & NSMBFL_OPEN) == 0) {
		splx(s);
		return EBADF;
	}
	smb_makescred(&scred, td, NULL);
	/* release any share/VC still attached to this descriptor */
	ssp = sdp->sd_share;
	if (ssp != NULL)
		smb_share_rele(ssp, &scred);
	vcp = sdp->sd_vc;
	if (vcp != NULL)
		smb_vc_rele(vcp, &scred);
/*
	smb_flushq(&sdp->sd_rqlist);
	smb_flushq(&sdp->sd_rplist);
*/
	dev->si_drv1 = NULL;
	free(sdp, M_NSMBDEV);
	destroy_dev(dev);
	splx(s);
	return 0;
}

static int
nsmb_dev_ioctl(dev_t dev, u_long cmd, caddr_t data, int flag, struct thread *td)
{
	struct smb_dev *sdp;
	struct smb_vc *vcp;
	struct smb_share *ssp;
	struct smb_cred scred;
	int error = 0;

	SMB_CHECKMINOR(dev);
	if ((sdp->sd_flags & NSMBFL_OPEN) == 0)
		return EBADF;

	smb_makescred(&scred, td, NULL);
	switch (cmd) {
	    case SMBIOC_OPENSESSION:
		if (sdp->sd_vc)
			return EISCONN;
		error = smb_usr_opensession((struct smbioc_ossn*)data,
		    &scred, &vcp);
		if (error)
			break;
		sdp->sd_vc = vcp;
		smb_vc_unlock(vcp, 0, td);
		sdp->sd_level = SMBL_VC;
		break;
	    case SMBIOC_OPENSHARE:
		if (sdp->sd_share)
			return EISCONN;
		if (sdp->sd_vc == NULL)
			return ENOTCONN;
		error = smb_usr_openshare(sdp->sd_vc,
		    (struct smbioc_oshare*)data, &scred, &ssp);
		if (error)
			break;
		sdp->sd_share = ssp;
		smb_share_unlock(ssp, 0, td);
		sdp->sd_level = SMBL_SHARE;
		break;
	    case SMBIOC_REQUEST:
		if (sdp->sd_share == NULL)
			return ENOTCONN;
		error = smb_usr_simplerequest(sdp->sd_share,
		    (struct smbioc_rq*)data, &scred);
		break;
	    case SMBIOC_T2RQ:
		if (sdp->sd_share == NULL)
			return ENOTCONN;
		error = smb_usr_t2request(sdp->sd_share,
		    (struct smbioc_t2rq*)data, &scred);
		break;
	    case SMBIOC_SETFLAGS: {
		struct smbioc_flags *fl = (struct smbioc_flags*)data;
		int on;

		if (fl->ioc_level == SMBL_VC) {
			if (fl->ioc_mask & SMBV_PERMANENT) {
				on = fl->ioc_flags & SMBV_PERMANENT;
				if ((vcp = sdp->sd_vc) == NULL)
					return ENOTCONN;
				error = smb_vc_get(vcp, LK_EXCLUSIVE, &scred);
				if (error)
					break;
				if (on && (vcp->obj.co_flags & SMBV_PERMANENT) == 0) {
					vcp->obj.co_flags |= SMBV_PERMANENT;
					smb_vc_ref(vcp);
				} else if (!on && (vcp->obj.co_flags & SMBV_PERMANENT)) {
					vcp->obj.co_flags &= ~SMBV_PERMANENT;
					smb_vc_rele(vcp, &scred);
				}
				smb_vc_put(vcp, &scred);
			} else
				error = EINVAL;
		} else if (fl->ioc_level == SMBL_SHARE) {
			if (fl->ioc_mask & SMBS_PERMANENT) {
				on = fl->ioc_flags & SMBS_PERMANENT;
				if ((ssp = sdp->sd_share) == NULL)
					return ENOTCONN;
				error = smb_share_get(ssp, LK_EXCLUSIVE, &scred);
				if (error)
					break;
				if (on && (ssp->obj.co_flags & SMBS_PERMANENT) == 0) {
					ssp->obj.co_flags |= SMBS_PERMANENT;
					smb_share_ref(ssp);
				} else if (!on && (ssp->obj.co_flags & SMBS_PERMANENT)) {
					ssp->obj.co_flags &= ~SMBS_PERMANENT;
					smb_share_rele(ssp, &scred);
				}
				smb_share_put(ssp, &scred);
			} else
				error = EINVAL;
			break;
		} else
			error = EINVAL;
		break;
	    }
	    case SMBIOC_LOOKUP:
		if (sdp->sd_vc || sdp->sd_share)
			return EISCONN;
		vcp = NULL;
		ssp = NULL;
		error = smb_usr_lookup((struct smbioc_lookup*)data, &scred,
		    &vcp, &ssp);
		if (error)
			break;
		if (vcp) {
			sdp->sd_vc = vcp;
			smb_vc_unlock(vcp, 0, td);
			sdp->sd_level = SMBL_VC;
		}
		if (ssp) {
			sdp->sd_share = ssp;
			smb_share_unlock(ssp, 0, td);
			sdp->sd_level = SMBL_SHARE;
		}
		break;
	    case SMBIOC_READ: case SMBIOC_WRITE: {
		struct smbioc_rw *rwrq = (struct smbioc_rw*)data;
		struct uio auio;
		struct iovec iov;

		if ((ssp = sdp->sd_share) == NULL)
			return ENOTCONN;
		iov.iov_base = rwrq->ioc_base;
		iov.iov_len = rwrq->ioc_cnt;
		auio.uio_iov = &iov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = rwrq->ioc_offset;
		auio.uio_resid = rwrq->ioc_cnt;
		auio.uio_segflg = UIO_USERSPACE;
		auio.uio_rw = (cmd == SMBIOC_READ) ? UIO_READ : UIO_WRITE;
		auio.uio_td = td;
		if (cmd == SMBIOC_READ)
			error = smb_read(ssp, rwrq->ioc_fh, &auio, &scred);
		else
			error = smb_write(ssp, rwrq->ioc_fh, &auio, &scred);
		rwrq->ioc_cnt -= auio.uio_resid;
		break;
	    }
	    default:
		error = ENODEV;
	}
	return error;
}

static int
nsmb_dev_read(dev_t dev, struct uio *uio, int flag)
{
	return EACCES;
}

static int
nsmb_dev_write(dev_t dev, struct uio *uio, int flag)
{
	return EACCES;
}

static int
nsmb_dev_poll(dev_t dev, int events, struct thread *td)
{
	return ENODEV;
}

static int
nsmb_dev_load(module_t mod, int cmd, void *arg)
{
	int error = 0;

	switch (cmd) {
	    case MOD_LOAD:
		error = smb_sm_init();
		if (error)
			break;
		error = smb_iod_init();
		if (error) {
			smb_sm_done();
			break;
		}
		cdevsw_add(&nsmb_cdevsw);
		nsmb_dev_tag = EVENTHANDLER_REGISTER(dev_clone,
		    nsmb_dev_clone, 0, 1000);
		printf("netsmb_dev: loaded\n");
		break;
	    case MOD_UNLOAD:
		smb_iod_done();
		error = smb_sm_done();
		/*
		 * NOTE(review): the smb_sm_done() result is discarded
		 * here — presumably deliberate; confirm.
		 */
		error = 0;
		EVENTHANDLER_DEREGISTER(dev_clone, nsmb_dev_tag);
		cdevsw_remove(&nsmb_cdevsw);
		printf("netsmb_dev: unloaded\n");
		break;
	    default:
		error = EINVAL;
		break;
	}
	return error;
}

DEV_MODULE (dev_netsmb, nsmb_dev_load, 0);

/*
 * Convert a file descriptor to appropriate smb_share pointer.
 *
 * NOTE(review): this is a unified-diff hunk; the '+' lines add
 * FILEDESC locking and an fhold() reference on the returned file.
 */
static struct file*
nsmb_getfp(struct filedesc* fdp, int fd, int flag)
{
	struct file* fp;

+	FILEDESC_LOCK(fdp);
	if (((u_int)fd) >= fdp->fd_nfiles ||
	    (fp = fdp->fd_ofiles[fd]) == NULL ||
-	    (fp->f_flag & flag) == 0)
+	    (fp->f_flag & flag) == 0) {
+		FILEDESC_UNLOCK(fdp);
		return (NULL);
+	}
+	fhold(fp);
+	FILEDESC_UNLOCK(fdp);
	return (fp);
}

/*
 * Diff hunk: every early return now drops the file reference taken
 * by nsmb_getfp() above.
 */
int
smb_dev2share(int fd, int mode, struct smb_cred *scred,
	struct smb_share **sspp)
{
	struct file *fp;
	struct vnode *vp;
	struct smb_dev *sdp;
	struct smb_share *ssp;
	dev_t dev;
	int error;

	fp = nsmb_getfp(scred->scr_td->td_proc->p_fd, fd, FREAD | FWRITE);
	if (fp == NULL)
		return EBADF;
	vp = (struct vnode*)fp->f_data;
-	if (vp == NULL)
+	if (vp == NULL) {
+		fdrop(fp, curthread);
		return EBADF;
+	}
	dev = vn_todev(vp);
-	if (dev == NODEV)
+	if (dev == NODEV) {
+		fdrop(fp, curthread);
		return EBADF;
+	}
	SMB_CHECKMINOR(dev);
	ssp = sdp->sd_share;
-	if (ssp == NULL)
+	if (ssp == NULL) {
+		fdrop(fp, curthread);
		return ENOTCONN;
+	}
	error = smb_share_get(ssp, LK_EXCLUSIVE, scred);
-	if (error)
-		return error;
-	*sspp = ssp;
-	return 0;
+	if (error == 0)
+		*sspp = ssp;
+	fdrop(fp, curthread);
+	return error;
}

Index: head/sys/sys/fcntl.h
===================================================================
--- head/sys/sys/fcntl.h	(revision 89305)
+++ head/sys/sys/fcntl.h	(revision 89306)
@@ -1,211 +1,212 @@
/*-
 * Copyright (c) 1983, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.
    Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)fcntl.h	8.3 (Berkeley) 1/21/94
 * $FreeBSD$
 */

#ifndef _SYS_FCNTL_H_
#define	_SYS_FCNTL_H_

/*
 * This file includes the definitions for open and fcntl
 * described by POSIX for <fcntl.h>; it also includes
 * related kernel definitions.
 */

#ifndef _KERNEL
/* NOTE(review): #include target lost in extraction — recover from original. */
#include
#endif

/*
 * File status flags: these are used by open(2), fcntl(2).
 * They are also used (indirectly) in the kernel file structure f_flags,
 * which is a superset of the open/fcntl flags.  Open flags and f_flags
 * are inter-convertible using OFLAGS(fflags) and FFLAGS(oflags).
 * Open/fcntl flags begin with O_; kernel-internal flags begin with F.
 */
/* open-only flags */
#define	O_RDONLY	0x0000		/* open for reading only */
#define	O_WRONLY	0x0001		/* open for writing only */
#define	O_RDWR		0x0002		/* open for reading and writing */
#define	O_ACCMODE	0x0003		/* mask for above modes */

/*
 * Kernel encoding of open mode; separate read and write bits that are
 * independently testable: 1 greater than the above.
 *
 * XXX
 * FREAD and FWRITE are excluded from the #ifdef _KERNEL so that TIOCFLUSH,
 * which was documented to use FREAD/FWRITE, continues to work.
 */
#ifndef _POSIX_SOURCE
#define	FREAD		0x0001
#define	FWRITE		0x0002
#endif
#define	O_NONBLOCK	0x0004		/* no delay */
#define	O_APPEND	0x0008		/* set append mode */
#ifndef _POSIX_SOURCE
#define	O_SHLOCK	0x0010		/* open with shared file lock */
#define	O_EXLOCK	0x0020		/* open with exclusive file lock */
#define	O_ASYNC		0x0040		/* signal pgrp when data ready */
#define	O_FSYNC		0x0080		/* synchronous writes */
#define	O_NOFOLLOW	0x0100		/* don't follow symlinks */
#endif
#define	O_CREAT		0x0200		/* create if nonexistent */
#define	O_TRUNC		0x0400		/* truncate to zero length */
#define	O_EXCL		0x0800		/* error if already exists */
#ifdef _KERNEL
/*
 * Diff hunk: FMARK/FDEFER move out of f_flag into the new f_gcflags
 * field, hence the renumbering to small values.
 */
-#define	FMARK		0x1000		/* mark during gc() */
-#define	FDEFER		0x2000		/* defer for next gc pass */
+/* FMARK/FDEFER kept in f_gcflags */
+#define	FMARK		0x1		/* mark during gc() */
+#define	FDEFER		0x2		/* defer for next gc pass */
#define	FHASLOCK	0x4000		/* descriptor holds advisory lock */
#endif

/* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
#define	O_NOCTTY	0x8000		/* don't assign controlling terminal */

/* Attempt to bypass buffer cache */
#define	O_DIRECT	0x00010000

#ifdef _KERNEL
/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
#define	FFLAGS(oflags)	((oflags) + 1)
#define	OFLAGS(fflags)	((fflags) - 1)

/* bits to save after open */
#define	FMASK	(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
/* bits settable by fcntl(F_SETFL, ...) */
#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
#endif

/*
 * The O_* flags used to have only F* names, which were used in the kernel
 * and by fcntl.  We retain the F* names for the kernel f_flag field
 * and for backward compatibility for fcntl.
 */
#ifndef _POSIX_SOURCE
#define	FAPPEND		O_APPEND	/* kernel/compat */
#define	FASYNC		O_ASYNC		/* kernel/compat */
#define	FFSYNC		O_FSYNC		/* kernel */
#define	FNONBLOCK	O_NONBLOCK	/* kernel */
#define	FNDELAY		O_NONBLOCK	/* compat */
#define	O_NDELAY	O_NONBLOCK	/* compat */
#endif

/*
 * We are out of bits in f_flag (which is a short).  However,
 * the flag bits not set in FMASK are only meaningful in the
 * initial open syscall.  Those bits can thus be given a
 * different meaning for fcntl(2).
 */
#ifndef _POSIX_SOURCE
/*
 * Set by shm_open(3) to get automatic MAP_ASYNC behavior
 * for POSIX shared memory objects (which are otherwise
 * implemented as plain files).
 */
#define	FPOSIXSHM	O_NOFOLLOW
#endif

/*
 * Constants used for fcntl(2)
 */

/* command values */
#define	F_DUPFD		0		/* duplicate file descriptor */
#define	F_GETFD		1		/* get file descriptor flags */
#define	F_SETFD		2		/* set file descriptor flags */
#define	F_GETFL		3		/* get file status flags */
#define	F_SETFL		4		/* set file status flags */
#ifndef _POSIX_SOURCE
#define	F_GETOWN	5		/* get SIGIO/SIGURG proc/pgrp */
#define	F_SETOWN	6		/* set SIGIO/SIGURG proc/pgrp */
#endif
#define	F_GETLK		7		/* get record locking information */
#define	F_SETLK		8		/* set record locking information */
#define	F_SETLKW	9		/* F_SETLK; wait if blocked */

/* file descriptor flags (F_GETFD, F_SETFD) */
#define	FD_CLOEXEC	1		/* close-on-exec flag */

/* record locking flags (F_GETLK, F_SETLK, F_SETLKW) */
#define	F_RDLCK		1		/* shared or read lock */
#define	F_UNLCK		2		/* unlock */
#define	F_WRLCK		3		/* exclusive or write lock */

#ifdef _KERNEL
#define	F_WAIT		0x010		/* Wait until lock is granted */
#define	F_FLOCK		0x020		/* Use flock(2) semantics for lock */
#define	F_POSIX		0x040		/* Use POSIX semantics for lock */
#endif

/*
 * Advisory file segment locking data type -
 * information passed to system by user
 */
struct flock {
	off_t	l_start;	/* starting offset */
	off_t	l_len;		/* len = 0 means until end of file */
	pid_t	l_pid;		/* lock owner */
	short	l_type;		/* lock type: read/write, etc. */
	short	l_whence;	/* type of l_start */
};

#ifndef _POSIX_SOURCE
/* lock operations for flock(2) */
#define	LOCK_SH		0x01		/* shared file lock */
#define	LOCK_EX		0x02		/* exclusive file lock */
#define	LOCK_NB		0x04		/* don't block when locking */
#define	LOCK_UN		0x08		/* unlock file */
#endif

#ifndef _KERNEL
/* NOTE(review): #include target lost in extraction — recover from original. */
#include

__BEGIN_DECLS
int	open __P((const char *, int, ...));
int	creat __P((const char *, mode_t));
int	fcntl __P((int, int, ...));
#ifndef _POSIX_SOURCE
int	flock __P((int, int));
#endif /* !_POSIX_SOURCE */
__END_DECLS
#endif

#endif /* !_SYS_FCNTL_H_ */

Index: head/sys/sys/file.h
===================================================================
--- head/sys/sys/file.h	(revision 89305)
+++ head/sys/sys/file.h	(revision 89306)
@@ -1,243 +1,272 @@
/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4.
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)file.h 8.3 (Berkeley) 1/9/95 * $FreeBSD$ */ #ifndef _SYS_FILE_H_ #define _SYS_FILE_H_ #ifndef _KERNEL #include #include #endif #ifdef _KERNEL +#include +#include +#include +#include #include struct stat; struct thread; struct uio; struct knote; struct vnode; struct socket; /* * Kernel descriptor table. * One entry for each open kernel vnode and socket. + * + * Below is the list of locks that protects members in struct file. 
+ * + * (fl) filelist_lock + * (f) f_mtx in struct file + * none not locked */ struct file { - LIST_ENTRY(file) f_list;/* list of active files */ - short f_FILLER3; /* (old f_flag) */ + LIST_ENTRY(file) f_list;/* (fl) list of active files */ + short f_gcflag; /* used by thread doing fd garbage collection */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ #define DTYPE_FIFO 4 /* fifo (named pipe) */ #define DTYPE_KQUEUE 5 /* event queue */ short f_type; /* descriptor type */ - int f_count; /* reference count */ - int f_msgcount; /* references from message queue */ + int f_count; /* (f) reference count */ + int f_msgcount; /* (f) references from message queue */ struct ucred *f_cred; /* credentials associated with descriptor */ struct fileops { int (*fo_read) __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); int (*fo_write) __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); #define FOF_OFFSET 1 int (*fo_ioctl) __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); int (*fo_poll) __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); int (*fo_kqfilter) __P((struct file *fp, struct knote *kn)); int (*fo_stat) __P((struct file *fp, struct stat *sb, struct thread *td)); int (*fo_close) __P((struct file *fp, struct thread *td)); } *f_ops; int f_seqcount; /* * count of sequential accesses -- cleared * by most seek operations. 
*/ off_t f_nextoff; /* * offset of next expected read or write */ off_t f_offset; caddr_t f_data; /* vnode or socket */ u_int f_flag; /* see fcntl.h */ + struct mtx f_mtx; /* mutex to protect data */ }; #ifdef MALLOC_DECLARE MALLOC_DECLARE(M_FILE); #endif LIST_HEAD(filelist, file); -extern struct filelist filehead; /* head of list of open files */ +extern struct filelist filehead; /* (fl) head of list of open files */ extern struct fileops vnops; extern struct fileops badfileops; extern int maxfiles; /* kernel limit on number of open files */ extern int maxfilesperproc; /* per process limit on number of open files */ -extern int nfiles; /* actual number of open files */ +extern int nfiles; /* (fl) actual number of open files */ +extern struct sx filelist_lock; /* sx to protect filelist and nfiles */ -static __inline void fhold __P((struct file *fp)); +static __inline struct file * fhold __P((struct file *fp)); +static __inline struct file * fhold_locked __P((struct file *fp)); int fget __P((struct thread *td, int fd, struct file **fpp)); int fget_read __P((struct thread *td, int fd, struct file **fpp)); int fget_write __P((struct thread *td, int fd, struct file **fpp)); int fdrop __P((struct file *fp, struct thread *td)); +int fdrop_locked __P((struct file *fp, struct thread *td)); +/* Lock a file. 
*/ +/*#define FILE_LOCK_DEBUG*/ +#ifdef FILE_LOCK_DEBUG +#define FILE_LOCK(f) \ + do { \ + printf("FLCK: %p %s %d\n", &(f)->f_mtx, __FILE__, __LINE__); \ + mtx_lock(&(f)->f_mtx); \ + } while (0) +#define FILE_UNLOCK(f) \ + do { \ + printf("FREL: %p %s %d\n", &(f)->f_mtx, __FILE__, __LINE__); \ + mtx_unlock(&(f)->f_mtx); \ + } while (0) +#else +#define FILE_LOCK(f) mtx_lock(&(f)->f_mtx) +#define FILE_UNLOCK(f) mtx_unlock(&(f)->f_mtx) +#endif +#define FILE_LOCKED(f) mtx_owned(&(f)->f_mtx) +#define FILE_LOCK_ASSERT(f, type) mtx_assert(&(f)->f_mtx, (type)) + int fgetvp __P((struct thread *td, int fd, struct vnode **vpp)); int fgetvp_read __P((struct thread *td, int fd, struct vnode **vpp)); int fgetvp_write __P((struct thread *td, int fd, struct vnode **vpp)); int fgetsock __P((struct thread *td, int fd, struct socket **spp, u_int *fflagp)); void fputsock __P((struct socket *sp)); -static __inline void -fhold(fp) +static __inline struct file * +fhold_locked(fp) struct file *fp; { +#ifdef INVARIANTS + FILE_LOCK_ASSERT(fp, MA_OWNED); +#endif fp->f_count++; + return (fp); } +static __inline struct file * +fhold(fp) + struct file *fp; +{ + + FILE_LOCK(fp); + fhold_locked(fp); + FILE_UNLOCK(fp); + return (fp); +} + static __inline int fo_read __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static __inline int fo_write __P((struct file *fp, struct uio *uio, struct ucred *cred, int flags, struct thread *td)); static __inline int fo_ioctl __P((struct file *fp, u_long com, caddr_t data, struct thread *td)); static __inline int fo_poll __P((struct file *fp, int events, struct ucred *cred, struct thread *td)); static __inline int fo_stat __P((struct file *fp, struct stat *sb, struct thread *td)); static __inline int fo_close __P((struct file *fp, struct thread *td)); static __inline int fo_kqfilter __P((struct file *fp, struct knote *kn)); +struct proc; +struct file *ffind_hold(struct thread *, int fd); +struct file *ffind_lock(struct 
thread *, int fd); static __inline int fo_read(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { - int error; - fhold(fp); - error = (*fp->f_ops->fo_read)(fp, uio, cred, flags, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_read)(fp, uio, cred, flags, td)); } static __inline int fo_write(fp, uio, cred, flags, td) struct file *fp; struct uio *uio; struct ucred *cred; struct thread *td; int flags; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_write)(fp, uio, cred, flags, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_write)(fp, uio, cred, flags, td)); } static __inline int fo_ioctl(fp, com, data, td) struct file *fp; u_long com; caddr_t data; struct thread *td; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_ioctl)(fp, com, data, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_ioctl)(fp, com, data, td)); } static __inline int fo_poll(fp, events, cred, td) struct file *fp; int events; struct ucred *cred; struct thread *td; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_poll)(fp, events, cred, td); - fdrop(fp, td); - return (error); + /* select(2) and poll(2) hold file descriptors. 
*/ + return ((*fp->f_ops->fo_poll)(fp, events, cred, td)); } static __inline int fo_stat(fp, sb, td) struct file *fp; struct stat *sb; struct thread *td; { - int error; - - fhold(fp); - error = (*fp->f_ops->fo_stat)(fp, sb, td); - fdrop(fp, td); - return (error); + return ((*fp->f_ops->fo_stat)(fp, sb, td)); } static __inline int fo_close(fp, td) struct file *fp; struct thread *td; { return ((*fp->f_ops->fo_close)(fp, td)); } static __inline int fo_kqfilter(fp, kn) struct file *fp; struct knote *kn; { return ((*fp->f_ops->fo_kqfilter)(fp, kn)); } #endif /* _KERNEL */ #endif /* !SYS_FILE_H */ Index: head/sys/sys/filedesc.h =================================================================== --- head/sys/sys/filedesc.h (revision 89305) +++ head/sys/sys/filedesc.h (revision 89306) @@ -1,150 +1,175 @@ /* * Copyright (c) 1990, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)filedesc.h 8.1 (Berkeley) 6/2/93 * $FreeBSD$ */ #ifndef _SYS_FILEDESC_H_ #define _SYS_FILEDESC_H_ +#include +#include #include +#include /* * This structure is used for the management of descriptors. It may be * shared by multiple processes. * * A process is initially started out with NDFILE descriptors stored within * this structure, selected to be enough for typical applications based on * the historical limit of 20 open files (and the usage of descriptors by * shells). If these descriptors are exhausted, a larger descriptor table * may be allocated, up to a process' resource limit; the internal arrays * are then unused. The initial expansion is set to NDEXTENT; each time * it runs out, it is doubled until the resource limit is reached. NDEXTENT * should be selected to be the biggest multiple of OFILESIZE (see below) * that will fit in a power-of-two sized piece of memory. */ #define NDFILE 20 #define NDEXTENT 50 /* 250 bytes in 256-byte alloc. 
*/ struct filedesc { struct file **fd_ofiles; /* file structures for open files */ char *fd_ofileflags; /* per-process open file flags */ struct vnode *fd_cdir; /* current directory */ struct vnode *fd_rdir; /* root directory */ struct vnode *fd_jdir; /* jail root directory */ int fd_nfiles; /* number of open files allocated */ - u_short fd_lastfile; /* high-water mark of fd_ofiles */ - u_short fd_freefile; /* approx. next free file */ + int fd_lastfile; /* high-water mark of fd_ofiles */ + int fd_freefile; /* approx. next free file */ u_short fd_cmask; /* mask for file creation */ u_short fd_refcnt; /* reference count */ int fd_knlistsize; /* size of knlist */ struct klist *fd_knlist; /* list of attached knotes */ u_long fd_knhashmask; /* size of knhash */ struct klist *fd_knhash; /* hash table for attached knotes */ + struct mtx fd_mtx; /* mtx to protect the members of struct filedesc */ }; /* * Basic allocation of descriptors: * one of the above, plus arrays for NDFILE descriptors. */ struct filedesc0 { struct filedesc fd_fd; /* * These arrays are used when the number of open files is * <= NDFILE, and are then pointed to by the pointers above. */ struct file *fd_dfiles[NDFILE]; char fd_dfileflags[NDFILE]; }; /* * Per-process open flags. */ #define UF_EXCLOSE 0x01 /* auto-close on exec */ #if 0 #define UF_MAPPED 0x02 /* mapped from device */ #endif /* * Storage required per open file descriptor. */ #define OFILESIZE (sizeof(struct file *) + sizeof(char)) /* * This structure holds the information needed to send a SIGIO or * a SIGURG signal to a process or process group when new data arrives * on a device or socket. The structure is placed on an SLIST belonging * to the proc or pgrp so that the entire list may be revoked when the * process exits or the process group disappears. */ struct sigio { union { struct proc *siu_proc; /* process to receive SIGIO/SIGURG */ struct pgrp *siu_pgrp; /* process group to receive ... 
*/ } sio_u; SLIST_ENTRY(sigio) sio_pgsigio; /* sigio's for process or group */ struct sigio **sio_myref; /* location of the pointer that holds * the reference to this structure */ struct ucred *sio_ucred; /* current credentials */ pid_t sio_pgid; /* pgid for signals */ }; #define sio_proc sio_u.siu_proc #define sio_pgrp sio_u.siu_pgrp SLIST_HEAD(sigiolst, sigio); #ifdef _KERNEL + +/* Lock a file descriptor table. */ +/*#define FILEDESC_LOCK_DEBUG*/ +#ifdef FILEDESC_LOCK_DEBUG +#define FILEDESC_LOCK(fd) \ + do { \ + printf("FD_LCK: %p %s %d\n", &(fd)->fd_mtx, __FILE__, __LINE__); \ + mtx_lock(&(fd)->fd_mtx); \ + } while (0) +#define FILEDESC_UNLOCK(fd) \ + do { \ + printf("FD_REL: %p %s %d\n", &(fd)->fd_mtx, __FILE__, __LINE__); \ + mtx_unlock(&(fd)->fd_mtx); \ + } while (0) +#else +#define FILEDESC_LOCK(fd) mtx_lock(&(fd)->fd_mtx) +#define FILEDESC_UNLOCK(fd) mtx_unlock(&(fd)->fd_mtx) +#endif +#define FILEDESC_LOCKED(fd) mtx_owned(&(fd)->fd_mtx) +#define FILEDESC_LOCK_ASSERT(fd, type) mtx_assert(&(fd)->fd_mtx, (type)) + int closef __P((struct file *fp, struct thread *p)); int dupfdopen __P((struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)); int falloc __P((struct thread *p, struct file **resultfp, int *resultfd)); int fdalloc __P((struct thread *p, int want, int *result)); int fdavail __P((struct thread *td, int n)); void fdcloseexec __P((struct thread *td)); struct filedesc *fdcopy __P((struct thread *td)); void fdfree __P((struct thread *td)); struct filedesc *fdinit __P((struct thread *td)); struct filedesc *fdshare __P((struct proc *p)); void ffree __P((struct file *fp)); pid_t fgetown __P((struct sigio *sigio)); int fsetown __P((pid_t pgid, struct sigio **sigiop)); void funsetown __P((struct sigio *sigio)); void funsetownlst __P((struct sigiolst *sigiolst)); struct file *holdfp __P((struct filedesc *fdp, int fd, int flag)); int getvnode __P((struct filedesc *fdp, int fd, struct file **fpp)); void setugidsafety __P((struct thread 
*td)); #endif /* _KERNEL */ #endif /* !_SYS_FILEDESC_H_ */ Index: head/sys/ufs/ffs/ffs_alloc.c =================================================================== --- head/sys/ufs/ffs/ffs_alloc.c (revision 89305) +++ head/sys/ufs/ffs/ffs_alloc.c (revision 89306) @@ -1,2046 +1,2049 @@ /* * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95 * $FreeBSD$ */ #include "opt_quota.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef ufs_daddr_t allocfcn_t __P((struct inode *ip, int cg, ufs_daddr_t bpref, int size)); static ufs_daddr_t ffs_alloccg __P((struct inode *, int, ufs_daddr_t, int)); static ufs_daddr_t ffs_alloccgblk __P((struct inode *, struct buf *, ufs_daddr_t)); #ifdef DIAGNOSTIC static int ffs_checkblk __P((struct inode *, ufs_daddr_t, long)); #endif static ufs_daddr_t ffs_clusteralloc __P((struct inode *, int, ufs_daddr_t, int)); static ino_t ffs_dirpref __P((struct inode *)); static ufs_daddr_t ffs_fragextend __P((struct inode *, int, long, int, int)); static void ffs_fserr __P((struct fs *, u_int, char *)); static u_long ffs_hashalloc __P((struct inode *, int, long, int, allocfcn_t *)); static ino_t ffs_nodealloccg __P((struct inode *, int, ufs_daddr_t, int)); static ufs_daddr_t ffs_mapsearch __P((struct fs *, struct cg *, ufs_daddr_t, int)); /* * Allocate a block in the file system. * * The size of the requested block is given, which must be some * multiple of fs_fsize and <= fs_bsize. * A preference may be optionally specified. If a preference is given * the following hierarchy is used to allocate a block: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. 
* 3) allocate a block in the same cylinder group. * 4) quadradically rehash into other cylinder groups, until an * available block is located. * If no block preference is given the following heirarchy is used * to allocate a block: * 1) allocate a block in the cylinder group that contains the * inode for the file. * 2) quadradically rehash into other cylinder groups, until an * available block is located. */ int ffs_alloc(ip, lbn, bpref, size, cred, bnp) register struct inode *ip; ufs_daddr_t lbn, bpref; int size; struct ucred *cred; ufs_daddr_t *bnp; { register struct fs *fs; ufs_daddr_t bno; int cg; #ifdef QUOTA int error; #endif *bnp = 0; fs = ip->i_fs; #ifdef DIAGNOSTIC if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("dev = %s, bsize = %ld, size = %d, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_alloc: bad size"); } if (cred == NOCRED) panic("ffs_alloc: missing credential"); #endif /* DIAGNOSTIC */ if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0) goto nospace; if (suser_xxx(cred, NULL, PRISON_ROOT) && freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0) goto nospace; #ifdef QUOTA error = chkdq(ip, (long)btodb(size), cred, 0); if (error) return (error); #endif if (bpref >= fs->fs_size) bpref = 0; if (bpref == 0) cg = ino_to_cg(fs, ip->i_number); else cg = dtog(fs, bpref); bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, size, ffs_alloccg); if (bno > 0) { ip->i_blocks += btodb(size); ip->i_flag |= IN_CHANGE | IN_UPDATE; *bnp = bno; return (0); } #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, (long)-btodb(size), cred, FORCE); #endif nospace: ffs_fserr(fs, cred->cr_uid, "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } /* * Reallocate a fragment to a bigger size * * The number and size of the old block is given, and a preference * and new size is also specified. 
The allocator attempts to extend * the original block. Failing that, the regular block allocator is * invoked to get an appropriate block. */ int ffs_realloccg(ip, lbprev, bpref, osize, nsize, cred, bpp) register struct inode *ip; ufs_daddr_t lbprev; ufs_daddr_t bpref; int osize, nsize; struct ucred *cred; struct buf **bpp; { register struct fs *fs; struct buf *bp; int cg, request, error; ufs_daddr_t bprev, bno; *bpp = 0; fs = ip->i_fs; #ifdef DIAGNOSTIC if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_realloccg: allocation on suspended filesystem"); if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 || (u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) { printf( "dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_bsize, osize, nsize, fs->fs_fsmnt); panic("ffs_realloccg: bad size"); } if (cred == NOCRED) panic("ffs_realloccg: missing credential"); #endif /* DIAGNOSTIC */ if (suser_xxx(cred, NULL, PRISON_ROOT) && freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) goto nospace; if ((bprev = ip->i_db[lbprev]) == 0) { printf("dev = %s, bsize = %ld, bprev = %ld, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_bsize, (long)bprev, fs->fs_fsmnt); panic("ffs_realloccg: bad bprev"); } /* * Allocate the extra space in the buffer. */ error = bread(ITOV(ip), lbprev, osize, NOCRED, &bp); if (error) { brelse(bp); return (error); } if( bp->b_blkno == bp->b_lblkno) { if( lbprev >= NDADDR) panic("ffs_realloccg: lbprev out of range"); bp->b_blkno = fsbtodb(fs, bprev); } #ifdef QUOTA error = chkdq(ip, (long)btodb(nsize - osize), cred, 0); if (error) { brelse(bp); return (error); } #endif /* * Check for extension in the existing location. 
*/ cg = dtog(fs, bprev); bno = ffs_fragextend(ip, cg, (long)bprev, osize, nsize); if (bno) { if (bp->b_blkno != fsbtodb(fs, bno)) panic("ffs_realloccg: bad blockno"); ip->i_blocks += btodb(nsize - osize); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; bzero((char *)bp->b_data + osize, (u_int)nsize - osize); *bpp = bp; return (0); } /* * Allocate a new disk location. */ if (bpref >= fs->fs_size) bpref = 0; switch ((int)fs->fs_optim) { case FS_OPTSPACE: /* * Allocate an exact sized fragment. Although this makes * best use of space, we will waste time relocating it if * the file continues to grow. If the fragmentation is * less than half of the minimum free reserve, we choose * to begin optimizing for time. */ request = nsize; if (fs->fs_minfree <= 5 || fs->fs_cstotal.cs_nffree > (off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100)) break; log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTTIME; break; case FS_OPTTIME: /* * At this point we have discovered a file that is trying to * grow a small fragment to a larger fragment. To save time, * we allocate a full sized block, then free the unused portion. * If the file continues to grow, the `ffs_fragextend' call * above will be able to grow it in place without further * copying. If aberrant programs cause disk fragmentation to * grow within 2% of the free reserve, we choose to begin * optimizing for space. 
*/ request = fs->fs_bsize; if (fs->fs_cstotal.cs_nffree < (off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100) break; log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n", fs->fs_fsmnt); fs->fs_optim = FS_OPTSPACE; break; default: printf("dev = %s, optim = %ld, fs = %s\n", devtoname(ip->i_dev), (long)fs->fs_optim, fs->fs_fsmnt); panic("ffs_realloccg: bad optim"); /* NOTREACHED */ } bno = (ufs_daddr_t)ffs_hashalloc(ip, cg, (long)bpref, request, ffs_alloccg); if (bno > 0) { bp->b_blkno = fsbtodb(fs, bno); if (!DOINGSOFTDEP(ITOV(ip))) ffs_blkfree(ip, bprev, (long)osize); if (nsize < request) ffs_blkfree(ip, bno + numfrags(fs, nsize), (long)(request - nsize)); ip->i_blocks += btodb(nsize - osize); ip->i_flag |= IN_CHANGE | IN_UPDATE; allocbuf(bp, nsize); bp->b_flags |= B_DONE; bzero((char *)bp->b_data + osize, (u_int)nsize - osize); *bpp = bp; return (0); } #ifdef QUOTA /* * Restore user's disk quota because allocation failed. */ (void) chkdq(ip, (long)-btodb(nsize - osize), cred, FORCE); #endif brelse(bp); nospace: /* * no space available */ ffs_fserr(fs, cred->cr_uid, "file system full"); uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt); return (ENOSPC); } /* * Reallocate a sequence of blocks into a contiguous sequence of blocks. * * The vnode and an array of buffer pointers for a range of sequential * logical blocks to be made contiguous is given. The allocator attempts * to find a range of sequential blocks starting as close as possible to * an fs_rotdelay offset from the end of the allocation for the logical * block immediately preceding the current range. If successful, the * physical block numbers in the buffer pointers and in the inode are * changed to reflect the new allocation. If unsuccessful, the allocation * is left unchanged. The success in doing the reallocation is returned. * Note that the error return is not reflected back to the user. Rather * the previous block allocation will be used. 
*/ SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem"); static int doasyncfree = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0, ""); static int doreallocblks = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); #ifdef DEBUG static volatile int prtrealloc = 0; #endif int ffs_reallocblks(ap) struct vop_reallocblks_args /* { struct vnode *a_vp; struct cluster_save *a_buflist; } */ *ap; { struct fs *fs; struct inode *ip; struct vnode *vp; struct buf *sbp, *ebp; ufs_daddr_t *bap, *sbap, *ebap = 0; struct cluster_save *buflist; ufs_daddr_t start_lbn, end_lbn, soff, newblk, blkno; struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp; int i, len, start_lvl, end_lvl, pref, ssize; if (doreallocblks == 0) return (ENOSPC); vp = ap->a_vp; ip = VTOI(vp); fs = ip->i_fs; if (fs->fs_contigsumsize <= 0) return (ENOSPC); buflist = ap->a_buflist; len = buflist->bs_nchildren; start_lbn = buflist->bs_children[0]->b_lblkno; end_lbn = start_lbn + len - 1; #ifdef DIAGNOSTIC for (i = 0; i < len; i++) if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 1"); for (i = 1; i < len; i++) if (buflist->bs_children[i]->b_lblkno != start_lbn + i) panic("ffs_reallocblks: non-logical cluster"); blkno = buflist->bs_children[0]->b_blkno; ssize = fsbtodb(fs, fs->fs_frag); for (i = 1; i < len - 1; i++) if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize)) panic("ffs_reallocblks: non-physical cluster %d", i); #endif /* * If the latest allocation is in a new cylinder group, assume that * the filesystem has decided to move and do not force it back to * the previous cylinder group. 
*/ if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) != dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno))) return (ENOSPC); if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) || ufs_getlbns(vp, end_lbn, end_ap, &end_lvl)) return (ENOSPC); /* * Get the starting offset and block map for the first block. */ if (start_lvl == 0) { sbap = &ip->i_db[0]; soff = start_lbn; } else { idp = &start_ap[start_lvl - 1]; if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) { brelse(sbp); return (ENOSPC); } sbap = (ufs_daddr_t *)sbp->b_data; soff = idp->in_off; } /* * Find the preferred location for the cluster. */ pref = ffs_blkpref(ip, start_lbn, soff, sbap); /* * If the block range spans two block maps, get the second map. */ if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) { ssize = len; } else { #ifdef DIAGNOSTIC if (start_ap[start_lvl-1].in_lbn == idp->in_lbn) panic("ffs_reallocblk: start == end"); #endif ssize = len - (idp->in_off + 1); if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp)) goto fail; ebap = (ufs_daddr_t *)ebp->b_data; } /* * Search the block map looking for an allocation of the desired size. */ if ((newblk = (ufs_daddr_t)ffs_hashalloc(ip, dtog(fs, pref), (long)pref, len, ffs_clusteralloc)) == 0) goto fail; /* * We have found a new contiguous block. * * First we have to replace the old block pointers with the new * block pointers in the inode and indirect blocks associated * with the file. 
*/ #ifdef DEBUG if (prtrealloc) printf("realloc: ino %d, lbns %d-%d\n\told:", ip->i_number, start_lbn, end_lbn); #endif blkno = newblk; for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) { if (i == ssize) { bap = ebap; soff = -i; } #ifdef DIAGNOSTIC if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 2"); if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap) panic("ffs_reallocblks: alloc mismatch"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", *bap); #endif if (DOINGSOFTDEP(vp)) { if (sbap == &ip->i_db[0] && i < ssize) softdep_setup_allocdirect(ip, start_lbn + i, blkno, *bap, fs->fs_bsize, fs->fs_bsize, buflist->bs_children[i]); else softdep_setup_allocindir_page(ip, start_lbn + i, i < ssize ? sbp : ebp, soff + i, blkno, *bap, buflist->bs_children[i]); } *bap++ = blkno; } /* * Next we must write out the modified inode and indirect blocks. * For strict correctness, the writes should be synchronous since * the old block values may have been written to disk. In practise * they are almost never written, but if we are concerned about * strict correctness, the `doasyncfree' flag should be set to zero. * * The test on `doasyncfree' should be changed to test a flag * that shows whether the associated buffers and inodes have * been written. The flag should be set when the cluster is * started and cleared whenever the buffer or inode is flushed. * We can then check below to see if it is set, and do the * synchronous write only when it has been cleared. */ if (sbap != &ip->i_db[0]) { if (doasyncfree) bdwrite(sbp); else bwrite(sbp); } else { ip->i_flag |= IN_CHANGE | IN_UPDATE; if (!doasyncfree) UFS_UPDATE(vp, 1); } if (ssize < len) { if (doasyncfree) bdwrite(ebp); else bwrite(ebp); } /* * Last, free the old blocks and assign the new blocks to the buffers. 
*/ #ifdef DEBUG if (prtrealloc) printf("\n\tnew:"); #endif for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) { if (!DOINGSOFTDEP(vp)) ffs_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize); buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno); #ifdef DIAGNOSTIC if (!ffs_checkblk(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize)) panic("ffs_reallocblks: unallocated block 3"); #endif #ifdef DEBUG if (prtrealloc) printf(" %d,", blkno); #endif } #ifdef DEBUG if (prtrealloc) { prtrealloc--; printf("\n"); } #endif return (0); fail: if (ssize < len) brelse(ebp); if (sbap != &ip->i_db[0]) brelse(sbp); return (ENOSPC); } /* * Allocate an inode in the file system. * * If allocating a directory, use ffs_dirpref to select the inode. * If allocating in a directory, the following hierarchy is followed: * 1) allocate the preferred inode. * 2) allocate an inode in the same cylinder group. * 3) quadradically rehash into other cylinder groups, until an * available inode is located. * If no inode preference is given the following heirarchy is used * to allocate an inode: * 1) allocate an inode in cylinder group 0. * 2) quadradically rehash into other cylinder groups, until an * available inode is located. */ int ffs_valloc(pvp, mode, cred, vpp) struct vnode *pvp; int mode; struct ucred *cred; struct vnode **vpp; { register struct inode *pip; register struct fs *fs; register struct inode *ip; ino_t ino, ipref; int cg, error; *vpp = NULL; pip = VTOI(pvp); fs = pip->i_fs; if (fs->fs_cstotal.cs_nifree == 0) goto noinodes; if ((mode & IFMT) == IFDIR) ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = ino_to_cg(fs, ipref); /* * Track number of dirs created one after another * in a same cg without intervening by files. 
*/ if ((mode & IFMT) == IFDIR) { if (fs->fs_contigdirs[cg] < 255) fs->fs_contigdirs[cg]++; } else { if (fs->fs_contigdirs[cg] > 0) fs->fs_contigdirs[cg]--; } ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) goto noinodes; error = VFS_VGET(pvp->v_mount, ino, vpp); if (error) { UFS_VFREE(pvp, ino, mode); return (error); } ip = VTOI(*vpp); if (ip->i_mode) { printf("mode = 0%o, inum = %lu, fs = %s\n", ip->i_mode, (u_long)ip->i_number, fs->fs_fsmnt); panic("ffs_valloc: dup alloc"); } if (ip->i_blocks && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */ printf("free inode %s/%lu had %ld blocks\n", fs->fs_fsmnt, (u_long)ino, (long)ip->i_blocks); ip->i_blocks = 0; } ip->i_flags = 0; /* * Set up a new generation number for this inode. */ if (ip->i_gen == 0 || ++ip->i_gen == 0) ip->i_gen = random() / 2 + 1; return (0); noinodes: ffs_fserr(fs, cred->cr_uid, "out of inodes"); uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt); return (ENOSPC); } /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ static ino_t ffs_dirpref(pip) struct inode *pip; { register struct fs *fs; int cg, prefcg, dirsize, cgsize; int avgifree, avgbfree, avgndir, curdirsize; int minifree, minbfree, maxndir; int mincg, minndir; int maxcontigdirs; fs = pip->i_fs; avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; /* * Force allocation in another cg if creating a first level dir. 
*/ if (ITOV(pip)->v_flag & VROOT) { prefcg = arc4random() % fs->fs_ncg; mincg = prefcg; minndir = fs->fs_ipg; for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < minndir && fs->fs_cs(fs, cg).cs_nifree >= avgifree && fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { mincg = cg; minndir = fs->fs_cs(fs, cg).cs_ndir; } return ((ino_t)(fs->fs_ipg * mincg)); } /* * Count various limits which used for * optimal allocation of a directory inode. */ maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); minifree = avgifree - fs->fs_ipg / 4; if (minifree < 0) minifree = 0; minbfree = avgbfree - fs->fs_fpg / fs->fs_frag / 4; if (minbfree < 0) minbfree = 0; cgsize = fs->fs_fsize * fs->fs_fpg; dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; if (dirsize < curdirsize) dirsize = curdirsize; maxcontigdirs = min(cgsize / dirsize, 255); if (fs->fs_avgfpdir > 0) maxcontigdirs = min(maxcontigdirs, fs->fs_ipg / fs->fs_avgfpdir); if (maxcontigdirs == 0) maxcontigdirs = 1; /* * Limit number of dirs in one cg and reserve space for * regular files, but only if we have no deficit in * inodes or space. */ prefcg = ino_to_cg(fs, pip->i_number); for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_ndir < maxndir && fs->fs_cs(fs, cg).cs_nifree >= minifree && fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { if (fs->fs_contigdirs[cg] < maxcontigdirs) return ((ino_t)(fs->fs_ipg * cg)); } /* * This is a backstop when we have deficit in space. 
*/ for (cg = prefcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) return ((ino_t)(fs->fs_ipg * cg)); for (cg = 0; cg < prefcg; cg++) if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) break; return ((ino_t)(fs->fs_ipg * cg)); } /* * Select the desired position for the next block in a file. The file is * logically divided into sections. The first section is composed of the * direct blocks. Each additional section contains fs_maxbpg blocks. * * If no blocks have been allocated in the first section, the policy is to * request a block in the same cylinder group as the inode that describes * the file. If no blocks have been allocated in any other section, the * policy is to place the section in a cylinder group with a greater than * average number of free blocks. An appropriate cylinder group is found * by using a rotor that sweeps the cylinder groups. When a new group of * blocks is needed, the sweep begins in the cylinder group following the * cylinder group from which the previous allocation was made. The sweep * continues until a cylinder group with greater than the average number * of free blocks is found. If the allocation is for the first block in an * indirect block, the information on the previous allocation is unavailable; * here a best guess is made based upon the logical block number being * allocated. * * If a section is already partially allocated, the policy is to * contiguously allocate fs_maxcontig blocks. The end of one of these * contiguous blocks and the beginning of the next is physically separated * so that the disk head will be in transit between them for at least * fs_rotdelay milliseconds. This is to allow time for the processor to * schedule another I/O transfer. 
*/ ufs_daddr_t ffs_blkpref(ip, lbn, indx, bap) struct inode *ip; ufs_daddr_t lbn; int indx; ufs_daddr_t *bap; { register struct fs *fs; register int cg; int avgbfree, startcg; ufs_daddr_t nextblk; fs = ip->i_fs; if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) { if (lbn < NDADDR + NINDIR(fs)) { cg = ino_to_cg(fs, ip->i_number); return (fs->fs_fpg * cg + fs->fs_frag); } /* * Find a cylinder with greater than average number of * unused data blocks. */ if (indx == 0 || bap[indx - 1] == 0) startcg = ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg; else startcg = dtog(fs, bap[indx - 1]) + 1; startcg %= fs->fs_ncg; avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; for (cg = startcg; cg < fs->fs_ncg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (fs->fs_fpg * cg + fs->fs_frag); } for (cg = 0; cg <= startcg; cg++) if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { fs->fs_cgrotor = cg; return (fs->fs_fpg * cg + fs->fs_frag); } return (0); } /* * One or more previous blocks have been laid out. If less * than fs_maxcontig previous blocks are contiguous, the * next block is requested contiguously, otherwise it is * requested rotationally delayed by fs_rotdelay milliseconds. */ nextblk = bap[indx - 1] + fs->fs_frag; if (fs->fs_rotdelay == 0 || indx < fs->fs_maxcontig || bap[indx - fs->fs_maxcontig] + blkstofrags(fs, fs->fs_maxcontig) != nextblk) return (nextblk); /* * Here we convert ms of delay to frags as: * (frags) = (ms) * (rev/sec) * (sect/rev) / * ((sect/frag) * (ms/sec)) * then round up to the next block. */ nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect / (NSPF(fs) * 1000), fs->fs_frag); return (nextblk); } /* * Implement the cylinder overflow algorithm. * * The policy implemented by this algorithm is: * 1) allocate the block in its requested cylinder group. * 2) quadradically rehash on the cylinder group number. * 3) brute force search for a free block. 
*/ /*VARARGS5*/ static u_long ffs_hashalloc(ip, cg, pref, size, allocator) struct inode *ip; int cg; long pref; int size; /* size for data blocks, mode for inodes */ allocfcn_t *allocator; { register struct fs *fs; long result; /* XXX why not same type as we return? */ int i, icg = cg; #ifdef DIAGNOSTIC if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED) panic("ffs_hashalloc: allocation on suspended filesystem"); #endif fs = ip->i_fs; /* * 1: preferred cylinder group */ result = (*allocator)(ip, cg, pref, size); if (result) return (result); /* * 2: quadratic rehash */ for (i = 1; i < fs->fs_ncg; i *= 2) { cg += i; if (cg >= fs->fs_ncg) cg -= fs->fs_ncg; result = (*allocator)(ip, cg, 0, size); if (result) return (result); } /* * 3: brute force search * Note that we start at i == 2, since 0 was checked initially, * and 1 is always checked in the quadratic rehash. */ cg = (icg + 2) % fs->fs_ncg; for (i = 2; i < fs->fs_ncg; i++) { result = (*allocator)(ip, cg, 0, size); if (result) return (result); cg++; if (cg == fs->fs_ncg) cg = 0; } return (0); } /* * Determine whether a fragment can be extended. * * Check to see if the necessary fragments are available, and * if they are, allocate them. 
*/ static ufs_daddr_t ffs_fragextend(ip, cg, bprev, osize, nsize) struct inode *ip; int cg; long bprev; int osize, nsize; { register struct fs *fs; register struct cg *cgp; struct buf *bp; long bno; int frags, bbase; int i, error; u_int8_t *blksfree; fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize)) return (0); frags = numfrags(fs, nsize); bbase = fragnum(fs, bprev); if (bbase > fragnum(fs, (bprev + frags - 1))) { /* cannot extend across a block boundary */ return (0); } error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; bno = dtogd(fs, bprev); blksfree = cg_blksfree(cgp); for (i = numfrags(fs, osize); i < frags; i++) if (isclr(blksfree, bno + i)) { brelse(bp); return (0); } /* * the current fragment can be extended * deduct the count on fragment being extended into * increase the count on the remaining fragment (if any) * allocate the extended piece */ for (i = frags; i < fs->fs_frag - bbase; i++) if (isclr(blksfree, bno + i)) break; cgp->cg_frsum[i - numfrags(fs, osize)]--; if (i != frags) cgp->cg_frsum[i - frags]++; for (i = numfrags(fs, osize); i < frags; i++) { clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree--; fs->fs_cstotal.cs_nffree--; fs->fs_cs(fs, cg).cs_nffree--; } fs->fs_fmod = 1; if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, fs, bprev); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bprev); } /* * Determine whether a block can be allocated. * * Check to see if a block of the appropriate size is available, * and if it is, allocate it. 
*/ static ufs_daddr_t ffs_alloccg(ip, cg, bpref, size) struct inode *ip; int cg; ufs_daddr_t bpref; int size; { register struct fs *fs; register struct cg *cgp; struct buf *bp; register int i; ufs_daddr_t bno, blkno; int allocsiz, error, frags; u_int8_t *blksfree; fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize) return (0); error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; if (size == fs->fs_bsize) { bno = ffs_alloccgblk(ip, bp, bpref); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bno); } /* * check to see if any fragments are already available * allocsiz is the size which will be allocated, hacking * it down to a smaller size if necessary */ blksfree = cg_blksfree(cgp); frags = numfrags(fs, size); for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++) if (cgp->cg_frsum[allocsiz] != 0) break; if (allocsiz == fs->fs_frag) { /* * no fragments were available, so a block will be * allocated, and hacked up */ if (cgp->cg_cs.cs_nbfree == 0) { brelse(bp); return (0); } bno = ffs_alloccgblk(ip, bp, bpref); bpref = dtogd(fs, bno); for (i = frags; i < fs->fs_frag; i++) setbit(blksfree, bpref + i); i = fs->fs_frag - frags; cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; fs->fs_fmod = 1; cgp->cg_frsum[i]++; if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bno); } bno = ffs_mapsearch(fs, cgp, bpref, allocsiz); if (bno < 0) { brelse(bp); return (0); } for (i = 0; i < frags; i++) clrbit(blksfree, bno + i); cgp->cg_cs.cs_nffree -= frags; fs->fs_cstotal.cs_nffree -= frags; fs->fs_cs(fs, cg).cs_nffree -= frags; fs->fs_fmod = 1; 
cgp->cg_frsum[allocsiz]--; if (frags != allocsiz) cgp->cg_frsum[allocsiz - frags]++; blkno = cg * fs->fs_fpg + bno; if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, fs, blkno); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return ((u_long)blkno); } /* * Allocate a block in a cylinder group. * * This algorithm implements the following policy: * 1) allocate the requested block. * 2) allocate a rotationally optimal block in the same cylinder. * 3) allocate the next available block on the block rotor for the * specified cylinder group. * Note that this routine only allocates fs_bsize blocks; these * blocks may be fragmented by the routine that allocates them. */ static ufs_daddr_t ffs_alloccgblk(ip, bp, bpref) struct inode *ip; struct buf *bp; ufs_daddr_t bpref; { struct fs *fs; struct cg *cgp; ufs_daddr_t bno, blkno; int cylno, pos, delta; short *cylbp; register int i; u_int8_t *blksfree; fs = ip->i_fs; cgp = (struct cg *)bp->b_data; blksfree = cg_blksfree(cgp); if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) { bpref = cgp->cg_rotor; goto norot; } bpref = blknum(fs, bpref); bpref = dtogd(fs, bpref); /* * if the requested block is available, use it */ if (ffs_isblock(fs, blksfree, fragstoblks(fs, bpref))) { bno = bpref; goto gotit; } if (fs->fs_nrpos <= 1 || fs->fs_cpc == 0) { /* * Block layout information is not available. * Leaving bpref unchanged means we take the * next available free block following the one * we just allocated. Hopefully this will at * least hit a track cache on drives of unknown * geometry (e.g. SCSI). */ goto norot; } /* * check for a block available on the same cylinder */ cylno = cbtocylno(fs, bpref); if (cg_blktot(cgp)[cylno] == 0) goto norot; /* * check the summary information to see if a block is * available in the requested cylinder starting at the * requested rotational position and proceeding around. 
*/ cylbp = cg_blks(fs, cgp, cylno); pos = cbtorpos(fs, bpref); for (i = pos; i < fs->fs_nrpos; i++) if (cylbp[i] > 0) break; if (i == fs->fs_nrpos) for (i = 0; i < pos; i++) if (cylbp[i] > 0) break; if (cylbp[i] > 0) { /* * found a rotational position, now find the actual * block. A panic if none is actually there. */ pos = cylno % fs->fs_cpc; bno = (cylno - pos) * fs->fs_spc / NSPB(fs); if (fs_postbl(fs, pos)[i] == -1) { printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); panic("ffs_alloccgblk: cyl groups corrupted"); } for (i = fs_postbl(fs, pos)[i];; ) { if (ffs_isblock(fs, blksfree, bno + i)) { bno = blkstofrags(fs, (bno + i)); goto gotit; } delta = fs_rotbl(fs)[i]; if (delta <= 0 || delta + i > fragstoblks(fs, fs->fs_fpg)) break; i += delta; } printf("pos = %d, i = %d, fs = %s\n", pos, i, fs->fs_fsmnt); panic("ffs_alloccgblk: can't find blk in cyl"); } norot: /* * no blocks in the requested cylinder, so take next * available one in this cylinder group. */ bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag); if (bno < 0) return (0); cgp->cg_rotor = bno; gotit: blkno = fragstoblks(fs, bno); ffs_clrblock(fs, blksfree, (long)blkno); ffs_clusteracct(fs, cgp, blkno, -1); cgp->cg_cs.cs_nbfree--; fs->fs_cstotal.cs_nbfree--; fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--; cylno = cbtocylno(fs, bno); cg_blks(fs, cgp, cylno)[cbtorpos(fs, bno)]--; cg_blktot(cgp)[cylno]--; fs->fs_fmod = 1; blkno = cgp->cg_cgx * fs->fs_fpg + bno; if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_blkmapdep(bp, fs, blkno); return (blkno); } /* * Determine whether a cluster can be allocated. * * We do not currently check for optimal rotational layout if there * are multiple choices in the same cylinder group. Instead we just * take the first one that we find following bpref. 
*/ static ufs_daddr_t ffs_clusteralloc(ip, cg, bpref, len) struct inode *ip; int cg; ufs_daddr_t bpref; int len; { register struct fs *fs; register struct cg *cgp; struct buf *bp; int i, got, run, bno, bit, map; u_char *mapp; int32_t *lp; u_int8_t *blksfree; fs = ip->i_fs; if (fs->fs_maxcluster[cg] < len) return (0); if (bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp)) goto fail; cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) goto fail; bp->b_xflags |= BX_BKGRDWRITE; /* * Check to see if a cluster of the needed size (or bigger) is * available in this cylinder group. */ lp = &cg_clustersum(cgp)[len]; for (i = len; i <= fs->fs_contigsumsize; i++) if (*lp++ > 0) break; if (i > fs->fs_contigsumsize) { /* * This is the first time looking for a cluster in this * cylinder group. Update the cluster summary information * to reflect the true maximum sized cluster so that * future cluster allocation requests can avoid reading * the cylinder group map only to find no clusters. */ lp = &cg_clustersum(cgp)[len - 1]; for (i = len - 1; i > 0; i--) if (*lp-- > 0) break; fs->fs_maxcluster[cg] = i; goto fail; } /* * Search the cluster map to find a big enough cluster. * We take the first one that we find, even if it is larger * than we need as we prefer to get one close to the previous * block allocation. We do not search before the current * preference point as we do not want to allocate a block * that is allocated before the previous one (as we will * then have to wait for another pass of the elevator * algorithm before it will be read). We prefer to fail and * be recalled to try an allocation in the next cylinder group. 
*/ if (dtog(fs, bpref) != cg) bpref = 0; else bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref))); mapp = &cg_clustersfree(cgp)[bpref / NBBY]; map = *mapp++; bit = 1 << (bpref % NBBY); for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) { if ((map & bit) == 0) { run = 0; } else { run++; if (run == len) break; } if ((got & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } if (got >= cgp->cg_nclusterblks) goto fail; /* * Allocate the cluster that we have found. */ blksfree = cg_blksfree(cgp); for (i = 1; i <= len; i++) if (!ffs_isblock(fs, blksfree, got - run + i)) panic("ffs_clusteralloc: map mismatch"); bno = cg * fs->fs_fpg + blkstofrags(fs, got - run + 1); if (dtog(fs, bno) != cg) panic("ffs_clusteralloc: allocated out of group"); len = blkstofrags(fs, len); for (i = 0; i < len; i += fs->fs_frag) if ((got = ffs_alloccgblk(ip, bp, bno + i)) != bno + i) panic("ffs_clusteralloc: lost block"); if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); return (bno); fail: brelse(bp); return (0); } /* * Determine whether an inode can be allocated. * * Check to see if an inode is available, and if it is, * allocate it using the following policy: * 1) allocate the requested inode. * 2) allocate the next available inode after the requested * inode in the specified cylinder group. 
*/ static ino_t ffs_nodealloccg(ip, cg, ipref, mode) struct inode *ip; int cg; ufs_daddr_t ipref; int mode; { register struct fs *fs; register struct cg *cgp; struct buf *bp; u_int8_t *inosused; int error, start, len, loc, map, i; fs = ip->i_fs; if (fs->fs_cs(fs, cg).cs_nifree == 0) return (0); error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (0); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; inosused = cg_inosused(cgp); if (ipref) { ipref %= fs->fs_ipg; if (isclr(inosused, ipref)) goto gotit; } start = cgp->cg_irotor / NBBY; len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY); loc = skpc(0xff, len, &inosused[start]); if (loc == 0) { len = start + 1; start = 0; loc = skpc(0xff, len, &inosused[0]); if (loc == 0) { printf("cg = %d, irotor = %ld, fs = %s\n", cg, (long)cgp->cg_irotor, fs->fs_fsmnt); panic("ffs_nodealloccg: map corrupted"); /* NOTREACHED */ } } i = start + len - loc; map = inosused[i]; ipref = i * NBBY; for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) { if ((map & i) == 0) { cgp->cg_irotor = ipref; goto gotit; } } printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_nodealloccg: block not in map"); /* NOTREACHED */ gotit: if (DOINGSOFTDEP(ITOV(ip))) softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref); setbit(inosused, ipref); cgp->cg_cs.cs_nifree--; fs->fs_cstotal.cs_nifree--; fs->fs_cs(fs, cg).cs_nifree--; fs->fs_fmod = 1; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir++; fs->fs_cstotal.cs_ndir++; fs->fs_cs(fs, cg).cs_ndir++; } bdwrite(bp); return (cg * fs->fs_ipg + ipref); } /* * Free a block or fragment. * * The specified block or fragment is placed back in the * free map. If a fragment is deallocated, a possible * block reassembly is checked. 
*/ void ffs_blkfree(ip, bno, size) register struct inode *ip; ufs_daddr_t bno; long size; { register struct fs *fs; register struct cg *cgp; struct buf *bp; ufs_daddr_t fragno, cgbno; int i, error, cg, blk, frags, bbase; u_int8_t *blksfree; #ifdef DIAGNOSTIC struct vnode *vp; #endif fs = ip->i_fs; #ifdef DIAGNOSTIC if ((vp = ITOV(ip)) != NULL && vp->v_mount != NULL && (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)) panic("ffs_blkfree: deallocation on suspended filesystem"); if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 || fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) { printf("dev=%s, bno = %ld, bsize = %ld, size = %ld, fs = %s\n", devtoname(ip->i_dev), (long)bno, (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_blkfree: bad size"); } #endif if ((ip->i_devvp->v_flag & VCOPYONWRITE) && ffs_snapblkfree(ip, bno, size)) return; VOP_FREEBLKS(ip->i_devvp, fsbtodb(fs, bno), size); cg = dtog(fs, bno); if ((u_int)bno >= fs->fs_size) { printf("bad block %ld, ino %lu\n", (long)bno, (u_long)ip->i_number); ffs_fserr(fs, ip->i_uid, "bad block"); return; } error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return; } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return; } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; cgbno = dtogd(fs, bno); blksfree = cg_blksfree(cgp); if (size == fs->fs_bsize) { fragno = fragstoblks(fs, cgbno); if (!ffs_isfreeblock(fs, blksfree, fragno)) { printf("dev = %s, block = %ld, fs = %s\n", devtoname(ip->i_dev), (long)bno, fs->fs_fsmnt); panic("ffs_blkfree: freeing free block"); } ffs_setblock(fs, blksfree, fragno); ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; i = cbtocylno(fs, cgbno); cg_blks(fs, cgp, i)[cbtorpos(fs, cgbno)]++; cg_blktot(cgp)[i]++; } else { bbase = cgbno - fragnum(fs, cgbno); /* * decrement the counts associated with the old frags */ blk = 
blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, -1); /* * deallocate the fragment */ frags = numfrags(fs, size); for (i = 0; i < frags; i++) { if (isset(blksfree, cgbno + i)) { printf("dev = %s, block = %ld, fs = %s\n", devtoname(ip->i_dev), (long)(bno + i), fs->fs_fsmnt); panic("ffs_blkfree: freeing free frag"); } setbit(blksfree, cgbno + i); } cgp->cg_cs.cs_nffree += i; fs->fs_cstotal.cs_nffree += i; fs->fs_cs(fs, cg).cs_nffree += i; /* * add back in counts associated with the new frags */ blk = blkmap(fs, blksfree, bbase); ffs_fragacct(fs, blk, cgp->cg_frsum, 1); /* * if a complete block has been reassembled, account for it */ fragno = fragstoblks(fs, bbase); if (ffs_isblock(fs, blksfree, fragno)) { cgp->cg_cs.cs_nffree -= fs->fs_frag; fs->fs_cstotal.cs_nffree -= fs->fs_frag; fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag; ffs_clusteracct(fs, cgp, fragno, 1); cgp->cg_cs.cs_nbfree++; fs->fs_cstotal.cs_nbfree++; fs->fs_cs(fs, cg).cs_nbfree++; i = cbtocylno(fs, bbase); cg_blks(fs, cgp, i)[cbtorpos(fs, bbase)]++; cg_blktot(cgp)[i]++; } } fs->fs_fmod = 1; if (fs->fs_active != 0) atomic_clear_int(&ACTIVECGNUM(fs, cg), ACTIVECGOFF(cg)); bdwrite(bp); } #ifdef DIAGNOSTIC /* * Verify allocation of a block or fragment. Returns true if block or * fragment is allocated, false if it is free. 
*/ static int ffs_checkblk(ip, bno, size) struct inode *ip; ufs_daddr_t bno; long size; { struct fs *fs; struct cg *cgp; struct buf *bp; int i, error, frags, free; u_int8_t *blksfree; fs = ip->i_fs; if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) { printf("bsize = %ld, size = %ld, fs = %s\n", (long)fs->fs_bsize, size, fs->fs_fsmnt); panic("ffs_checkblk: bad size"); } if ((u_int)bno >= fs->fs_size) panic("ffs_checkblk: bad block %d", bno); error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, dtog(fs, bno))), (int)fs->fs_cgsize, NOCRED, &bp); if (error) panic("ffs_checkblk: cg bread failed"); cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) panic("ffs_checkblk: cg magic mismatch"); bp->b_xflags |= BX_BKGRDWRITE; blksfree = cg_blksfree(cgp); bno = dtogd(fs, bno); if (size == fs->fs_bsize) { free = ffs_isblock(fs, blksfree, fragstoblks(fs, bno)); } else { frags = numfrags(fs, size); for (free = 0, i = 0; i < frags; i++) if (isset(blksfree, bno + i)) free++; if (free != 0 && free != frags) panic("ffs_checkblk: partially free fragment"); } brelse(bp); return (!free); } #endif /* DIAGNOSTIC */ /* * Free an inode. */ int ffs_vfree(pvp, ino, mode) struct vnode *pvp; ino_t ino; int mode; { if (DOINGSOFTDEP(pvp)) { softdep_freefile(pvp, ino, mode); return (0); } return (ffs_freefile(VTOI(pvp), ino, mode)); } /* * Do the actual free operation. * The specified inode is placed back in the free map. 
*/ int ffs_freefile(pip, ino, mode) struct inode *pip; ino_t ino; int mode; { register struct fs *fs; register struct cg *cgp; struct buf *bp; int error, cg; u_int8_t *inosused; fs = pip->i_fs; if ((u_int)ino >= fs->fs_ipg * fs->fs_ncg) panic("ffs_vfree: range: dev = (%d,%d), ino = %d, fs = %s", major(pip->i_dev), minor(pip->i_dev), ino, fs->fs_fsmnt); cg = ino_to_cg(fs, ino); error = bread(pip->i_devvp, fsbtodb(fs, cgtod(fs, cg)), (int)fs->fs_cgsize, NOCRED, &bp); if (error) { brelse(bp); return (error); } cgp = (struct cg *)bp->b_data; if (!cg_chkmagic(cgp)) { brelse(bp); return (0); } bp->b_xflags |= BX_BKGRDWRITE; cgp->cg_time = time_second; inosused = cg_inosused(cgp); ino %= fs->fs_ipg; if (isclr(inosused, ino)) { printf("dev = %s, ino = %lu, fs = %s\n", devtoname(pip->i_dev), (u_long)ino + cg * fs->fs_ipg, fs->fs_fsmnt); if (fs->fs_ronly == 0) panic("ffs_vfree: freeing free inode"); } clrbit(inosused, ino); if (ino < cgp->cg_irotor) cgp->cg_irotor = ino; cgp->cg_cs.cs_nifree++; fs->fs_cstotal.cs_nifree++; fs->fs_cs(fs, cg).cs_nifree++; if ((mode & IFMT) == IFDIR) { cgp->cg_cs.cs_ndir--; fs->fs_cstotal.cs_ndir--; fs->fs_cs(fs, cg).cs_ndir--; } fs->fs_fmod = 1; bdwrite(bp); return (0); } /* * Find a block of the specified size in the specified cylinder group. * * It is a panic if a request is made to find a block if none are * available. 
*/ static ufs_daddr_t ffs_mapsearch(fs, cgp, bpref, allocsiz) register struct fs *fs; register struct cg *cgp; ufs_daddr_t bpref; int allocsiz; { ufs_daddr_t bno; int start, len, loc, i; int blk, field, subfield, pos; u_int8_t *blksfree; /* * find the fragment by searching through the free block * map for an appropriate bit pattern */ if (bpref) start = dtogd(fs, bpref) / NBBY; else start = cgp->cg_frotor / NBBY; blksfree = cg_blksfree(cgp); len = howmany(fs->fs_fpg, NBBY) - start; loc = scanc((u_int)len, (u_char *)&blksfree[start], (u_char *)fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { len = start + 1; start = 0; loc = scanc((u_int)len, (u_char *)&blksfree[0], (u_char *)fragtbl[fs->fs_frag], (u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY)))); if (loc == 0) { printf("start = %d, len = %d, fs = %s\n", start, len, fs->fs_fsmnt); panic("ffs_alloccg: map corrupted"); /* NOTREACHED */ } } bno = (start + len - loc) * NBBY; cgp->cg_frotor = bno; /* * found the byte in the map * sift through the bits to find the selected frag */ for (i = bno + NBBY; bno < i; bno += fs->fs_frag) { blk = blkmap(fs, blksfree, bno); blk <<= 1; field = around[allocsiz]; subfield = inside[allocsiz]; for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) { if ((blk & field) == subfield) return (bno + pos); field <<= 1; subfield <<= 1; } } printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt); panic("ffs_alloccg: block not in map"); return (-1); } /* * Update the cluster map because of an allocation or free. * * Cnt == 1 means free; cnt == -1 means allocating. */ void ffs_clusteracct(fs, cgp, blkno, cnt) struct fs *fs; struct cg *cgp; ufs_daddr_t blkno; int cnt; { int32_t *sump; int32_t *lp; u_char *freemapp, *mapp; int i, start, end, forw, back, map, bit; if (fs->fs_contigsumsize <= 0) return; freemapp = cg_clustersfree(cgp); sump = cg_clustersum(cgp); /* * Allocate or clear the actual block. 
*/ if (cnt > 0) setbit(freemapp, blkno); else clrbit(freemapp, blkno); /* * Find the size of the cluster going forward. */ start = blkno + 1; end = start + fs->fs_contigsumsize; if (end >= cgp->cg_nclusterblks) end = cgp->cg_nclusterblks; mapp = &freemapp[start / NBBY]; map = *mapp++; bit = 1 << (start % NBBY); for (i = start; i < end; i++) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != (NBBY - 1)) { bit <<= 1; } else { map = *mapp++; bit = 1; } } forw = i - start; /* * Find the size of the cluster going backward. */ start = blkno - 1; end = start - fs->fs_contigsumsize; if (end < 0) end = -1; mapp = &freemapp[start / NBBY]; map = *mapp--; bit = 1 << (start % NBBY); for (i = start; i > end; i--) { if ((map & bit) == 0) break; if ((i & (NBBY - 1)) != 0) { bit >>= 1; } else { map = *mapp--; bit = 1 << (NBBY - 1); } } back = start - i; /* * Account for old cluster and the possibly new forward and * back clusters. */ i = back + forw + 1; if (i > fs->fs_contigsumsize) i = fs->fs_contigsumsize; sump[i] += cnt; if (back > 0) sump[back] -= cnt; if (forw > 0) sump[forw] -= cnt; /* * Update cluster summary information. */ lp = &sump[fs->fs_contigsumsize]; for (i = fs->fs_contigsumsize; i > 0; i--) if (*lp-- > 0) break; fs->fs_maxcluster[cgp->cg_cgx] = i; } /* * Fserr prints the name of a file system with an error diagnostic. * * The form of the error message is: * fs: error message */ static void ffs_fserr(fs, uid, cp) struct fs *fs; u_int uid; char *cp; { struct proc *p = curproc; /* XXX */ log(LOG_ERR, "pid %d (%s), uid %d on %s: %s\n", p ? p->p_pid : -1, p ? p->p_comm : "-", uid, fs->fs_fsmnt, cp); } /* * This function provides the capability for the fsck program to * update an active filesystem. Six operations are provided: * * adjrefcnt(inode, amt) - adjusts the reference count on the * specified inode by the specified amount. Under normal * operation the count should always go down. Decrementing * the count to zero will cause the inode to be freed. 
* adjblkcnt(inode, amt) - adjust the number of blocks used to * by the specifed amount. * freedirs(inode, count) - directory inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freefiles(inode, count) - file inodes [inode..inode + count - 1] * are marked as free. Inodes should never have to be marked * as in use. * freeblks(blockno, size) - blocks [blockno..blockno + size - 1] * are marked as free. Blocks should never have to be marked * as in use. * setflags(flags, set/clear) - the fs_flags field has the specified * flags set (second parameter +1) or cleared (second parameter -1). */ static int sysctl_ffs_fsck __P((SYSCTL_HANDLER_ARGS)); SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT, 0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count"); SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR, sysctl_ffs_fsck, "Adjust Inode Used Blocks Count"); SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Directory Inodes"); SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of File Inodes"); SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR, sysctl_ffs_fsck, "Free Range of Blocks"); SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR, sysctl_ffs_fsck, "Change Filesystem Flags"); #ifdef DEBUG static int fsckcmds = 0; SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); #endif /* DEBUG */ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { struct fsck_cmd cmd; struct inode tip; struct ufsmount *ump; struct vnode *vp; struct inode *ip; struct mount *mp; struct fs *fs; ufs_daddr_t blkno; long blkcnt, blksize; struct file *fp; int filetype, error; if (req->newlen > sizeof cmd) return (EBADRPC); if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 
0) return (error); vn_start_write((struct vnode *)fp->f_data, &mp, V_WAIT); if (mp == 0 || strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) { vn_finished_write(mp); + fdrop(fp, curthread); return (EINVAL); } if (mp->mnt_flag & MNT_RDONLY) { vn_finished_write(mp); + fdrop(fp, curthread); return (EROFS); } ump = VFSTOUFS(mp); fs = ump->um_fs; filetype = IFREG; switch (oidp->oid_number) { case FFS_SET_FLAGS: #ifdef DEBUG if (fsckcmds) printf("%s: %s flags\n", mp->mnt_stat.f_mntonname, cmd.size > 0 ? "set" : "clear"); #endif /* DEBUG */ if (cmd.size > 0) fs->fs_flags |= (long)cmd.value; else fs->fs_flags &= ~(long)cmd.value; break; case FFS_ADJ_REFCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %d count by %ld\n", mp->mnt_stat.f_mntonname, (ino_t)cmd.value, cmd.size); } #endif /* DEBUG */ if ((error = VFS_VGET(mp, (ino_t)cmd.value, &vp)) != 0) break; ip = VTOI(vp); ip->i_nlink += cmd.size; ip->i_effnlink += cmd.size; ip->i_flag |= IN_CHANGE; if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); break; case FFS_ADJ_BLKCNT: #ifdef DEBUG if (fsckcmds) { printf("%s: adjust inode %d block count by %ld\n", mp->mnt_stat.f_mntonname, (ino_t)cmd.value, cmd.size); } #endif /* DEBUG */ if ((error = VFS_VGET(mp, (ino_t)cmd.value, &vp)) != 0) break; ip = VTOI(vp); ip->i_blocks += cmd.size; ip->i_flag |= IN_CHANGE; vput(vp); break; case FFS_DIR_FREE: filetype = IFDIR; /* fall through */ case FFS_FILE_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free %s inode %d\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? "directory" : "file", (ino_t)cmd.value); else printf("%s: free %s inodes %d-%d\n", mp->mnt_stat.f_mntonname, filetype == IFDIR ? 
"directory" : "file", (ino_t)cmd.value, (ino_t)(cmd.value + cmd.size - 1)); } #endif /* DEBUG */ tip.i_devvp = ump->um_devvp; tip.i_dev = ump->um_dev; tip.i_fs = fs; while (cmd.size > 0) { if ((error = ffs_freefile(&tip, cmd.value, filetype))) break; cmd.size -= 1; cmd.value += 1; } break; case FFS_BLK_FREE: #ifdef DEBUG if (fsckcmds) { if (cmd.size == 1) printf("%s: free block %d\n", mp->mnt_stat.f_mntonname, (ufs_daddr_t)cmd.value); else printf("%s: free blocks %d-%ld\n", mp->mnt_stat.f_mntonname, (ufs_daddr_t)cmd.value, (ufs_daddr_t)cmd.value + cmd.size - 1); } #endif /* DEBUG */ tip.i_number = ROOTINO; tip.i_devvp = ump->um_devvp; tip.i_dev = ump->um_dev; tip.i_fs = fs; tip.i_size = cmd.size * fs->fs_fsize; tip.i_uid = 0; tip.i_vnode = NULL; blkno = (ufs_daddr_t)cmd.value; blkcnt = cmd.size; blksize = fs->fs_frag - (blkno % fs->fs_frag); while (blkcnt > 0) { if (blksize > blkcnt) blksize = blkcnt; ffs_blkfree(&tip, blkno, blksize * fs->fs_fsize); blkno += blksize; blkcnt -= blksize; blksize = fs->fs_frag; } break; default: #ifdef DEBUG if (fsckcmds) { printf("Invalid request %d from fsck\n", oidp->oid_number); } #endif /* DEBUG */ error = EINVAL; break; } + fdrop(fp, curthread); vn_finished_write(mp); return (error); } Index: head/sys/vm/vm_mmap.c =================================================================== --- head/sys/vm/vm_mmap.c (revision 89305) +++ head/sys/vm/vm_mmap.c (revision 89306) @@ -1,1284 +1,1281 @@ /* * Copyright (c) 1988 University of Utah. * Copyright (c) 1991, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the Systems Programming Group of the University of Utah Computer * Science Department. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the University of * California, Berkeley and its contributors. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ * * @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94 * $FreeBSD$ */ /* * Mapped file (mmap) interface to VM */ #include "opt_bleed.h" #include "opt_compat.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef _SYS_SYSPROTO_H_ struct sbrk_args { int incr; }; #endif static int max_proc_mmap; SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, ""); /* * Set the maximum number of vm_map_entry structures per process. Roughly * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100 * of our KVM malloc space still results in generous limits. We want a * default that is good enough to prevent the kernel running out of resources * if attacked from compromised user account but generous enough such that * multi-threaded processes are not unduly inconvenienced. 
*/ static void vmmapentry_rsrc_init __P((void *)); SYSINIT(vmmersrc, SI_SUB_KVM_RSRC, SI_ORDER_FIRST, vmmapentry_rsrc_init, NULL) static void vmmapentry_rsrc_init(dummy) void *dummy; { max_proc_mmap = vm_kmem_size / sizeof(struct vm_map_entry); max_proc_mmap /= 100; } /* * MPSAFE */ /* ARGSUSED */ int sbrk(td, uap) struct thread *td; struct sbrk_args *uap; { /* Not yet implemented */ /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return (EOPNOTSUPP); } #ifndef _SYS_SYSPROTO_H_ struct sstk_args { int incr; }; #endif /* * MPSAFE */ /* ARGSUSED */ int sstk(td, uap) struct thread *td; struct sstk_args *uap; { /* Not yet implemented */ /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return (EOPNOTSUPP); } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #ifndef _SYS_SYSPROTO_H_ struct getpagesize_args { int dummy; }; #endif /* ARGSUSED */ int ogetpagesize(td, uap) struct thread *td; struct getpagesize_args *uap; { /* MP SAFE */ td->td_retval[0] = PAGE_SIZE; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ /* * Memory Map (mmap) system call. Note that the file offset * and address are allowed to be NOT page aligned, though if * the MAP_FIXED flag it set, both must have the same remainder * modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not * page-aligned, the actual mapping starts at trunc_page(addr) * and the return value is adjusted up by the page offset. * * Generally speaking, only character devices which are themselves * memory-based, such as a video framebuffer, can be mmap'd. Otherwise * there would be no cache coherency between a descriptor and a VM mapping * both to the same character device. * * Block devices can be mmap'd no matter what they represent. Cache coherency * is maintained as long as you do not write directly to the underlying * character device. 
*/ #ifndef _SYS_SYSPROTO_H_ struct mmap_args { void *addr; size_t len; int prot; int flags; int fd; long pad; off_t pos; }; #endif /* * MPSAFE */ int mmap(td, uap) struct thread *td; struct mmap_args *uap; { - struct filedesc *fdp = td->td_proc->p_fd; struct file *fp = NULL; struct vnode *vp; vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot, maxprot; void *handle; int flags, error; int disablexworkaround; off_t pos; struct vmspace *vms = td->td_proc->p_vmspace; vm_object_t obj; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; flags = uap->flags; pos = uap->pos; + fp = NULL; /* make sure mapping fits into numeric range etc */ if ((ssize_t) uap->len < 0 || ((flags & MAP_ANON) && uap->fd != -1)) return (EINVAL); if (flags & MAP_STACK) { if ((uap->fd != -1) || ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE))) return (EINVAL); flags |= MAP_ANON; pos = 0; } /* * Align the file position to a page boundary, * and save its page offset component. */ pageoff = (pos & PAGE_MASK); pos -= pageoff; /* Adjust size for rounding (on both ends). */ size += pageoff; /* low end... */ size = (vm_size_t) round_page(size); /* hi end */ /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (flags & MAP_FIXED) { /* * The specified address must have the same remainder * as the file offset taken modulo PAGE_SIZE, so it * should be aligned after adjustment by pageoff. */ addr -= pageoff; if (addr & PAGE_MASK) return (EINVAL); /* Address range must be all in user VM space. */ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (addr + size < addr) return (EINVAL); } /* * XXX for non-fixed mappings where no hint is provided or * the hint would fall in the potential heap space, * place it after the end of the largest possible heap. 
* * There should really be a pmap call to determine a reasonable * location. */ else if (addr == 0 || (addr >= round_page((vm_offset_t)vms->vm_taddr) && addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz))) addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz); mtx_lock(&Giant); /* syscall marked mp-safe but isn't */ if (flags & MAP_ANON) { /* * Mapping blank space is trivial. */ handle = NULL; maxprot = VM_PROT_ALL; pos = 0; } else { /* * Mapping file, get fp for validation. Obtain vnode and make * sure it is of appropriate type. + * don't let the descriptor disappear on us if we block */ - if (((unsigned) uap->fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[uap->fd]) == NULL) { + fp = ffind_hold(td, uap->fd); + if (fp == NULL) { error = EBADF; - goto done2; + goto done; } if (fp->f_type != DTYPE_VNODE) { error = EINVAL; - goto done2; + goto done; } /* - * don't let the descriptor disappear on us if we block - */ - fhold(fp); - - /* * POSIX shared-memory objects are defined to have * kernel persistence, and are not defined to support * read(2)/write(2) -- or even open(2). Thus, we can * use MAP_ASYNC to trade on-disk coherence for speed. * The shm_open(3) library routine turns on the FPOSIXSHM * flag to request this behavior. */ if (fp->f_flag & FPOSIXSHM) flags |= MAP_NOSYNC; vp = (struct vnode *) fp->f_data; if (vp->v_type != VREG && vp->v_type != VCHR) { error = EINVAL; goto done; } if (vp->v_type == VREG) { /* * Get the proper underlying object */ if (VOP_GETVOBJECT(vp, &obj) != 0) { error = EINVAL; goto done; } vp = (struct vnode*)obj->handle; } /* * XXX hack to handle use of /dev/zero to map anon memory (ala * SunOS). */ if ((vp->v_type == VCHR) && (vp->v_rdev->si_devsw->d_flags & D_MMAP_ANON)) { handle = NULL; maxprot = VM_PROT_ALL; flags |= MAP_ANON; pos = 0; } else { /* * cdevs does not provide private mappings of any kind. */ /* * However, for XIG X server to continue to work, * we should allow the superuser to do it anyway. 
* We only allow it at securelevel < 1. * (Because the XIG X server writes directly to video * memory via /dev/mem, it should never work at any * other securelevel. * XXX this will have to go */ if (securelevel_ge(td->td_proc->p_ucred, 1)) disablexworkaround = 1; else disablexworkaround = suser_td(td); if (vp->v_type == VCHR && disablexworkaround && (flags & (MAP_PRIVATE|MAP_COPY))) { error = EINVAL; goto done; } /* * Ensure that file and memory protections are * compatible. Note that we only worry about * writability if mapping is shared; in this case, * current and max prot are dictated by the open file. * XXX use the vnode instead? Problem is: what * credentials do we use for determination? What if * proc does a setuid? */ maxprot = VM_PROT_EXECUTE; /* ??? */ if (fp->f_flag & FREAD) { maxprot |= VM_PROT_READ; } else if (prot & PROT_READ) { error = EACCES; goto done; } /* * If we are sharing potential changes (either via * MAP_SHARED or via the implicit sharing of character * device mappings), and we are trying to get write * permission although we opened it without asking * for it, bail out. Check for superuser, only if * we're at securelevel < 1, to allow the XIG X server * to continue to work. */ if ((flags & MAP_SHARED) != 0 || (vp->v_type == VCHR && disablexworkaround)) { if ((fp->f_flag & FWRITE) != 0) { struct vattr va; if ((error = VOP_GETATTR(vp, &va, td->td_proc->p_ucred, td))) { goto done; } if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0) { maxprot |= VM_PROT_WRITE; } else if (prot & PROT_WRITE) { error = EPERM; goto done; } } else if ((prot & PROT_WRITE) != 0) { error = EACCES; goto done; } } else { maxprot |= VM_PROT_WRITE; } handle = (void *)vp; } } /* * Do not allow more then a certain number of vm_map_entry structures * per process. Scale with the number of rforks sharing the map * to make the limit reasonable for threads. 
*/ if (max_proc_mmap && vms->vm_map.nentries >= max_proc_mmap * vms->vm_refcnt) { error = ENOMEM; goto done; } mtx_unlock(&Giant); error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot, flags, handle, pos); if (error == 0) td->td_retval[0] = (register_t) (addr + pageoff); mtx_lock(&Giant); done: if (fp) fdrop(fp, td); -done2: mtx_unlock(&Giant); return (error); } #ifdef COMPAT_43 #ifndef _SYS_SYSPROTO_H_ struct ommap_args { caddr_t addr; int len; int prot; int flags; int fd; long pos; }; #endif int ommap(td, uap) struct thread *td; struct ommap_args *uap; { struct mmap_args nargs; static const char cvtbsdprot[8] = { 0, PROT_EXEC, PROT_WRITE, PROT_EXEC | PROT_WRITE, PROT_READ, PROT_EXEC | PROT_READ, PROT_WRITE | PROT_READ, PROT_EXEC | PROT_WRITE | PROT_READ, }; #define OMAP_ANON 0x0002 #define OMAP_COPY 0x0020 #define OMAP_SHARED 0x0010 #define OMAP_FIXED 0x0100 nargs.addr = uap->addr; nargs.len = uap->len; nargs.prot = cvtbsdprot[uap->prot & 0x7]; nargs.flags = 0; if (uap->flags & OMAP_ANON) nargs.flags |= MAP_ANON; if (uap->flags & OMAP_COPY) nargs.flags |= MAP_COPY; if (uap->flags & OMAP_SHARED) nargs.flags |= MAP_SHARED; else nargs.flags |= MAP_PRIVATE; if (uap->flags & OMAP_FIXED) nargs.flags |= MAP_FIXED; nargs.fd = uap->fd; nargs.pos = uap->pos; return (mmap(td, &nargs)); } #endif /* COMPAT_43 */ #ifndef _SYS_SYSPROTO_H_ struct msync_args { void *addr; int len; int flags; }; #endif /* * MPSAFE */ int msync(td, uap) struct thread *td; struct msync_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int flags; vm_map_t map; int rv; addr = (vm_offset_t) uap->addr; size = uap->len; flags = uap->flags; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE)) return (EINVAL); mtx_lock(&Giant); map = &td->td_proc->p_vmspace->vm_map; /* * XXX Gak! 
If size is zero we are supposed to sync "all modified * pages with the region containing addr". Unfortunately, we don't * really keep track of individual mmaps so we approximate by flushing * the range of the map entry containing addr. This can be incorrect * if the region splits or is coalesced with a neighbor. */ if (size == 0) { vm_map_entry_t entry; vm_map_lock_read(map); rv = vm_map_lookup_entry(map, addr, &entry); vm_map_unlock_read(map); if (rv == FALSE) { rv = -1; goto done2; } addr = entry->start; size = entry->end - entry->start; } /* * Clean the pages and interpret the return value. */ rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0, (flags & MS_INVALIDATE) != 0); done2: mtx_unlock(&Giant); switch (rv) { case KERN_SUCCESS: return(0); case KERN_INVALID_ADDRESS: return (EINVAL); /* Sun returns ENOMEM? */ case KERN_FAILURE: return (EIO); default: return (EINVAL); } } #ifndef _SYS_SYSPROTO_H_ struct munmap_args { void *addr; size_t len; }; #endif /* * MPSAFE */ int munmap(td, uap) struct thread *td; struct munmap_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_map_t map; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); if (size == 0) return (0); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && addr < VM_MIN_ADDRESS) return (EINVAL); #endif mtx_lock(&Giant); map = &td->td_proc->p_vmspace->vm_map; /* * Make sure entire range is allocated. 
*/ if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) { mtx_unlock(&Giant); return (EINVAL); } /* returns nothing but KERN_SUCCESS anyway */ (void) vm_map_remove(map, addr, addr + size); mtx_unlock(&Giant); return (0); } #if 0 void munmapfd(td, fd) struct thread *td; int fd; { /* * XXX should unmap any regions mapped to this file */ + FILEDESC_LOCK(p->p_fd); td->td_proc->p_fd->fd_ofileflags[fd] &= ~UF_MAPPED; + FILEDESC_UNLOCK(p->p_fd); } #endif #ifndef _SYS_SYSPROTO_H_ struct mprotect_args { const void *addr; size_t len; int prot; }; #endif /* * MPSAFE */ int mprotect(td, uap) struct thread *td; struct mprotect_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_prot_t prot; int ret; addr = (vm_offset_t) uap->addr; size = uap->len; prot = uap->prot & VM_PROT_ALL; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; #endif pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); mtx_lock(&Giant); ret = vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr, addr + size, prot, FALSE); mtx_unlock(&Giant); switch (ret) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ struct minherit_args { void *addr; size_t len; int inherit; }; #endif /* * MPSAFE */ int minherit(td, uap) struct thread *td; struct minherit_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; vm_inherit_t inherit; int ret; addr = (vm_offset_t)uap->addr; size = uap->len; inherit = uap->inherit; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); if (addr + size < addr) return(EINVAL); mtx_lock(&Giant); ret = vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr, addr+size, inherit); mtx_unlock(&Giant); switch (ret) { case KERN_SUCCESS: return (0); case KERN_PROTECTION_FAILURE: return (EACCES); } return (EINVAL); } #ifndef _SYS_SYSPROTO_H_ 
struct madvise_args { void *addr; size_t len; int behav; }; #endif /* * MPSAFE */ /* ARGSUSED */ int madvise(td, uap) struct thread *td; struct madvise_args *uap; { vm_offset_t start, end; int ret; /* * Check for illegal behavior */ if (uap->behav < 0 || uap->behav > MADV_CORE) return (EINVAL); /* * Check for illegal addresses. Watch out for address wrap... Note * that VM_*_ADDRESS are not constants due to casts (argh). */ if (VM_MAXUSER_ADDRESS > 0 && ((vm_offset_t) uap->addr + uap->len) > VM_MAXUSER_ADDRESS) return (EINVAL); #ifndef i386 if (VM_MIN_ADDRESS > 0 && uap->addr < VM_MIN_ADDRESS) return (EINVAL); #endif if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr) return (EINVAL); /* * Since this routine is only advisory, we default to conservative * behavior. */ start = trunc_page((vm_offset_t) uap->addr); end = round_page((vm_offset_t) uap->addr + uap->len); mtx_lock(&Giant); ret = vm_map_madvise(&td->td_proc->p_vmspace->vm_map, start, end, uap->behav); mtx_unlock(&Giant); return (ret ? EINVAL : 0); } #ifndef _SYS_SYSPROTO_H_ struct mincore_args { const void *addr; size_t len; char *vec; }; #endif /* * MPSAFE */ /* ARGSUSED */ int mincore(td, uap) struct thread *td; struct mincore_args *uap; { vm_offset_t addr, first_addr; vm_offset_t end, cend; pmap_t pmap; vm_map_t map; char *vec; int error = 0; int vecindex, lastvecindex; vm_map_entry_t current; vm_map_entry_t entry; int mincoreinfo; unsigned int timestamp; /* * Make sure that the addresses presented are valid for user * mode. 
*/ first_addr = addr = trunc_page((vm_offset_t) uap->addr); end = addr + (vm_size_t)round_page(uap->len); if (VM_MAXUSER_ADDRESS > 0 && end > VM_MAXUSER_ADDRESS) return (EINVAL); if (end < addr) return (EINVAL); /* * Address of byte vector */ vec = uap->vec; mtx_lock(&Giant); map = &td->td_proc->p_vmspace->vm_map; pmap = vmspace_pmap(td->td_proc->p_vmspace); vm_map_lock_read(map); RestartScan: timestamp = map->timestamp; if (!vm_map_lookup_entry(map, addr, &entry)) entry = entry->next; /* * Do this on a map entry basis so that if the pages are not * in the current processes address space, we can easily look * up the pages elsewhere. */ lastvecindex = -1; for (current = entry; (current != &map->header) && (current->start < end); current = current->next) { /* * ignore submaps (for now) or null objects */ if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) || current->object.vm_object == NULL) continue; /* * limit this scan to the current map entry and the * limits for the mincore call */ if (addr < current->start) addr = current->start; cend = current->end; if (cend > end) cend = end; /* * scan this entry one page at a time */ while (addr < cend) { /* * Check pmap first, it is likely faster, also * it can provide info as to whether we are the * one referencing or modifying the page. */ mincoreinfo = pmap_mincore(pmap, addr); if (!mincoreinfo) { vm_pindex_t pindex; vm_ooffset_t offset; vm_page_t m; /* * calculate the page index into the object */ offset = current->offset + (addr - current->start); pindex = OFF_TO_IDX(offset); m = vm_page_lookup(current->object.vm_object, pindex); /* * if the page is resident, then gather information about * it. */ if (m) { mincoreinfo = MINCORE_INCORE; if (m->dirty || pmap_is_modified(m)) mincoreinfo |= MINCORE_MODIFIED_OTHER; if ((m->flags & PG_REFERENCED) || pmap_ts_referenced(m)) { vm_page_flag_set(m, PG_REFERENCED); mincoreinfo |= MINCORE_REFERENCED_OTHER; } } } /* * subyte may page fault. 
In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * calculate index into user supplied byte vector */ vecindex = OFF_TO_IDX(addr - first_addr); /* * If we have skipped map entries, we need to make sure that * the byte vector is zeroed for those skipped entries. */ while ((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * Pass the page information to the user */ error = subyte( vec + vecindex, mincoreinfo); if (error) { error = EFAULT; goto done2; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. */ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; lastvecindex = vecindex; addr += PAGE_SIZE; } } /* * subyte may page fault. In case it needs to modify * the map, we release the lock. */ vm_map_unlock_read(map); /* * Zero the last entries in the byte vector. */ vecindex = OFF_TO_IDX(end - first_addr); while ((lastvecindex + 1) < vecindex) { error = subyte( vec + lastvecindex, 0); if (error) { error = EFAULT; goto done2; } ++lastvecindex; } /* * If the map has changed, due to the subyte, the previous * output may be invalid. 
*/ vm_map_lock_read(map); if (timestamp != map->timestamp) goto RestartScan; vm_map_unlock_read(map); done2: mtx_unlock(&Giant); return (error); } #ifndef _SYS_SYSPROTO_H_ struct mlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int mlock(td, uap) struct thread *td; struct mlock_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int error; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); /* disable wrap around */ if (addr + size < addr) return (EINVAL); if (atop(size) + cnt.v_wire_count > vm_page_max_wired) return (EAGAIN); #ifdef pmap_wired_count if (size + ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))) > td->td_proc->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) return (ENOMEM); #else error = suser_td(td); if (error) return (error); #endif mtx_lock(&Giant); error = vm_map_user_pageable(&td->td_proc->p_vmspace->vm_map, addr, addr + size, FALSE); mtx_unlock(&Giant); return (error == KERN_SUCCESS ? 
0 : ENOMEM); } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int mlockall(td, uap) struct thread *td; struct mlockall_args *uap; { /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return 0; } #ifndef _SYS_SYSPROTO_H_ struct mlockall_args { int how; }; #endif /* * MPSAFE */ int munlockall(td, uap) struct thread *td; struct munlockall_args *uap; { /* mtx_lock(&Giant); */ /* mtx_unlock(&Giant); */ return 0; } #ifndef _SYS_SYSPROTO_H_ struct munlock_args { const void *addr; size_t len; }; #endif /* * MPSAFE */ int munlock(td, uap) struct thread *td; struct munlock_args *uap; { vm_offset_t addr; vm_size_t size, pageoff; int error; addr = (vm_offset_t) uap->addr; size = uap->len; pageoff = (addr & PAGE_MASK); addr -= pageoff; size += pageoff; size = (vm_size_t) round_page(size); /* disable wrap around */ if (addr + size < addr) return (EINVAL); #ifndef pmap_wired_count error = suser_td(td); if (error) return (error); #endif mtx_lock(&Giant); error = vm_map_user_pageable(&td->td_proc->p_vmspace->vm_map, addr, addr + size, TRUE); mtx_unlock(&Giant); return (error == KERN_SUCCESS ? 0 : ENOMEM); } /* * vm_mmap() * * MPSAFE * * Internal version of mmap. Currently used by mmap, exec, and sys5 * shared memory. Handle is either a vnode pointer or NULL for MAP_ANON. */ int vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff) { boolean_t fitit; vm_object_t object; struct vnode *vp = NULL; objtype_t type; int rv = KERN_SUCCESS; vm_ooffset_t objsize; int docow; struct thread *td = curthread; if (size == 0) return (0); objsize = size = round_page(size); /* * We currently can only deal with page aligned file offsets. 
* The check is here rather than in the syscall because the * kernel calls this function internally for other mmaping * operations (such as in exec) and non-aligned offsets will * cause pmap inconsistencies...so we want to be sure to * disallow this in all cases. */ if (foff & PAGE_MASK) return (EINVAL); if ((flags & MAP_FIXED) == 0) { fitit = TRUE; *addr = round_page(*addr); mtx_lock(&Giant); } else { if (*addr != trunc_page(*addr)) return (EINVAL); fitit = FALSE; mtx_lock(&Giant); (void) vm_map_remove(map, *addr, *addr + size); } /* * Lookup/allocate object. */ if (flags & MAP_ANON) { type = OBJT_DEFAULT; /* * Unnamed anonymous regions always start at 0. */ if (handle == 0) foff = 0; } else { vp = (struct vnode *) handle; if (vp->v_type == VCHR) { type = OBJT_DEVICE; handle = (void *)(intptr_t)vp->v_rdev; } else { struct vattr vat; int error; error = VOP_GETATTR(vp, &vat, td->td_proc->p_ucred, td); if (error) { mtx_unlock(&Giant); return (error); } objsize = round_page(vat.va_size); type = OBJT_VNODE; /* * if it is a regular file without any references * we do not need to sync it. */ if (vp->v_type == VREG && vat.va_nlink == 0) { flags |= MAP_NOSYNC; } } } if (handle == NULL) { object = NULL; docow = 0; } else { object = vm_pager_allocate(type, handle, objsize, prot, foff); if (object == NULL) { mtx_unlock(&Giant); return (type == OBJT_DEVICE ? EINVAL : ENOMEM); } docow = MAP_PREFAULT_PARTIAL; } /* * Force device mappings to be shared. 
*/ if (type == OBJT_DEVICE || type == OBJT_PHYS) { flags &= ~(MAP_PRIVATE|MAP_COPY); flags |= MAP_SHARED; } if ((flags & (MAP_ANON|MAP_SHARED)) == 0) docow |= MAP_COPY_ON_WRITE; if (flags & MAP_NOSYNC) docow |= MAP_DISABLE_SYNCER; if (flags & MAP_NOCORE) docow |= MAP_DISABLE_COREDUMP; #if defined(VM_PROT_READ_IS_EXEC) if (prot & VM_PROT_READ) prot |= VM_PROT_EXECUTE; if (maxprot & VM_PROT_READ) maxprot |= VM_PROT_EXECUTE; #endif if (fitit) *addr = pmap_addr_hint(object, *addr, size); if (flags & MAP_STACK) rv = vm_map_stack (map, *addr, size, prot, maxprot, docow); else rv = vm_map_find(map, object, foff, addr, size, fitit, prot, maxprot, docow); if (rv != KERN_SUCCESS) { /* * Lose the object reference. Will destroy the * object if it's an unnamed anonymous mapping * or named anonymous without other references. */ vm_object_deallocate(object); } else if (flags & MAP_SHARED) { /* * Shared memory is also shared with children. */ rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE); if (rv != KERN_SUCCESS) (void) vm_map_remove(map, *addr, *addr + size); } mtx_unlock(&Giant); switch (rv) { case KERN_SUCCESS: return (0); case KERN_INVALID_ADDRESS: case KERN_NO_SPACE: return (ENOMEM); case KERN_PROTECTION_FAILURE: return (EACCES); default: return (EINVAL); } }