Index: stable/6/sys/amd64/linux32/linux32_sysvec.c
===================================================================
--- stable/6/sys/amd64/linux32/linux32_sysvec.c	(revision 169910)
+++ stable/6/sys/amd64/linux32/linux32_sysvec.c	(revision 169911)
@@ -1,1093 +1,1086 @@
 /*-
  * Copyright (c) 2004 Tim J. Robbins
  * Copyright (c) 2003 Peter Wemm
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 1998-1999 Andrew Gallatin
  * Copyright (c) 1994-1996 Søren Schmidt
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /* XXX we use functions that might not exist. */
 #include "opt_compat.h"
 
 #ifndef COMPAT_43
 #error "Unable to compile Linux-emulator due to missing COMPAT_43 option!"
 #endif
 #ifndef COMPAT_IA32
 #error "Unable to compile Linux-emulator due to missing COMPAT_IA32 option!"
 #endif
 
 #define	__ELF_WORD_SIZE	32
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/sysctl.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 
 #include <machine/cpu.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 
 #include <amd64/linux32/linux.h>
 #include <amd64/linux32/linux32_proto.h>
 #include <compat/linux/linux_mib.h>
 #include <compat/linux/linux_signal.h>
 #include <compat/linux/linux_util.h>
 
 MODULE_VERSION(linux, 1);
 MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
 MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
 MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
 
 MALLOC_DEFINE(M_LINUX, "linux", "Linux mode structures");
 
 #define	AUXARGS_ENTRY_32(pos, id, val)	\
 	do {				\
 		suword32(pos++, id);	\
 		suword32(pos++, val);	\
 	} while (0)
 
 #if BYTE_ORDER == LITTLE_ENDIAN
 #define SHELLMAGIC      0x2123 /* #! */
 #else
 #define SHELLMAGIC      0x2321
 #endif
 
 /*
  * Allow the sendsig functions to use the ldebug() facility
  * even though they are not syscalls themselves. Map them
  * to syscall 0. This is slightly less bogus than using
  * ldebug(sigreturn).
  */
 #define	LINUX_SYS_linux_rt_sendsig	0
 #define	LINUX_SYS_linux_sendsig		0
 
 extern char linux_sigcode[];
 extern int linux_szsigcode;
 
 extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
 
 SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler);
 SET_DECLARE(linux_device_handler_set, struct linux_device_handler);
 
 static int	elf_linux_fixup(register_t **stack_base,
 		    struct image_params *iparams);
 static register_t *linux_copyout_strings(struct image_params *imgp);
 static void	linux_prepsyscall(struct trapframe *tf, int *args, u_int *code,
 		    caddr_t *params);
 static void     linux_sendsig(sig_t catcher, int sig, sigset_t *mask,
 		    u_long code);
 static void	exec_linux_setregs(struct thread *td, u_long entry,
 				   u_long stack, u_long ps_strings);
-static void	linux32_fixlimits(struct image_params *imgp);
+static void	linux32_fixlimit(struct rlimit *rl, int which);
 
 /*
  * Linux syscalls return negative errno's, we do positive and map them
  */
 static int bsd_to_linux_errno[ELAST + 1] = {
 	-0,  -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9,
 	-10, -35, -12, -13, -14, -15, -16, -17, -18, -19,
 	-20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
 	-30, -31, -32, -33, -34, -11,-115,-114, -88, -89,
 	-90, -91, -92, -93, -94, -95, -96, -97, -98, -99,
 	-100,-101,-102,-103,-104,-105,-106,-107,-108,-109,
 	-110,-111, -40, -36,-112,-113, -39, -11, -87,-122,
 	-116, -66,  -6,  -6,  -6,  -6,  -6, -37, -38,  -9,
 	-6, -6, -43, -42, -75, -6, -84
 };
 
 int bsd_to_linux_signal[LINUX_SIGTBLSZ] = {
 	LINUX_SIGHUP, LINUX_SIGINT, LINUX_SIGQUIT, LINUX_SIGILL,
 	LINUX_SIGTRAP, LINUX_SIGABRT, 0, LINUX_SIGFPE,
 	LINUX_SIGKILL, LINUX_SIGBUS, LINUX_SIGSEGV, LINUX_SIGSYS,
 	LINUX_SIGPIPE, LINUX_SIGALRM, LINUX_SIGTERM, LINUX_SIGURG,
 	LINUX_SIGSTOP, LINUX_SIGTSTP, LINUX_SIGCONT, LINUX_SIGCHLD,
 	LINUX_SIGTTIN, LINUX_SIGTTOU, LINUX_SIGIO, LINUX_SIGXCPU,
 	LINUX_SIGXFSZ, LINUX_SIGVTALRM, LINUX_SIGPROF, LINUX_SIGWINCH,
 	0, LINUX_SIGUSR1, LINUX_SIGUSR2
 };
 
 int linux_to_bsd_signal[LINUX_SIGTBLSZ] = {
 	SIGHUP, SIGINT, SIGQUIT, SIGILL,
 	SIGTRAP, SIGABRT, SIGBUS, SIGFPE,
 	SIGKILL, SIGUSR1, SIGSEGV, SIGUSR2,
 	SIGPIPE, SIGALRM, SIGTERM, SIGBUS,
 	SIGCHLD, SIGCONT, SIGSTOP, SIGTSTP,
 	SIGTTIN, SIGTTOU, SIGURG, SIGXCPU,
 	SIGXFSZ, SIGVTALRM, SIGPROF, SIGWINCH,
 	SIGIO, SIGURG, SIGSYS
 };
 
 #define LINUX_T_UNKNOWN  255
 static int _bsd_to_linux_trapcode[] = {
 	LINUX_T_UNKNOWN,	/* 0 */
 	6,			/* 1  T_PRIVINFLT */
 	LINUX_T_UNKNOWN,	/* 2 */
 	3,			/* 3  T_BPTFLT */
 	LINUX_T_UNKNOWN,	/* 4 */
 	LINUX_T_UNKNOWN,	/* 5 */
 	16,			/* 6  T_ARITHTRAP */
 	254,			/* 7  T_ASTFLT */
 	LINUX_T_UNKNOWN,	/* 8 */
 	13,			/* 9  T_PROTFLT */
 	1,			/* 10 T_TRCTRAP */
 	LINUX_T_UNKNOWN,	/* 11 */
 	14,			/* 12 T_PAGEFLT */
 	LINUX_T_UNKNOWN,	/* 13 */
 	17,			/* 14 T_ALIGNFLT */
 	LINUX_T_UNKNOWN,	/* 15 */
 	LINUX_T_UNKNOWN,	/* 16 */
 	LINUX_T_UNKNOWN,	/* 17 */
 	0,			/* 18 T_DIVIDE */
 	2,			/* 19 T_NMI */
 	4,			/* 20 T_OFLOW */
 	5,			/* 21 T_BOUND */
 	7,			/* 22 T_DNA */
 	8,			/* 23 T_DOUBLEFLT */
 	9,			/* 24 T_FPOPFLT */
 	10,			/* 25 T_TSSFLT */
 	11,			/* 26 T_SEGNPFLT */
 	12,			/* 27 T_STKFLT */
 	18,			/* 28 T_MCHK */
 	19,			/* 29 T_XMMFLT */
 	15			/* 30 T_RESERVED */
 };
 #define bsd_to_linux_trapcode(code) \
     ((code)<sizeof(_bsd_to_linux_trapcode)/sizeof(*_bsd_to_linux_trapcode)? \
      _bsd_to_linux_trapcode[(code)]: \
      LINUX_T_UNKNOWN)
 
 struct linux32_ps_strings {
 	u_int32_t ps_argvstr;	/* first of 0 or more argument strings */
 	u_int ps_nargvstr;	/* the number of argument strings */
 	u_int32_t ps_envstr;	/* first of 0 or more environment strings */
 	u_int ps_nenvstr;	/* the number of environment strings */
 };
 
 /*
  * If FreeBSD & Linux have a difference of opinion about what a trap
  * means, deal with it here.
  *
  * MPSAFE
  */
 static int
 translate_traps(int signal, int trap_code)
 {
 	if (signal != SIGBUS)
 		return signal;
 	switch (trap_code) {
 	case T_PROTFLT:
 	case T_TSSFLT:
 	case T_DOUBLEFLT:
 	case T_PAGEFLT:
 		return SIGSEGV;
 	default:
 		return signal;
 	}
 }
 
 static int
 elf_linux_fixup(register_t **stack_base, struct image_params *imgp)
 {
 	Elf32_Auxargs *args;
 	Elf32_Addr *base;
 	Elf32_Addr *pos;
 
 	KASSERT(curthread->td_proc == imgp->proc &&
 	    (curthread->td_proc->p_flag & P_SA) == 0,
 	    ("unsafe elf_linux_fixup(), should be curproc"));
 	base = (Elf32_Addr *)*stack_base;
 	args = (Elf32_Auxargs *)imgp->auxargs;
 	pos = base + (imgp->args->argc + imgp->args->envc + 2);
 
 	if (args->trace)
 		AUXARGS_ENTRY_32(pos, AT_DEBUG, 1);
 	if (args->execfd != -1)
 		AUXARGS_ENTRY_32(pos, AT_EXECFD, args->execfd);
 	AUXARGS_ENTRY_32(pos, AT_PHDR, args->phdr);
 	AUXARGS_ENTRY_32(pos, AT_PHENT, args->phent);
 	AUXARGS_ENTRY_32(pos, AT_PHNUM, args->phnum);
 	AUXARGS_ENTRY_32(pos, AT_PAGESZ, args->pagesz);
 	AUXARGS_ENTRY_32(pos, AT_FLAGS, args->flags);
 	AUXARGS_ENTRY_32(pos, AT_ENTRY, args->entry);
 	AUXARGS_ENTRY_32(pos, AT_BASE, args->base);
 	AUXARGS_ENTRY_32(pos, AT_UID, imgp->proc->p_ucred->cr_ruid);
 	AUXARGS_ENTRY_32(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid);
 	AUXARGS_ENTRY_32(pos, AT_GID, imgp->proc->p_ucred->cr_rgid);
 	AUXARGS_ENTRY_32(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid);
 	AUXARGS_ENTRY_32(pos, AT_NULL, 0);
 
 	free(imgp->auxargs, M_TEMP);
 	imgp->auxargs = NULL;
 
 	base--;
 	suword32(base, (uint32_t)imgp->args->argc);
 	*stack_base = (register_t *)base;
 	return 0;
 }
 
 extern int _ucodesel, _ucode32sel, _udatasel;
 extern unsigned long linux_sznonrtsigcode;
 
 static void
 linux_rt_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	struct l_rt_sigframe *fp, frame;
 	int oonstack;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 #ifdef DEBUG
 	if (ldebug(rt_sendsig))
 		printf(ARGS(rt_sendsig, "%p, %d, %p, %lu"),
 		    catcher, sig, (void*)mask, code);
 #endif
 	/*
 	 * Allocate space for the signal handler context.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct l_rt_sigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct l_rt_sigframe));
 	} else
 		fp = (struct l_rt_sigframe *)regs->tf_rsp - 1;
 	mtx_unlock(&psp->ps_mtx);
 
 	/*
 	 * Build the argument list for the signal handler.
 	 */
 	if (p->p_sysent->sv_sigtbl)
 		if (sig <= p->p_sysent->sv_sigsize)
 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	bzero(&frame, sizeof(frame));
 
 	frame.sf_handler = PTROUT(catcher);
 	frame.sf_sig = sig;
 	frame.sf_siginfo = PTROUT(&fp->sf_si);
 	frame.sf_ucontext = PTROUT(&fp->sf_sc);
 
 	/* Fill in POSIX parts */
 	frame.sf_si.lsi_signo = sig;
 	frame.sf_si.lsi_code = code;
 	frame.sf_si.lsi_addr = PTROUT(regs->tf_err);
 
 	/*
 	 * Build the signal context to be used by sigreturn.
 	 */
 	frame.sf_sc.uc_flags = 0;		/* XXX ??? */
 	frame.sf_sc.uc_link = 0;		/* XXX ??? */
 
 	frame.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp);
 	frame.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size;
 	frame.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
 	    ? ((oonstack) ? LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE;
 	PROC_UNLOCK(p);
 
 	bsd_to_linux_sigset(mask, &frame.sf_sc.uc_sigmask);
 
 	frame.sf_sc.uc_mcontext.sc_mask   = frame.sf_sc.uc_sigmask.__bits[0];
         frame.sf_sc.uc_mcontext.sc_gs     = rgs();
         frame.sf_sc.uc_mcontext.sc_fs     = rfs();
         __asm __volatile("movl %%es,%0" :
 	    "=rm" (frame.sf_sc.uc_mcontext.sc_es));
         __asm __volatile("movl %%ds,%0" :
 	    "=rm" (frame.sf_sc.uc_mcontext.sc_ds));
 	frame.sf_sc.uc_mcontext.sc_edi    = regs->tf_rdi;
 	frame.sf_sc.uc_mcontext.sc_esi    = regs->tf_rsi;
 	frame.sf_sc.uc_mcontext.sc_ebp    = regs->tf_rbp;
 	frame.sf_sc.uc_mcontext.sc_ebx    = regs->tf_rbx;
 	frame.sf_sc.uc_mcontext.sc_edx    = regs->tf_rdx;
 	frame.sf_sc.uc_mcontext.sc_ecx    = regs->tf_rcx;
 	frame.sf_sc.uc_mcontext.sc_eax    = regs->tf_rax;
 	frame.sf_sc.uc_mcontext.sc_eip    = regs->tf_rip;
 	frame.sf_sc.uc_mcontext.sc_cs     = regs->tf_cs;
 	frame.sf_sc.uc_mcontext.sc_eflags = regs->tf_rflags;
 	frame.sf_sc.uc_mcontext.sc_esp_at_signal = regs->tf_rsp;
 	frame.sf_sc.uc_mcontext.sc_ss     = regs->tf_ss;
 	frame.sf_sc.uc_mcontext.sc_err    = regs->tf_err;
 	frame.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code);
 
 #ifdef DEBUG
 	if (ldebug(rt_sendsig))
 		printf(LMSG("rt_sendsig flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
 		    frame.sf_sc.uc_stack.ss_flags, td->td_sigstk.ss_sp,
 		    td->td_sigstk.ss_size, frame.sf_sc.uc_mcontext.sc_mask);
 #endif
 
 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
 		/*
 		 * Process has trashed its stack; give it an illegal
 		 * instruction to halt it in its tracks.
 		 */
 #ifdef DEBUG
 		if (ldebug(rt_sendsig))
 			printf(LMSG("rt_sendsig: bad stack %p, oonstack=%x"),
 			    fp, oonstack);
 #endif
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Build context to run handler in.
 	 */
 	regs->tf_rsp = PTROUT(fp);
 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode) +
 	    linux_sznonrtsigcode;
 	regs->tf_rflags &= ~PSL_T;
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	load_ds(_udatasel);
 	td->td_pcb->pcb_ds = _udatasel;
 	load_es(_udatasel);
 	td->td_pcb->pcb_es = _udatasel;
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 
 /*
  * Send an interrupt to process.
  *
  * Stack is set up to allow sigcode stored
  * in u. to call routine, followed by kcall
  * to sigreturn routine below.  After sigreturn
  * resets the signal mask, the stack, and the
  * frame pointer, it returns to the user
  * specified pc, psl.
  */
 static void
 linux_sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
 {
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
 	struct sigacts *psp;
 	struct trapframe *regs;
 	struct l_sigframe *fp, frame;
 	l_sigset_t lmask;
 	int oonstack, i;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	psp = p->p_sigacts;
 	mtx_assert(&psp->ps_mtx, MA_OWNED);
 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
 		/* Signal handler installed with SA_SIGINFO. */
 		linux_rt_sendsig(catcher, sig, mask, code);
 		return;
 	}
 
 	regs = td->td_frame;
 	oonstack = sigonstack(regs->tf_rsp);
 
 #ifdef DEBUG
 	if (ldebug(sendsig))
 		printf(ARGS(sendsig, "%p, %d, %p, %lu"),
 		    catcher, sig, (void*)mask, code);
 #endif
 
 	/*
 	 * Allocate space for the signal handler context.
 	 */
 	if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
 		fp = (struct l_sigframe *)(td->td_sigstk.ss_sp +
 		    td->td_sigstk.ss_size - sizeof(struct l_sigframe));
 	} else
 		fp = (struct l_sigframe *)regs->tf_rsp - 1;
 	mtx_unlock(&psp->ps_mtx);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Build the argument list for the signal handler.
 	 */
 	if (p->p_sysent->sv_sigtbl)
 		if (sig <= p->p_sysent->sv_sigsize)
 			sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
 
 	bzero(&frame, sizeof(frame));
 
 	frame.sf_handler = PTROUT(catcher);
 	frame.sf_sig = sig;
 
 	bsd_to_linux_sigset(mask, &lmask);
 
 	/*
 	 * Build the signal context to be used by sigreturn.
 	 */
 	frame.sf_sc.sc_mask   = lmask.__bits[0];
         frame.sf_sc.sc_gs     = rgs();
         frame.sf_sc.sc_fs     = rfs();
         __asm __volatile("movl %%es,%0" : "=rm" (frame.sf_sc.sc_es));
         __asm __volatile("movl %%ds,%0" : "=rm" (frame.sf_sc.sc_ds));
 	frame.sf_sc.sc_edi    = regs->tf_rdi;
 	frame.sf_sc.sc_esi    = regs->tf_rsi;
 	frame.sf_sc.sc_ebp    = regs->tf_rbp;
 	frame.sf_sc.sc_ebx    = regs->tf_rbx;
 	frame.sf_sc.sc_edx    = regs->tf_rdx;
 	frame.sf_sc.sc_ecx    = regs->tf_rcx;
 	frame.sf_sc.sc_eax    = regs->tf_rax;
 	frame.sf_sc.sc_eip    = regs->tf_rip;
 	frame.sf_sc.sc_cs     = regs->tf_cs;
 	frame.sf_sc.sc_eflags = regs->tf_rflags;
 	frame.sf_sc.sc_esp_at_signal = regs->tf_rsp;
 	frame.sf_sc.sc_ss     = regs->tf_ss;
 	frame.sf_sc.sc_err    = regs->tf_err;
 	frame.sf_sc.sc_trapno = bsd_to_linux_trapcode(code);
 
 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
 		frame.sf_extramask[i] = lmask.__bits[i+1];
 
 	if (copyout(&frame, fp, sizeof(frame)) != 0) {
 		/*
 		 * Process has trashed its stack; give it an illegal
 		 * instruction to halt it in its tracks.
 		 */
 		PROC_LOCK(p);
 		sigexit(td, SIGILL);
 	}
 
 	/*
 	 * Build context to run handler in.
 	 */
 	regs->tf_rsp = PTROUT(fp);
 	regs->tf_rip = LINUX32_PS_STRINGS - *(p->p_sysent->sv_szsigcode);
 	regs->tf_rflags &= ~PSL_T;
 	regs->tf_cs = _ucode32sel;
 	regs->tf_ss = _udatasel;
 	load_ds(_udatasel);
 	td->td_pcb->pcb_ds = _udatasel;
 	load_es(_udatasel);
 	td->td_pcb->pcb_es = _udatasel;
 	PROC_LOCK(p);
 	mtx_lock(&psp->ps_mtx);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * psl to gain improper privileges or to cause
  * a machine fault.
  */
 int
 linux_sigreturn(struct thread *td, struct linux_sigreturn_args *args)
 {
 	struct proc *p = td->td_proc;
 	struct l_sigframe frame;
 	struct trapframe *regs;
 	l_sigset_t lmask;
 	int eflags, i;
 
 	regs = td->td_frame;
 
 #ifdef DEBUG
 	if (ldebug(sigreturn))
 		printf(ARGS(sigreturn, "%p"), (void *)args->sfp);
 #endif
 	/*
 	 * The trampoline code hands us the sigframe.
 	 * It is unsafe to keep track of it ourselves, in the event that a
 	 * program jumps out of a signal handler.
 	 */
 	if (copyin(args->sfp, &frame, sizeof(frame)) != 0)
 		return (EFAULT);
 
 	/*
 	 * Check for security violations.
 	 */
 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 	eflags = frame.sf_sc.sc_eflags;
 	/*
 	 * XXX do allow users to change the privileged flag PSL_RF.  The
 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
 	 * sometimes set it there too.  tf_eflags is kept in the signal
 	 * context during signal handling and there is no other place
 	 * to remember it, so the PSL_RF bit may be corrupted by the
 	 * signal handler without us knowing.  Corruption of the PSL_RF
 	 * bit at worst causes one more or one less debugger trap, so
 	 * allowing it is fairly harmless.
 	 */
 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
 		return(EINVAL);
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
 	if (!CS_SECURE(frame.sf_sc.sc_cs)) {
 		trapsignal(td, SIGBUS, T_PROTFLT);
 		return(EINVAL);
 	}
 
 	lmask.__bits[0] = frame.sf_sc.sc_mask;
 	for (i = 0; i < (LINUX_NSIG_WORDS-1); i++)
 		lmask.__bits[i+1] = frame.sf_extramask[i];
 	PROC_LOCK(p);
 	linux_to_bsd_sigset(&lmask, &td->td_sigmask);
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Restore signal context.
 	 */
 	/* Selectors were restored by the trampoline. */
 	regs->tf_rdi    = frame.sf_sc.sc_edi;
 	regs->tf_rsi    = frame.sf_sc.sc_esi;
 	regs->tf_rbp    = frame.sf_sc.sc_ebp;
 	regs->tf_rbx    = frame.sf_sc.sc_ebx;
 	regs->tf_rdx    = frame.sf_sc.sc_edx;
 	regs->tf_rcx    = frame.sf_sc.sc_ecx;
 	regs->tf_rax    = frame.sf_sc.sc_eax;
 	regs->tf_rip    = frame.sf_sc.sc_eip;
 	regs->tf_cs     = frame.sf_sc.sc_cs;
 	regs->tf_rflags = eflags;
 	regs->tf_rsp    = frame.sf_sc.sc_esp_at_signal;
 	regs->tf_ss     = frame.sf_sc.sc_ss;
 
 	return (EJUSTRETURN);
 }
 
 /*
  * System call to cleanup state after a signal
  * has been taken.  Reset signal mask and
  * stack state from context left by rt_sendsig (above).
  * Return to previous pc and psl as specified by
  * context left by sendsig. Check carefully to
  * make sure that the user has not modified the
  * psl to gain improper privileges or to cause
  * a machine fault.
  */
 int
 linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args)
 {
 	struct proc *p = td->td_proc;
 	struct l_ucontext uc;
 	struct l_sigcontext *context;
 	l_stack_t *lss;
 	stack_t ss;
 	struct trapframe *regs;
 	int eflags;
 
 	regs = td->td_frame;
 
 #ifdef DEBUG
 	if (ldebug(rt_sigreturn))
 		printf(ARGS(rt_sigreturn, "%p"), (void *)args->ucp);
 #endif
 	/*
 	 * The trampoline code hands us the ucontext.
 	 * It is unsafe to keep track of it ourselves, in the event that a
 	 * program jumps out of a signal handler.
 	 */
 	if (copyin(args->ucp, &uc, sizeof(uc)) != 0)
 		return (EFAULT);
 
 	context = &uc.uc_mcontext;
 
 	/*
 	 * Check for security violations.
 	 */
 #define	EFLAGS_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
 	eflags = context->sc_eflags;
 	/*
 	 * XXX do allow users to change the privileged flag PSL_RF.  The
 	 * cpu sets PSL_RF in tf_eflags for faults.  Debuggers should
 	 * sometimes set it there too.  tf_eflags is kept in the signal
 	 * context during signal handling and there is no other place
 	 * to remember it, so the PSL_RF bit may be corrupted by the
 	 * signal handler without us knowing.  Corruption of the PSL_RF
 	 * bit at worst causes one more or one less debugger trap, so
 	 * allowing it is fairly harmless.
 	 */
 	if (!EFLAGS_SECURE(eflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF))
 		return(EINVAL);
 
 	/*
 	 * Don't allow users to load a valid privileged %cs.  Let the
 	 * hardware check for invalid selectors, excess privilege in
 	 * other selectors, invalid %eip's and invalid %esp's.
 	 */
 #define	CS_SECURE(cs)	(ISPL(cs) == SEL_UPL)
 	if (!CS_SECURE(context->sc_cs)) {
 		trapsignal(td, SIGBUS, T_PROTFLT);
 		return(EINVAL);
 	}
 
 	PROC_LOCK(p);
 	linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask);
 	SIG_CANTMASK(td->td_sigmask);
 	signotify(td);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Restore signal context
 	 */
 	/* Selectors were restored by the trampoline. */
 	regs->tf_rdi    = context->sc_edi;
 	regs->tf_rsi    = context->sc_esi;
 	regs->tf_rbp    = context->sc_ebp;
 	regs->tf_rbx    = context->sc_ebx;
 	regs->tf_rdx    = context->sc_edx;
 	regs->tf_rcx    = context->sc_ecx;
 	regs->tf_rax    = context->sc_eax;
 	regs->tf_rip    = context->sc_eip;
 	regs->tf_cs     = context->sc_cs;
 	regs->tf_rflags = eflags;
 	regs->tf_rsp    = context->sc_esp_at_signal;
 	regs->tf_ss     = context->sc_ss;
 
 	/*
 	 * call sigaltstack & ignore results..
 	 */
 	lss = &uc.uc_stack;
 	ss.ss_sp = PTRIN(lss->ss_sp);
 	ss.ss_size = lss->ss_size;
 	ss.ss_flags = linux_to_bsd_sigaltstack(lss->ss_flags);
 
 #ifdef DEBUG
 	if (ldebug(rt_sigreturn))
 		printf(LMSG("rt_sigret flags: 0x%x, sp: %p, ss: 0x%lx, mask: 0x%x"),
 		    ss.ss_flags, ss.ss_sp, ss.ss_size, context->sc_mask);
 #endif
 	(void)kern_sigaltstack(td, &ss, NULL);
 
 	return (EJUSTRETURN);
 }
 
 /*
  * MPSAFE
  */
 static void
 linux_prepsyscall(struct trapframe *tf, int *args, u_int *code, caddr_t *params)
 {
 	args[0] = tf->tf_rbx;
 	args[1] = tf->tf_rcx;
 	args[2] = tf->tf_rdx;
 	args[3] = tf->tf_rsi;
 	args[4] = tf->tf_rdi;
 	args[5] = tf->tf_rbp;	/* Unconfirmed */
 	*params = NULL;		/* no copyin */
 }
 
 /*
  * If a linux binary is exec'ing something, try this image activator
  * first.  We override standard shell script execution in order to
  * be able to modify the interpreter path.  We only do this if a linux
  * binary is doing the exec, so we do not create an EXEC module for it.
  */
 static int	exec_linux_imgact_try(struct image_params *iparams);
 
 static int
 exec_linux_imgact_try(struct image_params *imgp)
 {
     const char *head = (const char *)imgp->image_header;
     char *rpath;
     int error = -1, len;
 
     /*
      * The interpreter for shell scripts run from a linux binary needs
      * to be located in /compat/linux if possible in order to recursively
      * maintain linux path emulation.
      */
     if (((const short *)head)[0] == SHELLMAGIC) {
 	    /*
 	     * Run our normal shell image activator.  If it succeeds attempt
 	     * to use the alternate path for the interpreter.  If an alternate
 	     * path is found, use our stringspace to store it.
 	     */
 	    if ((error = exec_shell_imgact(imgp)) == 0) {
 		    linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc),
 			imgp->interpreter_name, UIO_SYSSPACE, &rpath, 0);
 		    if (rpath != NULL) {
 			    len = strlen(rpath) + 1;
 
 			    if (len <= MAXSHELLCMDLEN) {
 				    memcpy(imgp->interpreter_name, rpath, len);
 			    }
 			    free(rpath, M_TEMP);
 		    }
 	    }
     }
     return(error);
 }
 
 /*
  * Clear registers on exec
  * XXX copied from ia32_signal.c.
  */
 static void
 exec_linux_setregs(td, entry, stack, ps_strings)
 	struct thread *td;
 	u_long entry;
 	u_long stack;
 	u_long ps_strings;
 {
 	struct trapframe *regs = td->td_frame;
 	struct pcb *pcb = td->td_pcb;
 
 	wrmsr(MSR_FSBASE, 0);
 	wrmsr(MSR_KGSBASE, 0);	/* User value while we're in the kernel */
 	pcb->pcb_fsbase = 0;
 	pcb->pcb_gsbase = 0;
 	load_ds(_udatasel);
 	load_es(_udatasel);
 	load_fs(_udatasel);
 	load_gs(0);
 	pcb->pcb_ds = _udatasel;
 	pcb->pcb_es = _udatasel;
 	pcb->pcb_fs = _udatasel;
 	pcb->pcb_gs = 0;
 
 	bzero((char *)regs, sizeof(struct trapframe));
 	regs->tf_rip = entry;
 	regs->tf_rsp = stack;
 	regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
 	regs->tf_ss = _udatasel;
 	regs->tf_cs = _ucode32sel;
 	regs->tf_rbx = ps_strings;
 	load_cr0(rcr0() | CR0_MP | CR0_TS);
 	fpstate_drop(td);
 
 	/* Return via doreti so that we can change to a different %cs */
 	pcb->pcb_flags |= PCB_FULLCTX;
 	td->td_retval[1] = 0;
 }
 
 /*
  * XXX copied from ia32_sysvec.c.
  */
 static register_t *
 linux_copyout_strings(struct image_params *imgp)
 {
 	int argc, envc;
 	u_int32_t *vectp;
 	char *stringp, *destp;
 	u_int32_t *stack_base;
 	struct linux32_ps_strings *arginfo;
 	int sigcodesz;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	arginfo = (struct linux32_ps_strings *)LINUX32_PS_STRINGS;
 	sigcodesz = *(imgp->proc->p_sysent->sv_szsigcode);
 	destp =	(caddr_t)arginfo - sigcodesz - SPARE_USRSPACE -
 		roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
 
 	/*
 	 * install sigcode
 	 */
 	if (sigcodesz)
 		copyout(imgp->proc->p_sysent->sv_sigcode,
 			((caddr_t)arginfo - sigcodesz), sigcodesz);
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
 			: (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets,and imgp->auxarg_size is room
 		 * for argument of Runtime loader.
 		 */
 		vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 +
 				       imgp->auxarg_size) * sizeof(u_int32_t));
 
 	} else
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (u_int32_t *)
 			(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 */
 	for (; argc > 0; --argc) {
 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword32(vectp++, 0);
 
 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword32(vectp, 0);
 
 	return ((register_t *)stack_base);
 }
 
 SYSCTL_NODE(_compat, OID_AUTO, linux32, CTLFLAG_RW, 0,
     "32-bit Linux emulation");
 
 static u_long	linux32_maxdsiz = LINUX32_MAXDSIZ;
 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxdsiz, CTLFLAG_RW,
     &linux32_maxdsiz, 0, "");
 static u_long	linux32_maxssiz = LINUX32_MAXSSIZ;
 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxssiz, CTLFLAG_RW,
     &linux32_maxssiz, 0, "");
 static u_long	linux32_maxvmem = LINUX32_MAXVMEM;
 SYSCTL_ULONG(_compat_linux32, OID_AUTO, maxvmem, CTLFLAG_RW,
     &linux32_maxvmem, 0, "");
 
-/*
- * XXX copied from ia32_sysvec.c.
- */
 static void
-linux32_fixlimits(struct image_params *imgp)
+linux32_fixlimit(struct rlimit *rl, int which)
 {
-	struct proc *p = imgp->proc;
-	struct plimit *oldlim, *newlim;
 
-	if (linux32_maxdsiz == 0 && linux32_maxssiz == 0 &&
-	    linux32_maxvmem == 0)
-		return;
-	newlim = lim_alloc();
-	PROC_LOCK(p);
-	oldlim = p->p_limit;
-	lim_copy(newlim, oldlim);
-	if (linux32_maxdsiz != 0) {
-		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > linux32_maxdsiz)
-		    newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = linux32_maxdsiz;
-		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > linux32_maxdsiz)
-		    newlim->pl_rlimit[RLIMIT_DATA].rlim_max = linux32_maxdsiz;
+	switch (which) {
+	case RLIMIT_DATA:	/* compat.linux32.maxdsiz; 0 = no cap */
+		if (linux32_maxdsiz != 0) {
+			if (rl->rlim_cur > linux32_maxdsiz)
+				rl->rlim_cur = linux32_maxdsiz;
+			if (rl->rlim_max > linux32_maxdsiz)
+				rl->rlim_max = linux32_maxdsiz;
+		}
+		break;
+	case RLIMIT_STACK:	/* compat.linux32.maxssiz; 0 = no cap */
+		if (linux32_maxssiz != 0) {
+			if (rl->rlim_cur > linux32_maxssiz)
+				rl->rlim_cur = linux32_maxssiz;
+			if (rl->rlim_max > linux32_maxssiz)
+				rl->rlim_max = linux32_maxssiz;
+		}
+		break;
+	case RLIMIT_VMEM:	/* compat.linux32.maxvmem; 0 = no cap */
+		if (linux32_maxvmem != 0) {
+			if (rl->rlim_cur > linux32_maxvmem)
+				rl->rlim_cur = linux32_maxvmem;
+			if (rl->rlim_max > linux32_maxvmem)
+				rl->rlim_max = linux32_maxvmem;
+		}
+		break;
 	}
-	if (linux32_maxssiz != 0) {
-		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > linux32_maxssiz)
-		    newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = linux32_maxssiz;
-		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > linux32_maxssiz)
-		    newlim->pl_rlimit[RLIMIT_STACK].rlim_max = linux32_maxssiz;
-	}
-	if (linux32_maxvmem != 0) {
-		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > linux32_maxvmem)
-		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = linux32_maxvmem;
-		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > linux32_maxvmem)
-		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = linux32_maxvmem;
-	}
-	p->p_limit = newlim;
-	PROC_UNLOCK(p);
-	lim_free(oldlim);
 }
 
 struct sysentvec elf_linux_sysvec = {
 	LINUX_SYS_MAXSYSCALL,
 	linux_sysent,
 	0xff,
 	LINUX_SIGTBLSZ,
 	bsd_to_linux_signal,
 	ELAST + 1,
 	bsd_to_linux_errno,
 	translate_traps,
 	elf_linux_fixup,
 	linux_sendsig,
 	linux_sigcode,
 	&linux_szsigcode,
 	linux_prepsyscall,
 	"Linux ELF32",
 	elf32_coredump,
 	exec_linux_imgact_try,
 	LINUX_MINSIGSTKSZ,
 	PAGE_SIZE,
 	VM_MIN_ADDRESS,
 	LINUX32_USRSTACK,
 	LINUX32_USRSTACK,
 	LINUX32_PS_STRINGS,
 	VM_PROT_ALL,
 	linux_copyout_strings,
 	exec_linux_setregs,
-	linux32_fixlimits
+	linux32_fixlimit
 };
 
 static Elf32_Brandinfo linux_brand = {
 					ELFOSABI_LINUX,
 					EM_386,
 					"Linux",
 					"/compat/linux",
 					"/lib/ld-linux.so.1",
 					&elf_linux_sysvec,
 					NULL,
 				 };
 
 static Elf32_Brandinfo linux_glibc2brand = {
 					ELFOSABI_LINUX,
 					EM_386,
 					"Linux",
 					"/compat/linux",
 					"/lib/ld-linux.so.2",
 					&elf_linux_sysvec,
 					NULL,
 				 };
 
 Elf32_Brandinfo *linux_brandlist[] = {
 					&linux_brand,
 					&linux_glibc2brand,
 					NULL
 				};
 
 static int
 linux_elf_modevent(module_t mod, int type, void *data)
 {
 	Elf32_Brandinfo **brandinfo;
 	int error;
 	struct linux_ioctl_handler **lihp;
 	struct linux_device_handler **ldhp;
 
 	error = 0;
 
 	switch(type) {
 	case MOD_LOAD:
 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
 		     ++brandinfo)
 			if (elf32_insert_brand_entry(*brandinfo) < 0)
 				error = EINVAL;
 		if (error == 0) {
 			SET_FOREACH(lihp, linux_ioctl_handler_set)
 				linux_ioctl_register_handler(*lihp);
 			SET_FOREACH(ldhp, linux_device_handler_set)
 				linux_device_register_handler(*ldhp);
 			if (bootverbose)
 				printf("Linux ELF exec handler installed\n");
 		} else
 			printf("cannot insert Linux ELF brand handler\n");
 		break;
 	case MOD_UNLOAD:
 		for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL;
 		     ++brandinfo)
 			if (elf32_brand_inuse(*brandinfo))
 				error = EBUSY;
 		if (error == 0) {
 			for (brandinfo = &linux_brandlist[0];
 			     *brandinfo != NULL; ++brandinfo)
 				if (elf32_remove_brand_entry(*brandinfo) < 0)
 					error = EINVAL;
 		}
 		if (error == 0) {
 			SET_FOREACH(lihp, linux_ioctl_handler_set)
 				linux_ioctl_unregister_handler(*lihp);
 			SET_FOREACH(ldhp, linux_device_handler_set)
 				linux_device_unregister_handler(*ldhp);
 			if (bootverbose)
 				printf("Linux ELF exec handler removed\n");
 		} else
 			printf("Could not deinstall ELF interpreter entry\n");
 		break;
 	default:
 		break;
 	}
 	return error;
 }
 
 static moduledata_t linux_elf_mod = {
 	"linuxelf",
 	linux_elf_modevent,
 	0
 };
 
 DECLARE_MODULE(linuxelf, linux_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY);
Index: stable/6/sys/compat/ia32/ia32_sysvec.c
===================================================================
--- stable/6/sys/compat/ia32/ia32_sysvec.c	(revision 169910)
+++ stable/6/sys/compat/ia32/ia32_sysvec.c	(revision 169911)
@@ -1,313 +1,310 @@
 /*-
  * Copyright (c) 2002 Doug Rabson
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #define __ELF_WORD_SIZE 32
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/proc.h>
 #include <sys/procfs.h>
 #include <sys/resourcevar.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 #include <sys/imgact_elf.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32_proto.h>
 #include <compat/freebsd32/freebsd32_syscall.h>
 #include <compat/ia32/ia32_signal.h>
 #ifdef __amd64__
 #include <machine/psl.h>
 #include <machine/segments.h>
 #include <machine/specialreg.h>
 #else
 #include <i386/include/psl.h>
 #include <i386/include/segments.h>
 #include <i386/include/specialreg.h>
 #endif
 #include <machine/frame.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/cpufunc.h>
 
 /*
  * Build-time checks that the ia32 signal/context structures have the
  * exact sizes listed here; a mismatch fails the compile.
  * NOTE(review): the constants presumably mirror the native i386 layouts --
  * confirm against the i386 headers before changing any of these structs.
  */
 CTASSERT(sizeof(struct ia32_mcontext) == 640);
 CTASSERT(sizeof(struct ia32_ucontext) == 704);
 CTASSERT(sizeof(struct ia32_sigframe) == 800);
 CTASSERT(sizeof(struct ia32_siginfo) == 64);
 #ifdef COMPAT_FREEBSD4
 /* FreeBSD 4.x compatibility variants of the same structures. */
 CTASSERT(sizeof(struct ia32_mcontext4) == 260);
 CTASSERT(sizeof(struct ia32_ucontext4) == 324);
 CTASSERT(sizeof(struct ia32_sigframe4) == 408);
 #endif
 
 static register_t *ia32_copyout_strings(struct image_params *imgp);
-static void ia32_fixlimits(struct image_params *imgp);
+static void ia32_fixlimit(struct rlimit *rl, int which);
 
 extern struct sysent freebsd32_sysent[];
 
 SYSCTL_NODE(_compat, OID_AUTO, ia32, CTLFLAG_RW, 0, "ia32 mode");
 
 struct sysentvec ia32_freebsd_sysvec = {
 	FREEBSD32_SYS_MAXSYSCALL,
 	freebsd32_sysent,
 	0,
 	0,
 	NULL,
 	0,
 	NULL,
 	NULL,
 	elf32_freebsd_fixup,
 	ia32_sendsig,
 	ia32_sigcode,
 	&sz_ia32_sigcode,
 	NULL,
 	"FreeBSD ELF32",
 	elf32_coredump,
 	NULL,
 	MINSIGSTKSZ,
 	IA32_PAGE_SIZE,
 	0,
 	FREEBSD32_USRSTACK,
 	FREEBSD32_USRSTACK,
 	FREEBSD32_PS_STRINGS,
 	VM_PROT_ALL,
 	ia32_copyout_strings,
 	ia32_setregs,
-	ia32_fixlimits
+	ia32_fixlimit
 };
 
 
 /*
  * Brand entry for 32-bit FreeBSD/i386 binaries, bound to
  * ia32_freebsd_sysvec.
  * NOTE(review): the initializer is positional; judging by how
  * imgact_elf.c uses Elf_Brandinfo the slots are presumably brand, machine,
  * compat_3_brand, emul_path, interp_path, sysvec, interp_newpath --
  * confirm against the struct declaration.
  */
 static Elf32_Brandinfo ia32_brand_info = {
 						ELFOSABI_FREEBSD,
 						EM_386,
 						"FreeBSD",
 						NULL,
 						"/libexec/ld-elf.so.1",
 						&ia32_freebsd_sysvec,
 						"/libexec/ld-elf32.so.1",
 					  };
 
 /* Insert the brand into the 32-bit ELF brand list at SI_SUB_EXEC. */
 SYSINIT(ia32, SI_SUB_EXEC, SI_ORDER_ANY,
 	(sysinit_cfunc_t) elf32_insert_brand_entry,
 	&ia32_brand_info);
 
 /*
  * Second brand entry for 32-bit FreeBSD/i386 binaries.  It is identical
  * to ia32_brand_info except for the "/usr/libexec/ld-elf.so.1" string,
  * presumably the older interpreter location (slot meaning inferred from
  * the positional initializer -- confirm against the struct declaration).
  */
 static Elf32_Brandinfo ia32_brand_oinfo = {
 						ELFOSABI_FREEBSD,
 						EM_386,
 						"FreeBSD",
 						NULL,
 						"/usr/libexec/ld-elf.so.1",
 						&ia32_freebsd_sysvec,
 						"/libexec/ld-elf32.so.1",
 					  };
 
 /* Insert this brand into the 32-bit ELF brand list at SI_SUB_EXEC as well. */
 SYSINIT(oia32, SI_SUB_EXEC, SI_ORDER_ANY,
 	(sysinit_cfunc_t) elf32_insert_brand_entry,
 	&ia32_brand_oinfo);
 
 
 /*
  * Empty implementation: none of the arguments are used and nothing is
  * written.
  */
 void
 elf32_dump_thread(struct thread *td __unused,
     void *dst __unused, size_t *off __unused)
 {
 	/* Intentionally empty. */
 }
 
 
 /* XXX may be freebsd32 MI */
 /*
  * Lay out the initial user stack of a 32-bit process image.
  *
  * Working downwards from FREEBSD32_PS_STRINGS this copies out the signal
  * trampoline, the argument/environment strings from imgp->args, and the
  * argv/envp vector tables (as 32-bit pointers), and fills in the
  * freebsd32_ps_strings structure.  When imgp->auxargs is set, extra room
  * for the ELF aux vector is reserved in the vector table area.
  *
  * Returns the address of the vector table, which doubles as the initial
  * stack base.  The results of copyout() and suword32() are not checked.
  */
 static register_t *
 ia32_copyout_strings(struct image_params *imgp)
 {
 	int argc, envc;
 	u_int32_t *vectp;
 	char *stringp, *destp;
 	u_int32_t *stack_base;
 	struct freebsd32_ps_strings *arginfo;
 	int szsigcode;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	arginfo = (struct freebsd32_ps_strings *)FREEBSD32_PS_STRINGS;
 	szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
 	/* Strings go below the sigcode and a spare gap, pointer-aligned. */
 	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
 		roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
 
 	/*
 	 * Install the signal trampoline directly below ps_strings.
 	 */
 	if (szsigcode)
 		copyout(imgp->proc->p_sysent->sv_sigcode,
 			((caddr_t)arginfo - szsigcode), szsigcode);
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * When no aux vector size was supplied, fall back to
 		 * 'AT_COUNT * 2' entries for the ELF auxargs data (kept
 		 * for compatibility).
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
 			: (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets, and imgp->auxarg_size is room
 		 * for the arguments of the runtime loader.
 		 */
 		vectp = (u_int32_t *) (destp - (imgp->args->argc + imgp->args->envc + 2 +
 				       imgp->auxarg_size) * sizeof(u_int32_t));
 
 	} else
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (u_int32_t *)
 			(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.  The kernel copy of
 	 * the strings (stringp) is walked to find each terminator while
 	 * destp tracks the matching user-space address.
 	 */
 	for (; argc > 0; --argc) {
 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword32(vectp++, 0);
 
 	suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
 	suword32(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword32(vectp++, (u_int32_t)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword32(vectp, 0);
 
 	return ((register_t *)stack_base);
 }
 
 /*
  * Run-time tunable ceilings for the data, stack and total virtual memory
  * resource limits of 32-bit processes, exported read/write under the
  * compat.ia32 sysctl node.  A value of 0 disables the corresponding
  * ceiling (see the != 0 tests in the limit fix-up routine below).
  */
 static u_long	ia32_maxdsiz = IA32_MAXDSIZ;
 SYSCTL_ULONG(_compat_ia32, OID_AUTO, maxdsiz, CTLFLAG_RW, &ia32_maxdsiz, 0, "");
 static u_long	ia32_maxssiz = IA32_MAXSSIZ;
 SYSCTL_ULONG(_compat_ia32, OID_AUTO, maxssiz, CTLFLAG_RW, &ia32_maxssiz, 0, "");
 static u_long	ia32_maxvmem = IA32_MAXVMEM;
 SYSCTL_ULONG(_compat_ia32, OID_AUTO, maxvmem, CTLFLAG_RW, &ia32_maxvmem, 0, "");
 
 static void
-ia32_fixlimits(struct image_params *imgp)
+ia32_fixlimit(struct rlimit *rl, int which)
 {
-	struct proc *p = imgp->proc;
-	struct plimit *oldlim, *newlim;
 
-	if (ia32_maxdsiz == 0 && ia32_maxssiz == 0 && ia32_maxvmem == 0)
-		return;
-	newlim = lim_alloc();
-	PROC_LOCK(p);
-	oldlim = p->p_limit;
-	lim_copy(newlim, oldlim);
-	if (ia32_maxdsiz != 0) {
-		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_cur > ia32_maxdsiz)
-		    newlim->pl_rlimit[RLIMIT_DATA].rlim_cur = ia32_maxdsiz;
-		if (newlim->pl_rlimit[RLIMIT_DATA].rlim_max > ia32_maxdsiz)
-		    newlim->pl_rlimit[RLIMIT_DATA].rlim_max = ia32_maxdsiz;
+	switch (which) {
+	case RLIMIT_DATA:
+		if (ia32_maxdsiz != 0) {
+			if (rl->rlim_cur > ia32_maxdsiz)
+				rl->rlim_cur = ia32_maxdsiz;
+			if (rl->rlim_max > ia32_maxdsiz)
+				rl->rlim_max = ia32_maxdsiz;
+		}
+		break;
+	case RLIMIT_STACK:
+		if (ia32_maxssiz != 0) {
+			if (rl->rlim_cur > ia32_maxssiz)
+				rl->rlim_cur = ia32_maxssiz;
+			if (rl->rlim_max > ia32_maxssiz)
+				rl->rlim_max = ia32_maxssiz;
+		}
+		break;
+	case RLIMIT_VMEM:
+		if (ia32_maxvmem != 0) {
+			if (rl->rlim_cur > ia32_maxvmem)
+				rl->rlim_cur = ia32_maxvmem;
+			if (rl->rlim_max > ia32_maxvmem)
+				rl->rlim_max = ia32_maxvmem;
+		}
+		break;
 	}
-	if (ia32_maxssiz != 0) {
-		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_cur > ia32_maxssiz)
-		    newlim->pl_rlimit[RLIMIT_STACK].rlim_cur = ia32_maxssiz;
-		if (newlim->pl_rlimit[RLIMIT_STACK].rlim_max > ia32_maxssiz)
-		    newlim->pl_rlimit[RLIMIT_STACK].rlim_max = ia32_maxssiz;
-	}
-	if (ia32_maxvmem != 0) {
-		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur > ia32_maxvmem)
-		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_cur = ia32_maxvmem;
-		if (newlim->pl_rlimit[RLIMIT_VMEM].rlim_max > ia32_maxvmem)
-		    newlim->pl_rlimit[RLIMIT_VMEM].rlim_max = ia32_maxvmem;
-	}
-	p->p_limit = newlim;
-	PROC_UNLOCK(p);
-	lim_free(oldlim);
 }
Index: stable/6/sys/kern/imgact_elf.c
===================================================================
--- stable/6/sys/kern/imgact_elf.c	(revision 169910)
+++ stable/6/sys/kern/imgact_elf.c	(revision 169911)
@@ -1,1295 +1,1295 @@
 /*-
  * Copyright (c) 2000 David O'Brien
  * Copyright (c) 1995-1996 Søren Schmidt
  * Copyright (c) 1996 Peter Wemm
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer
  *    in this position and unchanged.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/exec.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/mman.h>
 #include <sys/namei.h>
 #include <sys/pioctl.h>
 #include <sys/proc.h>
 #include <sys/procfs.h>
 #include <sys/resourcevar.h>
 #include <sys/sf_buf.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscall.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 
 #include <machine/elf.h>
 #include <machine/md_var.h>
 
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 #include <machine/fpu.h>
 #include <compat/ia32/ia32_reg.h>
 #endif
 
 #define OLD_EI_BRAND	8
 
 static int __elfN(check_header)(const Elf_Ehdr *hdr);
 static Elf_Brandinfo *__elfN(get_brandinfo)(const Elf_Ehdr *hdr,
     const char *interp);
 static int __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
     u_long *entry, size_t pagesize);
 static int __elfN(load_section)(struct vmspace *vmspace, vm_object_t object,
     vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz,
     vm_prot_t prot, size_t pagesize);
 static int __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp);
 
 /* Per-word-size sysctl node under kern (name built from __ELF_WORD_SIZE). */
 SYSCTL_NODE(_kern, OID_AUTO, __CONCAT(elf, __ELF_WORD_SIZE), CTLFLAG_RW, 0,
     "");
 
 /*
  * Brand value that get_brandinfo() falls back to when neither the ELF
  * header nor the interpreter path identifies a brand.  Starts at -1 and is
  * settable both as a sysctl and as a loader tunable.
  */
 int __elfN(fallback_brand) = -1;
 SYSCTL_INT(__CONCAT(_kern_elf, __ELF_WORD_SIZE), OID_AUTO,
     fallback_brand, CTLFLAG_RW, &__elfN(fallback_brand), 0,
     __XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE)) " brand of last resort");
 TUNABLE_INT("kern.elf" __XSTRING(__ELF_WORD_SIZE) ".fallback_brand",
     &__elfN(fallback_brand));
 
 /* Copied into the auxargs 'trace' field by the image activator. */
 static int elf_trace = 0;
 SYSCTL_INT(_debug, OID_AUTO, __elfN(trace), CTLFLAG_RW, &elf_trace, 0, "");
 
 /* Debug knob; its consumer is outside this part of the file. */
 static int elf_legacy_coredump = 0;
 SYSCTL_INT(_debug, OID_AUTO, __elfN(legacy_coredump), CTLFLAG_RW, 
     &elf_legacy_coredump, 0, "");
 
 /* Table of registered brands; a NULL slot is free. */
 static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
 
 /*
  * Register a brand by storing 'entry' in the first free slot of
  * elf_brand_list.
  *
  * Returns 0 on success, or -1 when all MAX_BRANDS slots are occupied.
  */
 int
 __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
 {
 	int slot;
 
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		if (elf_brand_list[slot] != NULL)
 			continue;
 		elf_brand_list[slot] = entry;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Unregister a brand by clearing the first elf_brand_list slot that
  * holds 'entry'.
  *
  * Returns 0 on success, or -1 when 'entry' is not in the list.
  */
 int
 __elfN(remove_brand_entry)(Elf_Brandinfo *entry)
 {
 	int slot;
 
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		if (elf_brand_list[slot] != entry)
 			continue;
 		elf_brand_list[slot] = NULL;
 		return (0);
 	}
 	return (-1);
 }
 
 /*
  * Report whether any process on the allproc list is currently running
  * with the sysentvec that belongs to 'entry'.
  *
  * Returns TRUE if such a process exists, FALSE otherwise.  The list is
  * walked under a shared hold of allproc_lock.
  */
 int
 __elfN(brand_inuse)(Elf_Brandinfo *entry)
 {
 	struct proc *cur;
 	int found;
 
 	found = FALSE;
 
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(cur, &allproc, p_list) {
 		if (cur->p_sysent != entry->sysvec)
 			continue;
 		found = TRUE;
 		break;
 	}
 	sx_sunlock(&allproc_lock);
 
 	return (found);
 }
 
 /*
  * Pick the brand for an executable.
  *
  * Three passes are made over elf_brand_list, each restricted to entries
  * whose machine type matches the header:
  *   1. the EI_OSABI byte, or the old in-header brand string, matches;
  *   2. the PT_INTERP path (if the caller found one) matches interp_path;
  *   3. the entry's brand equals the fallback_brand setting.
  *
  * Returns the first matching entry, or NULL when nothing matches.
  */
 static Elf_Brandinfo *
 __elfN(get_brandinfo)(const Elf_Ehdr *hdr, const char *interp)
 {
 	Elf_Brandinfo *cand;
 	int slot;
 
 	/*
 	 * We support three types of branding -- (1) the ELF EI_OSABI field
 	 * that SCO added to the ELF spec, (2) FreeBSD 3.x's traditional string
 	 * branding w/in the ELF header, and (3) path of the `interp_path'
 	 * field.  We should also look for an ".note.ABI-tag" ELF section now
 	 * in all Linux ELF binaries, FreeBSD 4.1+, and some NetBSD ones.
 	 */
 
 	/* Pass 1: explicit brand recorded in the ELF header. */
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		cand = elf_brand_list[slot];
 		if (cand == NULL || hdr->e_machine != cand->machine)
 			continue;
 		if (hdr->e_ident[EI_OSABI] == cand->brand)
 			return (cand);
 		if (strncmp((const char *)&hdr->e_ident[OLD_EI_BRAND],
 		    cand->compat_3_brand, strlen(cand->compat_3_brand)) == 0)
 			return (cand);
 	}
 
 	/* Pass 2: a recognized interpreter path. */
 	if (interp != NULL) {
 		for (slot = 0; slot < MAX_BRANDS; slot++) {
 			cand = elf_brand_list[slot];
 			if (cand == NULL || hdr->e_machine != cand->machine)
 				continue;
 			if (strcmp(interp, cand->interp_path) == 0)
 				return (cand);
 		}
 	}
 
 	/* Pass 3: the configured brand of last resort. */
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		cand = elf_brand_list[slot];
 		if (cand == NULL || hdr->e_machine != cand->machine)
 			continue;
 		if (__elfN(fallback_brand) == cand->brand)
 			return (cand);
 	}
 
 	return (NULL);
 }
 
 /*
  * Sanity-check an ELF header for this target.
  *
  * The magic, class, data encoding, both version fields and the program
  * header entry size must all match, and at least one registered brand
  * must handle the header's machine type.
  *
  * Returns 0 if the header is acceptable, ENOEXEC otherwise.
  */
 static int
 __elfN(check_header)(const Elf_Ehdr *hdr)
 {
 	int slot;
 
 	if (!IS_ELF(*hdr))
 		return (ENOEXEC);
 	if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS ||
 	    hdr->e_ident[EI_DATA] != ELF_TARG_DATA ||
 	    hdr->e_ident[EI_VERSION] != EV_CURRENT ||
 	    hdr->e_phentsize != sizeof(Elf_Phdr) ||
 	    hdr->e_version != ELF_TARG_VER)
 		return (ENOEXEC);
 
 	/* Succeed as soon as one brand for this machine type is found. */
 	for (slot = 0; slot < MAX_BRANDS; slot++) {
 		if (elf_brand_list[slot] != NULL &&
 		    elf_brand_list[slot]->machine == hdr->e_machine)
 			return (0);
 	}
 
 	return (ENOEXEC);
 }
 
 /*
  * Populate a sub-page range [start, end) of 'map'.
  *
  * The page(s) covering the range are first inserted as an anonymous
  * mapping (an error from that insert is ignored, e.g. when the page is
  * already mapped).  If 'object' is given, end - start bytes are then
  * copied out from the object's page at 'offset' to 'start'.
  *
  * Returns KERN_SUCCESS, or KERN_FAILURE when the source page cannot be
  * mapped or the copyout fails.  'prot' is accepted but not used here.
  */
 static int
 __elfN(map_partial)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot)
 {
 	struct sf_buf *pg;
 	vm_offset_t pgoff;
 	int cperr;
 
 	/* Create the page if it doesn't exist yet. Ignore errors. */
 	vm_map_lock(map);
 	vm_map_insert(map, NULL, 0, trunc_page(start), round_page(end),
 	    VM_PROT_ALL, VM_PROT_ALL, 0);
 	vm_map_unlock(map);
 
 	/* Nothing to copy for an anonymous range. */
 	if (!object)
 		return (KERN_SUCCESS);
 
 	/* Find the page from the underlying object and copy the fragment. */
 	pg = vm_imgact_map_page(object, offset);
 	if (pg == NULL)
 		return (KERN_FAILURE);
 	pgoff = offset - trunc_page(offset);
 	cperr = copyout((caddr_t)sf_buf_kva(pg) + pgoff, (caddr_t)start,
 	    end - start);
 	vm_imgact_unmap_page(pg);
 	if (cperr)
 		return (KERN_FAILURE);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Map the range [start, end) of 'map' from 'object' at file offset
  * 'offset', coping with boundaries that are not page aligned.
  *
  * An unaligned head and/or tail is filled by map_partial().  For the
  * page-aligned middle, if the file offset is itself not page aligned the
  * range is allocated with vm_map_find() and filled by copying page by
  * page (or left as is when 'object' is NULL); otherwise the object is
  * inserted into the map directly with the given 'prot' and 'cow' flags.
  *
  * Returns a KERN_* status: KERN_SUCCESS, KERN_FAILURE on a failed page
  * map or copyout, or whatever the vm_map_* call reported.
  */
 static int
 __elfN(map_insert)(vm_map_t map, vm_object_t object, vm_ooffset_t offset,
     vm_offset_t start, vm_offset_t end, vm_prot_t prot, int cow)
 {
 	struct sf_buf *sf;
 	vm_offset_t off;
 	vm_size_t sz;
 	int error, rv;
 
 	/* Unaligned head: copy up to the next page boundary. */
 	if (start != trunc_page(start)) {
 		rv = __elfN(map_partial)(map, object, offset, start,
 		    round_page(start), prot);
 		if (rv)
 			return (rv);
 		offset += round_page(start) - start;
 		start = round_page(start);
 	}
 	/* Unaligned tail: copy from the last page boundary to 'end'. */
 	if (end != round_page(end)) {
 		rv = __elfN(map_partial)(map, object, offset +
 		    trunc_page(end) - start, trunc_page(end), end, prot);
 		if (rv)
 			return (rv);
 		end = trunc_page(end);
 	}
 	if (end > start) {
 		if (offset & PAGE_MASK) {
 			/*
 			 * The mapping is not page aligned. This means we have
 			 * to copy the data. Sigh.
 			 */
 			rv = vm_map_find(map, NULL, 0, &start, end - start,
 			    FALSE, prot | VM_PROT_WRITE, VM_PROT_ALL, 0);
 			if (rv)
 				return (rv);
 			if (object == NULL)
 				return (KERN_SUCCESS);
 			/* Copy at most up to the end of each source page. */
 			for (; start < end; start += sz) {
 				sf = vm_imgact_map_page(object, offset);
 				if (sf == NULL)
 					return (KERN_FAILURE);
 				off = offset - trunc_page(offset);
 				sz = end - start;
 				if (sz > PAGE_SIZE - off)
 					sz = PAGE_SIZE - off;
 				error = copyout((caddr_t)sf_buf_kva(sf) + off,
 				    (caddr_t)start, sz);
 				vm_imgact_unmap_page(sf);
 				if (error) {
 					return (KERN_FAILURE);
 				}
 				offset += sz;
 			}
 			rv = KERN_SUCCESS;
 		} else {
 			/*
 			 * Aligned case: take a reference on the object for
 			 * the new mapping and drop it again if the insert
 			 * fails.
 			 */
 			vm_object_reference(object);
 			vm_map_lock(map);
 			rv = vm_map_insert(map, object, offset, start, end,
 			    prot, VM_PROT_ALL, cow);
 			vm_map_unlock(map);
 			if (rv != KERN_SUCCESS)
 				vm_object_deallocate(object);
 		}
 		return (rv);
 	} else {
 		return (KERN_SUCCESS);
 	}
 }
 
 /*
  * Load one segment of an ELF image into 'vmspace'.
  *
  *   object    backing object of the file being loaded
  *   offset    file offset of the segment
  *   vmaddr    virtual address at which the segment starts
  *   memsz     size of the segment in memory
  *   filsz     number of bytes supplied by the file (must be <= memsz)
  *   prot      final protection for the segment
  *   pagesize  page size used for alignment of this image
  *
  * The file-backed part is mapped through map_insert(); when memsz exceeds
  * filsz the remainder is mapped as anonymous memory and the trailing
  * partial page of file data is copied into it.
  *
  * Returns 0 on success, ENOEXEC for a truncated or inconsistent segment,
  * EINVAL when a mapping cannot be established, EIO when the source page
  * cannot be mapped, or the copyout() error.
  */
 static int
 __elfN(load_section)(struct vmspace *vmspace,
 	vm_object_t object, vm_offset_t offset,
 	caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot,
 	size_t pagesize)
 {
 	struct sf_buf *sf;
 	size_t map_len;
 	vm_offset_t map_addr;
 	int error, rv, cow;
 	size_t copy_len;
 	vm_offset_t file_addr;
 
 	/*
 	 * It's necessary to fail if the filsz + offset taken from the
 	 * header is greater than the actual file pager object's size.
 	 * If we were to allow this, then the vm_map_find() below would
 	 * walk right off the end of the file object and into the ether.
 	 *
 	 * While I'm here, might as well check for something else that
 	 * is invalid: filsz cannot be greater than memsz.
 	 */
 	if ((off_t)filsz + offset > object->un_pager.vnp.vnp_size ||
 	    filsz > memsz) {
 		uprintf("elf_load_section: truncated ELF file\n");
 		return (ENOEXEC);
 	}
 
 /* Page rounding helpers that use the image's page size, not PAGE_SIZE. */
 #define trunc_page_ps(va, ps)	((va) & ~(ps - 1))
 #define round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))
 
 	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
 	file_addr = trunc_page_ps(offset, pagesize);
 
 	/*
 	 * We have two choices.  We can either clear the data in the last page
 	 * of an oversized mapping, or we can start the anon mapping a page
 	 * early and copy the initialized data into that first page.  We
 	 * choose the second.
 	 */
 	if (memsz > filsz)
 		map_len = trunc_page_ps(offset + filsz, pagesize) - file_addr;
 	else
 		map_len = round_page_ps(offset + filsz, pagesize) - file_addr;
 
 	if (map_len != 0) {
 		/* cow flags: don't dump readonly sections in core */
 		cow = MAP_COPY_ON_WRITE | MAP_PREFAULT |
 		    (prot & VM_PROT_WRITE ? 0 : MAP_DISABLE_COREDUMP);
 
 		rv = __elfN(map_insert)(&vmspace->vm_map,
 				      object,
 				      file_addr,	/* file offset */
 				      map_addr,		/* virtual start */
 				      map_addr + map_len,/* virtual end */
 				      prot,
 				      cow);
 		if (rv != KERN_SUCCESS)
 			return (EINVAL);
 
 		/* we can stop now if we've covered it all */
 		if (memsz == filsz) {
 			return (0);
 		}
 	}
 
 
 	/*
 	 * We have to get the remaining bit of the file into the first part
 	 * of the oversized map segment.  This is normally because the .data
 	 * segment in the file is extended to provide bss.  It's a neat idea
 	 * to try and save a page, but it's a pain in the behind to implement.
 	 */
 	copy_len = (offset + filsz) - trunc_page_ps(offset + filsz, pagesize);
 	map_addr = trunc_page_ps((vm_offset_t)vmaddr + filsz, pagesize);
 	map_len = round_page_ps((vm_offset_t)vmaddr + memsz, pagesize) -
 	    map_addr;
 
 	/* This had damn well better be true! */
 	if (map_len != 0) {
 		/* Anonymous mapping for the remainder, writable for now. */
 		rv = __elfN(map_insert)(&vmspace->vm_map, NULL, 0, map_addr,
 		    map_addr + map_len, VM_PROT_ALL, 0);
 		if (rv != KERN_SUCCESS) {
 			return (EINVAL);
 		}
 	}
 
 	if (copy_len != 0) {
 		vm_offset_t off;
 
 		sf = vm_imgact_map_page(object, offset + filsz);
 		if (sf == NULL)
 			return (EIO);
 
 		/* send the page fragment to user space */
 		off = trunc_page_ps(offset + filsz, pagesize) -
 		    trunc_page(offset + filsz);
 		error = copyout((caddr_t)sf_buf_kva(sf) + off,
 		    (caddr_t)map_addr, copy_len);
 		vm_imgact_unmap_page(sf);
 		if (error) {
 			return (error);
 		}
 	}
 
 	/*
 	 * set it to the specified protection.
 	 * XXX had better undo the damage from pasting over the cracks here!
 	 * The result of vm_map_protect() is not checked.
 	 */
 	vm_map_protect(&vmspace->vm_map, trunc_page(map_addr),
 	    round_page(map_addr + map_len),  prot, FALSE);
 
 	return (0);
 }
 
 /*
  * Load the file "file" into memory.  It may be either a shared object
  * or an executable.
  *
  * The "addr" reference parameter is in/out.  On entry, it specifies
  * the address where a shared object should be loaded.  If the file is
  * an executable, this value is ignored.  On exit, "addr" specifies
  * where the file was actually loaded.
  *
  * The "entry" reference parameter is out only.  On exit, it specifies
  * the entry point for the loaded file.
  *
  * "p" must be the process of the calling thread (the function panics
  * otherwise), and "pagesize" is passed on to load_section() for each
  * PT_LOAD segment.
  *
  * Returns 0 on success or an errno value; "addr" and "entry" are only
  * written on success.  All temporary state (the mapped first page, the
  * vnode reference and lock, the scratch allocation) is released on every
  * path through the "fail" label.
  */
 static int
 __elfN(load_file)(struct proc *p, const char *file, u_long *addr,
 	u_long *entry, size_t pagesize)
 {
 	/* Kept on the heap rather than the kernel stack. */
 	struct {
 		struct nameidata nd;
 		struct vattr attr;
 		struct image_params image_params;
 	} *tempdata;
 	const Elf_Ehdr *hdr = NULL;
 	const Elf_Phdr *phdr = NULL;
 	struct nameidata *nd;
 	struct vmspace *vmspace = p->p_vmspace;
 	struct vattr *attr;
 	struct image_params *imgp;
 	vm_prot_t prot;
 	u_long rbase;
 	u_long base_addr = 0;
 	int vfslocked, error, i, numsegs;
 
 	if (curthread->td_proc != p)
 		panic("elf_load_file - thread");	/* XXXKSE DIAGNOSTIC */
 
 	tempdata = malloc(sizeof(*tempdata), M_TEMP, M_WAITOK);
 	nd = &tempdata->nd;
 	attr = &tempdata->attr;
 	imgp = &tempdata->image_params;
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->attr = attr;
 	imgp->firstpage = NULL;
 	imgp->image_header = NULL;
 	imgp->object = NULL;
 	imgp->execlabel = NULL;
 
 	/* XXXKSE */
 	NDINIT(nd, LOOKUP, MPSAFE|LOCKLEAF|FOLLOW, UIO_SYSSPACE, file,
 	    curthread);
 	vfslocked = 0;
 	if ((error = namei(nd)) != 0) {
 		/* Make sure the cleanup code does not vput() garbage. */
 		nd->ni_vp = NULL;
 		goto fail;
 	}
 	vfslocked = NDHASGIANT(nd);
 	NDFREE(nd, NDF_ONLY_PNBUF);
 	imgp->vp = nd->ni_vp;
 
 	/*
 	 * Check permissions, modes, uid, etc on the file, and "open" it.
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto fail;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto fail;
 
 	/*
 	 * Also make certain that the interpreter stays the same, so set
 	 * its VV_TEXT flag, too.
 	 */
 	nd->ni_vp->v_vflag |= VV_TEXT;
 
 	imgp->object = nd->ni_vp->v_object;
 
 	hdr = (const Elf_Ehdr *)imgp->image_header;
 	if ((error = __elfN(check_header)(hdr)) != 0)
 		goto fail;
 	/* Shared objects are relocated to *addr; executables load as linked. */
 	if (hdr->e_type == ET_DYN)
 		rbase = *addr;
 	else if (hdr->e_type == ET_EXEC)
 		rbase = 0;
 	else {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	/* Only support headers that fit within first page for now      */
 	/*    (multiplication of two Elf_Half fields will not overflow) */
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE - hdr->e_phoff) {
 		error = ENOEXEC;
 		goto fail;
 	}
 
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 
 	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_LOAD) {	/* Loadable segment */
 			/* Translate ELF segment flags into VM protections. */
 			prot = 0;
 			if (phdr[i].p_flags & PF_X)
   				prot |= VM_PROT_EXECUTE;
 			if (phdr[i].p_flags & PF_W)
   				prot |= VM_PROT_WRITE;
 			if (phdr[i].p_flags & PF_R)
   				prot |= VM_PROT_READ;
 
 			if ((error = __elfN(load_section)(vmspace,
 			    imgp->object, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr + rbase,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
 			    pagesize)) != 0)
 				goto fail;
 			/*
 			 * Establish the base address if this is the
 			 * first segment.
 			 */
 			if (numsegs == 0)
   				base_addr = trunc_page(phdr[i].p_vaddr +
 				    rbase);
 			numsegs++;
 		}
 	}
 	*addr = base_addr;
 	*entry = (unsigned long)hdr->e_entry + rbase;
 
 	/* Common exit: reached on success as well as on failure. */
 fail:
 	if (imgp->firstpage)
 		exec_unmap_first_page(imgp);
 
 	if (nd->ni_vp)
 		vput(nd->ni_vp);
 
 	VFS_UNLOCK_GIANT(vfslocked);
 	free(tempdata, M_TEMP);
 
 	return (error);
 }
 
 static int
 __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 {
 	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
 	const Elf_Phdr *phdr;
 	Elf_Auxargs *elf_auxargs;
 	struct vmspace *vmspace;
 	vm_prot_t prot;
 	u_long text_size = 0, data_size = 0, total_size = 0;
 	u_long text_addr = 0, data_addr = 0;
 	u_long seg_size, seg_addr;
 	u_long addr, entry = 0, proghdr = 0;
 	int error = 0, i;
 	const char *interp = NULL;
 	Elf_Brandinfo *brand_info;
 	char *path;
 	struct thread *td = curthread;
 	struct sysentvec *sv;
 
 	/*
 	 * Do we have a valid ELF header ?
 	 *
 	 * Only allow ET_EXEC & ET_DYN here, reject ET_DYN later
 	 * if particular brand doesn't support it.
 	 */
 	if (__elfN(check_header)(hdr) != 0 ||
 	    (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN))
 		return (-1);
 
 	/*
 	 * From here on down, we return an errno, not -1, as we've
 	 * detected an ELF file.
 	 */
 
 	if ((hdr->e_phoff > PAGE_SIZE) ||
 	    (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) {
 		/* Only support headers in first page for now */
 		return (ENOEXEC);
 	}
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
 	for (i = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_INTERP) {
 			/* Path to interpreter */
 			if (phdr[i].p_filesz > MAXPATHLEN ||
 			    phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE)
 				return (ENOEXEC);
 			interp = imgp->image_header + phdr[i].p_offset;
 			break;
 		}
 	}
 
 	brand_info = __elfN(get_brandinfo)(hdr, interp);
 	if (brand_info == NULL) {
 		uprintf("ELF binary type \"%u\" not known.\n",
 		    hdr->e_ident[EI_OSABI]);
 		return (ENOEXEC);
 	}
 	if (hdr->e_type == ET_DYN && brand_info->brand != ELFOSABI_LINUX)
 		return (ENOEXEC);
 	sv = brand_info->sysvec;
 	if (interp != NULL && brand_info->interp_newpath != NULL)
 		interp = brand_info->interp_newpath;
 
 	/*
 	 * Avoid a possible deadlock if the current address space is destroyed
 	 * and that address space maps the locked vnode.  In the common case,
 	 * the locked vnode's v_usecount is decremented but remains greater
 	 * than zero.  Consequently, the vnode lock is not needed by vrele().
 	 * However, in cases where the vnode lock is external, such as nullfs,
 	 * v_usecount may become zero.
 	 */
 	VOP_UNLOCK(imgp->vp, 0, td);
 
 	exec_new_vmspace(imgp, sv);
+	imgp->proc->p_sysent = sv;
 
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	vmspace = imgp->proc->p_vmspace;
 
 	for (i = 0; i < hdr->e_phnum; i++) {
 		switch (phdr[i].p_type) {
 		case PT_LOAD:	/* Loadable segment */
 			prot = 0;
 			if (phdr[i].p_flags & PF_X)
   				prot |= VM_PROT_EXECUTE;
 			if (phdr[i].p_flags & PF_W)
   				prot |= VM_PROT_WRITE;
 			if (phdr[i].p_flags & PF_R)
   				prot |= VM_PROT_READ;
 
 #if defined(__ia64__) && __ELF_WORD_SIZE == 32 && defined(IA32_ME_HARDER)
 			/*
 			 * Some x86 binaries assume read == executable,
 			 * notably the M3 runtime and therefore cvsup
 			 */
 			if (prot & VM_PROT_READ)
 				prot |= VM_PROT_EXECUTE;
 #endif
 
 			if ((error = __elfN(load_section)(vmspace,
 			    imgp->object, phdr[i].p_offset,
 			    (caddr_t)(uintptr_t)phdr[i].p_vaddr,
 			    phdr[i].p_memsz, phdr[i].p_filesz, prot,
 			    sv->sv_pagesize)) != 0)
 				return (error);
 
 			/*
 			 * If this segment contains the program headers,
 			 * remember their virtual address for the AT_PHDR
 			 * aux entry. Static binaries don't usually include
 			 * a PT_PHDR entry.
 			 */
 			if (phdr[i].p_offset == 0 &&
 			    hdr->e_phoff + hdr->e_phnum * hdr->e_phentsize
 				<= phdr[i].p_filesz)
 				proghdr = phdr[i].p_vaddr + hdr->e_phoff;
 
 			seg_addr = trunc_page(phdr[i].p_vaddr);
 			seg_size = round_page(phdr[i].p_memsz +
 			    phdr[i].p_vaddr - seg_addr);
 
 			/*
 			 * Is this .text or .data?  We can't use
 			 * VM_PROT_WRITE or VM_PROT_EXEC, it breaks the
 			 * alpha terribly and possibly does other bad
 			 * things so we stick to the old way of figuring
 			 * it out:  If the segment contains the program
 			 * entry point, it's a text segment, otherwise it
 			 * is a data segment.
 			 *
 			 * Note that obreak() assumes that data_addr + 
 			 * data_size == end of data load area, and the ELF
 			 * file format expects segments to be sorted by
 			 * address.  If multiple data segments exist, the
 			 * last one will be used.
 			 */
 			if (hdr->e_entry >= phdr[i].p_vaddr &&
 			    hdr->e_entry < (phdr[i].p_vaddr +
 			    phdr[i].p_memsz)) {
 				text_size = seg_size;
 				text_addr = seg_addr;
 				entry = (u_long)hdr->e_entry;
 			} else {
 				data_size = seg_size;
 				data_addr = seg_addr;
 			}
 			total_size += seg_size;
 			break;
 		case PT_PHDR: 	/* Program header table info */
 			proghdr = phdr[i].p_vaddr;
 			break;
 		default:
 			break;
 		}
 	}
 	
 	if (data_addr == 0 && data_size == 0) {
 		data_addr = text_addr;
 		data_size = text_size;
 	}
 
 	/*
 	 * Check limits.  It should be safe to check the
 	 * limits after loading the segments since we do
 	 * not actually fault in all the segments pages.
 	 */
 	PROC_LOCK(imgp->proc);
 	if (data_size > lim_cur(imgp->proc, RLIMIT_DATA) ||
 	    text_size > maxtsiz ||
 	    total_size > lim_cur(imgp->proc, RLIMIT_VMEM)) {
 		PROC_UNLOCK(imgp->proc);
 		return (ENOMEM);
 	}
 
 	vmspace->vm_tsize = text_size >> PAGE_SHIFT;
 	vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr;
 	vmspace->vm_dsize = data_size >> PAGE_SHIFT;
 	vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr;
 
 	/*
 	 * We load the dynamic linker where a userland call
 	 * to mmap(0, ...) would put it.  The rationale behind this
 	 * calculation is that it leaves room for the heap to grow to
 	 * its maximum allowed size.
 	 */
 	addr = round_page((vm_offset_t)imgp->proc->p_vmspace->vm_daddr +
 	    lim_max(imgp->proc, RLIMIT_DATA));
 	PROC_UNLOCK(imgp->proc);
 
 	imgp->entry_addr = entry;
 
-	imgp->proc->p_sysent = sv;
 	if (interp != NULL) {
 		VOP_UNLOCK(imgp->vp, 0, td);
 		if (brand_info->emul_path != NULL &&
 		    brand_info->emul_path[0] != '\0') {
 			path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 			snprintf(path, MAXPATHLEN, "%s%s",
 			    brand_info->emul_path, interp);
 			error = __elfN(load_file)(imgp->proc, path, &addr,
 			    &imgp->entry_addr, sv->sv_pagesize);
 			free(path, M_TEMP);
 			if (error == 0)
 				interp = NULL;
 		}
 		if (interp != NULL) {
 			error = __elfN(load_file)(imgp->proc, interp, &addr,
 			    &imgp->entry_addr, sv->sv_pagesize);
 		}
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 		if (error != 0) {
 			uprintf("ELF interpreter %s not found\n", interp);
 			return (error);
 		}
 	}
 
 	/*
 	 * Construct auxargs table (used by the fixup routine)
 	 */
 	elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK);
 	elf_auxargs->execfd = -1;
 	elf_auxargs->phdr = proghdr;
 	elf_auxargs->phent = hdr->e_phentsize;
 	elf_auxargs->phnum = hdr->e_phnum;
 	elf_auxargs->pagesz = PAGE_SIZE;
 	elf_auxargs->base = addr;
 	elf_auxargs->flags = 0;
 	elf_auxargs->entry = entry;
 	elf_auxargs->trace = elf_trace;
 
 	imgp->auxargs = elf_auxargs;
 	imgp->interpreted = 0;
 
 	return (error);
 }
 
 #define	suword __CONCAT(suword, __ELF_WORD_SIZE)
 
 /*
  * Stack fixup for a freshly loaded ELF image: store the auxiliary vector
  * built from imgp->auxargs past the argument and environment slots, free
  * the auxargs, then push argc one word below *stack_base and move
  * *stack_base down to it.  Always returns 0.
  */
 int
 __elfN(freebsd_fixup)(register_t **stack_base, struct image_params *imgp)
 {
 	Elf_Auxargs *aux;
 	Elf_Addr *sp;
 	Elf_Addr *pos;
 
 	aux = (Elf_Auxargs *)imgp->auxargs;
 	sp = (Elf_Addr *)*stack_base;
 
 	/*
 	 * The vector starts argc + envc + 2 words above the stack base
 	 * (presumably the two extra words are the NULL terminators of
 	 * argv[] and envp[] -- confirm against the stack layout code).
 	 */
 	pos = sp + (imgp->args->argc + imgp->args->envc + 2);
 
 	/* Optional entries first. */
 	if (aux->trace) {
 		AUXARGS_ENTRY(pos, AT_DEBUG, 1);
 	}
 	if (aux->execfd != -1) {
 		AUXARGS_ENTRY(pos, AT_EXECFD, aux->execfd);
 	}
 
 	/* Entries that are always emitted, closed by AT_NULL. */
 	AUXARGS_ENTRY(pos, AT_PHDR, aux->phdr);
 	AUXARGS_ENTRY(pos, AT_PHENT, aux->phent);
 	AUXARGS_ENTRY(pos, AT_PHNUM, aux->phnum);
 	AUXARGS_ENTRY(pos, AT_PAGESZ, aux->pagesz);
 	AUXARGS_ENTRY(pos, AT_FLAGS, aux->flags);
 	AUXARGS_ENTRY(pos, AT_ENTRY, aux->entry);
 	AUXARGS_ENTRY(pos, AT_BASE, aux->base);
 	AUXARGS_ENTRY(pos, AT_NULL, 0);
 
 	/* The auxargs were only needed to build the vector. */
 	free(imgp->auxargs, M_TEMP);
 	imgp->auxargs = NULL;
 
 	/* Push argc and hand the new stack base back to the caller. */
 	sp -= 1;
 	suword(sp, (long)imgp->args->argc);
 	*stack_base = (register_t *)sp;
 	return (0);
 }
 
 /*
  * Code for generating ELF core dumps.
  */
 
 typedef void (*segment_callback)(vm_map_entry_t, void *);
 
 /*
  * Closure for cb_put_phdr().  Each call fills in *phdr, then advances
  * both members so the next call writes the following program header at
  * the following file offset.
  */
 struct phdr_closure {
 	Elf_Phdr *phdr;		/* Program header to fill in */
 	Elf_Off offset;		/* Offset of segment in core file */
 };
 
 /*
  * Closure for cb_size_segment().  Both members are accumulators: the
  * caller zeroes them and every visited map entry adds to them.
  */
 struct sseg_closure {
 	int count;		/* Count of writable segments. */
 	size_t size;		/* Total size of all writable segments. */
 };
 
 static void cb_put_phdr(vm_map_entry_t, void *);
 static void cb_size_segment(vm_map_entry_t, void *);
 static void each_writable_segment(struct thread *, segment_callback, void *);
 static int __elfN(corehdr)(struct thread *, struct vnode *, struct ucred *,
     int, void *, size_t);
 static void __elfN(puthdr)(struct thread *, void *, size_t *, int);
 static void __elfN(putnote)(void *, size_t *, const char *, int,
     const void *, size_t);
 
 extern int osreldate;
 
 /*
  * Write an ELF core file for td's process to vp.
  *
  * The dumpable segments are sized first, then the header area is sized
  * with a dry run of puthdr().  EFAULT is returned when header plus
  * segments would reach `limit'.  Otherwise the header is built and
  * written, followed by the contents of every segment in program header
  * order.  Returns 0 or the first write error.
  */
 int
 __elfN(coredump)(struct thread *td, struct vnode *vp, off_t limit)
 {
 	struct sseg_closure seginfo;
 	struct ucred *cred;
 	Elf_Phdr *php;
 	void *hdr;
 	size_t hdrsize;
 	off_t offset;
 	int error, seg;
 
 	cred = td->td_ucred;
 
 	/* Count the segments and add up their sizes. */
 	seginfo.count = 0;
 	seginfo.size = 0;
 	each_writable_segment(td, cb_size_segment, &seginfo);
 
 	/*
 	 * With a NULL destination puthdr() writes nothing and only
 	 * advances the offset, which yields the header area size.
 	 */
 	hdrsize = 0;
 	__elfN(puthdr)(td, (void *)NULL, &hdrsize, seginfo.count);
 
 	if (hdrsize + seginfo.size >= limit)
 		return (EFAULT);
 
 	/* Build the header in a temporary buffer and write it out. */
 	hdr = malloc(hdrsize, M_TEMP, M_WAITOK);
 	if (hdr == NULL) {
 		return (EINVAL);
 	}
 	error = __elfN(corehdr)(td, vp, cred, seginfo.count, hdr, hdrsize);
 	if (error != 0) {
 		free(hdr, M_TEMP);
 		return (error);
 	}
 
 	/*
 	 * Dump the segment contents.  The program headers follow the ELF
 	 * header in the buffer; the "+ 1" steps over the first one, which
 	 * puthdr() uses for the note segment.
 	 */
 	php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1;
 	offset = hdrsize;
 	for (seg = 0; seg < seginfo.count; seg++, php++) {
 		error = vn_rdwr_inchunks(UIO_WRITE, vp,
 		    (caddr_t)(uintptr_t)php->p_vaddr,
 		    php->p_filesz, offset, UIO_USERSPACE,
 		    IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
 		    curthread); /* XXXKSE */
 		if (error != 0)
 			break;
 		offset += php->p_filesz;
 	}
 
 	free(hdr, M_TEMP);
 	return (error);
 }
 
 /*
  * A callback for each_writable_segment() to write out the segment's
  * program header entry.
  */
 static void
 cb_put_phdr(vm_map_entry_t entry, void *closure)
 {
 	struct phdr_closure *state;
 	Elf_Phdr *ph;
 
 	state = (struct phdr_closure *)closure;
 	ph = state->phdr;
 
 	/* Each segment starts on a page boundary in the core file. */
 	state->offset = round_page(state->offset);
 
 	/* Describe the map entry as a fully file-backed PT_LOAD segment. */
 	ph->p_type = PT_LOAD;
 	ph->p_offset = state->offset;
 	ph->p_vaddr = entry->start;
 	ph->p_paddr = 0;
 	ph->p_memsz = entry->end - entry->start;
 	ph->p_filesz = ph->p_memsz;
 	ph->p_align = PAGE_SIZE;
 
 	/* Translate the VM protection bits into ELF segment flags. */
 	ph->p_flags = 0;
 	if ((entry->protection & VM_PROT_READ) != 0)
 		ph->p_flags |= PF_R;
 	if ((entry->protection & VM_PROT_WRITE) != 0)
 		ph->p_flags |= PF_W;
 	if ((entry->protection & VM_PROT_EXECUTE) != 0)
 		ph->p_flags |= PF_X;
 
 	/* Advance the closure to the next header slot and file offset. */
 	state->offset += ph->p_filesz;
 	state->phdr++;
 }
 
 /*
  * A callback for each_writable_segment() to gather information about
  * the number of segments and their total size.
  */
 static void
 cb_size_segment(vm_map_entry_t entry, void *closure)
 {
 	struct sseg_closure *totals;
 
 	totals = (struct sseg_closure *)closure;
 
 	/* One more segment, covering the whole extent of the map entry. */
 	totals->size += entry->end - entry->start;
 	totals->count++;
 }
 
 /*
  * For each writable segment in the process's memory map, call the given
  * function with a pointer to the map entry and some arbitrary
  * caller-supplied data.
  */
 static void
 each_writable_segment(struct thread *td, segment_callback func,
     void *closure)
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t obj;
 
 	map = &td->td_proc->p_vmspace->vm_map;
 
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		/*
 		 * Protection filter.  Legacy coredump mode keeps only
 		 * entries that are both readable and writable; otherwise
 		 * any entry with at least one protection bit is kept.
 		 * Read-only segments of the ELF binary are marked
 		 * MAP_ENTRY_NOCOREDUMP and are dropped by the next test.
 		 */
 		if (elf_legacy_coredump) {
 			if ((entry->protection & VM_PROT_RW) != VM_PROT_RW)
 				continue;
 		} else if ((entry->protection & VM_PROT_ALL) == 0) {
 			continue;
 		}
 
 		/*
 		 * Skip entries excluded via MAP_NOCORE in mmap(2) or
 		 * MADV_NOCORE in madvise(2), and skip submaps (i.e. parts
 		 * of the kernel map).
 		 */
 		if ((entry->eflags &
 		    (MAP_ENTRY_NOCOREDUMP | MAP_ENTRY_IS_SUB_MAP)) != 0)
 			continue;
 
 		/* Nothing to dump without a backing object. */
 		obj = entry->object.vm_object;
 		if (obj == NULL)
 			continue;
 
 		/* Walk down to the deepest backing object. */
 		while (obj->backing_object != NULL)
 			obj = obj->backing_object;
 
 		/* Only default, swap and vnode objects are dumped. */
 		switch (obj->type) {
 		case OBJT_DEFAULT:
 		case OBJT_SWAP:
 		case OBJT_VNODE:
 			(*func)(entry, closure);
 			break;
 		default:
 			/* Memory-mapped devices and such things. */
 			break;
 		}
 	}
 }
 
 /*
  * Write the core file header to the file, including padding up to
  * the page boundary.
  */
 static int
 __elfN(corehdr)(struct thread *td, struct vnode *vp, struct ucred *cred,
     int numsegs, void *hdr, size_t hdrsize)
 {
 	size_t used;
 	int error;
 
 	/*
 	 * Start from a zeroed buffer so that whatever puthdr() does not
 	 * write, including the padding up to hdrsize, is zero.
 	 */
 	bzero(hdr, hdrsize);
 	used = 0;
 	__elfN(puthdr)(td, hdr, &used, numsegs);
 
 	/* The header area goes at the very start of the core file. */
 	error = vn_rdwr_inchunks(UIO_WRITE, vp, hdr, hdrsize, (off_t)0,
 	    UIO_SYSSPACE, IO_UNIT | IO_DIRECT, cred, NOCRED, NULL,
 	    td); /* XXXKSE */
 	return (error);
 }
 
 /*
  * Types used for the note payloads written by puthdr().  When this file
  * is built with COMPAT_IA32 and a 32-bit ELF word size the 32-bit
  * structure variants are used; otherwise the native types are used.
  */
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 typedef struct prstatus32 elf_prstatus_t;
 typedef struct prpsinfo32 elf_prpsinfo_t;
 typedef struct fpreg32 elf_prfpregset_t;
 typedef struct fpreg32 elf_fpregset_t;
 typedef struct reg32 elf_gregset_t;
 #else
 typedef prstatus_t elf_prstatus_t;
 typedef prpsinfo_t elf_prpsinfo_t;
 typedef prfpregset_t elf_prfpregset_t;
 typedef prfpregset_t elf_fpregset_t;
 typedef gregset_t elf_gregset_t;
 #endif
 
 /*
  * Lay out the core file header area starting at *off: the ELF header,
  * numsegs + 1 program headers (one PT_NOTE plus one per dumped segment),
  * and the notes.  On return *off has been rounded up to a page boundary.
  *
  * When dst is NULL this is a dry run: nothing is stored and no register
  * state is collected, only *off is advanced, which lets the caller size
  * the buffer.  When dst is non-NULL the data is written into dst at the
  * same offsets.
  */
 static void
 __elfN(puthdr)(struct thread *td, void *dst, size_t *off, int numsegs)
 {
 	struct {
 		elf_prstatus_t status;
 		elf_prfpregset_t fpregset;
 		elf_prpsinfo_t psinfo;
 	} *tempdata;
 	elf_prstatus_t *status;
 	elf_prfpregset_t *fpregset;
 	elf_prpsinfo_t *psinfo;
 	struct proc *p;
 	struct thread *thr;
 	size_t ehoff, noteoff, notesz, phoff;
 
 	p = td->td_proc;
 
 	/* Reserve room for the ELF header. */
 	ehoff = *off;
 	*off += sizeof(Elf_Ehdr);
 
 	/* Reserve room for the program headers; the extra one is PT_NOTE. */
 	phoff = *off;
 	*off += (numsegs + 1) * sizeof(Elf_Phdr);
 
 	noteoff = *off;
 	/*
 	 * Don't allocate space for the notes if we're just calculating
 	 * the size of the header. We also don't collect the data.
 	 */
 	if (dst != NULL) {
 		tempdata = malloc(sizeof(*tempdata), M_TEMP, M_ZERO|M_WAITOK);
 		status = &tempdata->status;
 		fpregset = &tempdata->fpregset;
 		psinfo = &tempdata->psinfo;
 	} else {
 		tempdata = NULL;
 		status = NULL;
 		fpregset = NULL;
 		psinfo = NULL;
 	}
 
 	if (dst != NULL) {
 		psinfo->pr_version = PRPSINFO_VERSION;
 		psinfo->pr_psinfosz = sizeof(elf_prpsinfo_t);
 		strlcpy(psinfo->pr_fname, p->p_comm, sizeof(psinfo->pr_fname));
 		/*
 		 * XXX - We don't fill in the command line arguments properly
 		 * yet.
 		 */
 		strlcpy(psinfo->pr_psargs, p->p_comm,
 		    sizeof(psinfo->pr_psargs));
 	}
 	/* putnote() only copies when dst is non-NULL, so NULL desc is fine. */
 	__elfN(putnote)(dst, off, "FreeBSD", NT_PRPSINFO, psinfo,
 	    sizeof *psinfo);
 
 	/*
 	 * To have the debugger select the right thread (LWP) as the initial
 	 * thread, we dump the state of the thread passed to us in td first.
 	 * This is the thread that causes the core dump and thus likely to
 	 * be the right thread one wants to have selected in the debugger.
 	 */
 	thr = td;
 	while (thr != NULL) {
 		if (dst != NULL) {
 			/* The same status/fpregset buffers are reused per thread. */
 			status->pr_version = PRSTATUS_VERSION;
 			status->pr_statussz = sizeof(elf_prstatus_t);
 			status->pr_gregsetsz = sizeof(elf_gregset_t);
 			status->pr_fpregsetsz = sizeof(elf_fpregset_t);
 			status->pr_osreldate = osreldate;
 			status->pr_cursig = p->p_sig;
 			status->pr_pid = thr->td_tid;
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 			fill_regs32(thr, &status->pr_reg);
 			fill_fpregs32(thr, fpregset);
 #else
 			fill_regs(thr, &status->pr_reg);
 			fill_fpregs(thr, fpregset);
 #endif
 		}
 		__elfN(putnote)(dst, off, "FreeBSD", NT_PRSTATUS, status,
 		    sizeof *status);
 		__elfN(putnote)(dst, off, "FreeBSD", NT_FPREGSET, fpregset,
 		    sizeof *fpregset);
 		/*
 		 * Allow for MD specific notes, as well as any MD
 		 * specific preparations for writing MI notes.
 		 */
 		__elfN(dump_thread)(thr, dst, off);
 
 		/*
 		 * After td itself, walk the process thread list from its
 		 * head, stepping over td when it comes up again.
 		 */
 		thr = (thr == td) ? TAILQ_FIRST(&p->p_threads) :
 		    TAILQ_NEXT(thr, td_plist);
 		if (thr == td)
 			thr = TAILQ_NEXT(thr, td_plist);
 	}
 
 	/* Size of the note area, taken before the page rounding below. */
 	notesz = *off - noteoff;
 
 	if (dst != NULL)
 		free(tempdata, M_TEMP);
 
 	/* Align up to a page boundary for the program segments. */
 	*off = round_page(*off);
 
 	if (dst != NULL) {
 		Elf_Ehdr *ehdr;
 		Elf_Phdr *phdr;
 		struct phdr_closure phc;
 
 		/*
 		 * Fill in the ELF header.
 		 */
 		ehdr = (Elf_Ehdr *)((char *)dst + ehoff);
 		ehdr->e_ident[EI_MAG0] = ELFMAG0;
 		ehdr->e_ident[EI_MAG1] = ELFMAG1;
 		ehdr->e_ident[EI_MAG2] = ELFMAG2;
 		ehdr->e_ident[EI_MAG3] = ELFMAG3;
 		ehdr->e_ident[EI_CLASS] = ELF_CLASS;
 		ehdr->e_ident[EI_DATA] = ELF_DATA;
 		ehdr->e_ident[EI_VERSION] = EV_CURRENT;
 		ehdr->e_ident[EI_OSABI] = ELFOSABI_FREEBSD;
 		ehdr->e_ident[EI_ABIVERSION] = 0;
 		ehdr->e_ident[EI_PAD] = 0;
 		ehdr->e_type = ET_CORE;
 #if defined(COMPAT_IA32) && __ELF_WORD_SIZE == 32
 		ehdr->e_machine = EM_386;
 #else
 		ehdr->e_machine = ELF_ARCH;
 #endif
 		ehdr->e_version = EV_CURRENT;
 		ehdr->e_entry = 0;
 		ehdr->e_phoff = phoff;
 		ehdr->e_flags = 0;
 		ehdr->e_ehsize = sizeof(Elf_Ehdr);
 		ehdr->e_phentsize = sizeof(Elf_Phdr);
 		ehdr->e_phnum = numsegs + 1;
 		ehdr->e_shentsize = sizeof(Elf_Shdr);
 		ehdr->e_shnum = 0;
 		ehdr->e_shstrndx = SHN_UNDEF;
 
 		/*
 		 * Fill in the program header entries.
 		 */
 		phdr = (Elf_Phdr *)((char *)dst + phoff);
 
 		/* The note segment. */
 		phdr->p_type = PT_NOTE;
 		phdr->p_offset = noteoff;
 		phdr->p_vaddr = 0;
 		phdr->p_paddr = 0;
 		phdr->p_filesz = notesz;
 		phdr->p_memsz = 0;
 		phdr->p_flags = 0;
 		phdr->p_align = 0;
 		phdr++;
 
 		/* All the writable segments from the program. */
 		phc.phdr = phdr;
 		phc.offset = *off;
 		each_writable_segment(td, cb_put_phdr, &phc);
 	}
 }
 
 /*
  * Append one ELF note at dst + *off: the note header, the name with its
  * terminating NUL, then the descriptor.  Name and descriptor each occupy
  * a multiple of sizeof(Elf_Size) bytes.  With a NULL dst nothing is
  * copied and only *off is advanced.
  */
 static void
 __elfN(putnote)(void *dst, size_t *off, const char *name, int type,
     const void *desc, size_t descsz)
 {
 	Elf_Note hdr;
 	char *out;
 
 	hdr.n_namesz = strlen(name) + 1;
 	hdr.n_descsz = descsz;
 	hdr.n_type = type;
 
 	if (dst != NULL) {
 		out = (char *)dst + *off;
 		bcopy(&hdr, out, sizeof hdr);
 		out += sizeof hdr;
 		bcopy(name, out, hdr.n_namesz);
 		out += roundup2(hdr.n_namesz, sizeof(Elf_Size));
 		bcopy(desc, out, hdr.n_descsz);
 	}
 
 	/* Account for the header and both padded fields. */
 	*off += sizeof hdr;
 	*off += roundup2(hdr.n_namesz, sizeof(Elf_Size));
 	*off += roundup2(hdr.n_descsz, sizeof(Elf_Size));
 }
 
 /*
  * Tell kern_execve.c about it, with a little help from the linker.
  *
  * The first initializer names the image activator function, formed by
  * prefixing "exec_" to the __elfN(imgact) identifier; the second is the
  * string "ELF" followed by the ELF word size.  EXEC_SET() registers the
  * structure under the name elf<word size>.
  */
 static struct execsw __elfN(execsw) = {
 	__CONCAT(exec_, __elfN(imgact)),
 	__XSTRING(__CONCAT(ELF, __ELF_WORD_SIZE))
 };
 EXEC_SET(__CONCAT(elf, __ELF_WORD_SIZE), __elfN(execsw));
Index: stable/6/sys/kern/kern_descrip.c
===================================================================
--- stable/6/sys/kern/kern_descrip.c	(revision 169910)
+++ stable/6/sys/kern/kern_descrip.c	(revision 169911)
@@ -1,2694 +1,2692 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
 		     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 
 static uma_zone_t file_zone;
 
 
 /* How to treat 'new' parameter when allocating a fd for do_dup(). */
 enum dup_type { DUP_VARIABLE, DUP_FIXED };
 
 static int do_dup(struct thread *td, enum dup_type type, int old, int new,
     register_t *retval);
 static int	fd_first_free(struct filedesc *, int, int);
 static int	fd_last_used(struct filedesc *, int, int);
 static void	fdgrowtable(struct filedesc *, int);
 static int	fdrop_locked(struct file *fp, struct thread *td);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 
 /*
  * A process is initially started out with NDFILE descriptors stored within
  * this structure, selected to be enough for typical applications based on
  * the historical limit of 20 open files (and the usage of descriptors by
  * shells).  If these descriptors are exhausted, a larger descriptor table
  * may be allocated, up to a process' resource limit; the internal arrays
  * are then unused.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * Storage required per open file descriptor.
  */
 #define OFILESIZE (sizeof(struct file *) + sizeof(char))
 
 /*
  * Basic allocation of descriptors:
  * one of the above, plus arrays for NDFILE descriptors.
  *
  * The embedded struct filedesc is the first member, followed by
  * fixed-size storage for the first NDFILE descriptors: the file
  * pointers, the per-descriptor flag bytes, and the in-use bitmap.
  */
 struct filedesc0 {
 	struct	filedesc fd_fd;
 	/*
 	 * These arrays are used when the number of open files is
 	 * <= NDFILE, and are then pointed to by the pointers above.
 	 */
 	struct	file *fd_dfiles[NDFILE];	/* file pointers */
 	char	fd_dfileflags[NDFILE];		/* per-descriptor flags */
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];	/* in-use bitmap */
 };
 
 /*
  * Descriptor management.
  */
 struct filelist filehead;	/* head of list of open files */
 int openfiles;			/* actual number of open files */
 struct sx filelist_lock;	/* sx to protect filelist */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 
 /* A mutex to protect the association between a proc and filedesc. */
 static struct mtx	fdesc_mtx;
 
 /*
  * Find the first zero bit in the given bitmap, starting at low and not
  * exceeding size - 1.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *bits = fdp->fd_map;
 	NDSLOTTYPE avail;
 	int slot, nslots;
 
 	/* Empty search range: report `low' itself. */
 	if (low >= size)
 		return (low);
 
 	slot = NDSLOT(low);
 	if ((low % NDENTRIES) != 0) {
 		/*
 		 * `low' falls inside a slot: look only at the clear bits
 		 * at or above it before moving on to whole slots.
 		 */
 		avail = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		avail &= ~bits[slot];
 		if (avail != 0UL)
 			return (slot * NDENTRIES + ffsl(avail) - 1);
 		slot++;
 	}
 
 	/* Scan the remaining slots for one that is not completely full. */
 	nslots = NDSLOTS(size);
 	while (slot < nslots) {
 		if (bits[slot] != ~0UL)
 			return (slot * NDENTRIES + ffsl(~bits[slot]) - 1);
 		slot++;
 	}
 
 	/* Every descriptor in the range is in use. */
 	return (size);
 }
 
 /*
  * Find the highest non-zero bit in the given bitmap, starting at low and
  * not exceeding size - 1.
  */
 /*
  * Return the highest descriptor below `size' that is marked in use in
  * fdp->fd_map.  Returns -1 when the range is empty (low >= size) and
  * low - 1 when no bit is set in any slot from NDSLOT(low) upward.  Note
  * that the lowest slot is examined as a whole, so with a `low' that is
  * not slot-aligned a descriptor below `low' may be returned.
  */
 static int
 fd_last_used(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
 	if (low >= size)
 		return (-1);
 
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		/* Partial top slot: consider only the bits below `size'. */
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
 		if ((mask &= map[off]) != 0)
 			return (off * NDENTRIES + flsl(mask) - 1);
 	}
 	/*
 	 * Step down to the highest slot that lies wholly below `size'.
 	 * This must also happen when `size' is slot-aligned: slot
 	 * NDSLOT(size) then holds only descriptors >= size and must not
 	 * be examined at all.
 	 */
 	--off;
 	for (minoff = NDSLOT(low); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
 	return (low - 1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
         KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
             ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd already used"));
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
 	if (fd == fdp->fd_freefile)
 		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 	KASSERT(fdisused(fdp, fd),
 	    ("fd is already unused"));
 	KASSERT(fdp->fd_ofiles[fd] == NULL,
 	    ("fd is still in use"));
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
 		fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 	struct proc *proc;
 
 	proc = td->td_proc;
 
 	/*
 	 * Report the smaller of the current RLIMIT_NOFILE and
 	 * maxfilesperproc; the limit is read under the process lock.
 	 */
 	PROC_LOCK(proc);
 	td->td_retval[0] =
 	    min((int)lim_cur(proc, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(proc);
 
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
  * note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 dup2(struct thread *td, struct dup2_args *uap)
 {
 	int from, to;
 
 	/* do_dup() takes signed descriptors and range-checks them. */
 	from = (int)uap->from;
 	to = (int)uap->to;
 	return (do_dup(td, DUP_FIXED, from, to, td->td_retval));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 dup(struct thread *td, struct dup_args *uap)
 {
 	int fd;
 
 	/* Any free descriptor from 0 upward will do. */
 	fd = (int)uap->fd;
 	return (do_dup(td, DUP_VARIABLE, fd, 0, td->td_retval));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 fcntl(struct thread *td, struct fcntl_args *uap)
 {
 	struct flock fl;
 	intptr_t arg;
 	int error;
 
 	error = 0;
 	switch (uap->cmd) {
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 		error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
 		arg = (intptr_t)&fl;
 		break;
 	default:
 		arg = uap->arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, uap->fd, uap->cmd, arg);
 	if (error)
 		return (error);
 	if (uap->cmd == F_GETLK)
 		error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
 	return (error);
 }
 
 /*
  * Kernel-side implementation of fcntl(2) on descriptor fd of td's
  * process.  For F_GETLK, F_SETLK and F_SETLKW, arg is a pointer to a
  * kernel-space struct flock; for the other commands it is the integer
  * argument.  Results, where a command has one, are stored in
  * td->td_retval[0].  Returns 0 or an errno value: EBADF for a bad
  * descriptor, EINVAL for an unknown command, or a command-specific error.
  *
  * Giant is taken for every command except F_DUPFD, F_GETFD, F_SETFD and
  * F_GETFL.  The filedesc lock is taken for the descriptor lookup and
  * every case releases it itself.
  */
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp;
 	struct proc *p;
 	char *pop;
 	struct vnode *vp;
 	u_int newmin;
 	int error, flg, tmp;
 	int giant_locked;
 
 	/*
 	 * XXXRW: Some fcntl() calls require Giant -- others don't.  Try to
 	 * avoid grabbing Giant for calls we know don't need it.
 	 */
 	switch (cmd) {
 	case F_DUPFD:
 	case F_GETFD:
 	case F_SETFD:
 	case F_GETFL:
 		giant_locked = 0;
 		break;
 
 	default:
 		giant_locked = 1;
 		mtx_lock(&Giant);
 	}
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 	FILEDESC_LOCK(fdp);
 	/* The unsigned compare also rejects negative descriptors. */
 	if ((unsigned)fd >= fdp->fd_nfiles ||
 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		error = EBADF;
 		goto done2;
 	}
 	pop = &fdp->fd_ofileflags[fd];
 
 	switch (cmd) {
 	case F_DUPFD:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		FILEDESC_UNLOCK(fdp);
 		newmin = arg;
 		/* The requested minimum must be below both descriptor limits. */
 		PROC_LOCK(p);
 		if (newmin >= lim_cur(p, RLIMIT_NOFILE) ||
 		    newmin >= maxfilesperproc) {
 			PROC_UNLOCK(p);
 			error = EINVAL;
 			break;
 		}
 		PROC_UNLOCK(p);
 		error = do_dup(td, DUP_VARIABLE, fd, newmin, td->td_retval);
 		break;
 
 	case F_GETFD:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 		FILEDESC_UNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		*pop = (*pop &~ UF_EXCLOSE) |
 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 		FILEDESC_UNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		/* mtx_assert(&Giant, MA_NOTOWNED); */
 		FILE_LOCK(fp);
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		FILE_UNLOCK(fp);
 		FILEDESC_UNLOCK(fdp);
 		break;
 
 	case F_SETFL:
 		mtx_assert(&Giant, MA_OWNED);
 		FILE_LOCK(fp);
 		/* Hold a reference across the ioctls issued without locks. */
 		fhold_locked(fp);
 		fp->f_flag &= ~FCNTLFLAGS;
 		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		FILE_UNLOCK(fp);
 		FILEDESC_UNLOCK(fdp);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		/* FIOASYNC failed: turn non-blocking mode back off. */
 		FILE_LOCK(fp);
 		fp->f_flag &= ~FNONBLOCK;
 		FILE_UNLOCK(fp);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		mtx_assert(&Giant, MA_OWNED);
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		mtx_assert(&Giant, MA_OWNED);
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLKW:
 		mtx_assert(&Giant, MA_OWNED);
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 		mtx_assert(&Giant, MA_OWNED);
 		/* Advisory locks are only supported on vnode-backed files. */
 		if (fp->f_type != DTYPE_VNODE) {
 			FILEDESC_UNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			/* Reject a start that would overflow once rebased. */
 			if (fp->f_offset < 0 ||
 			    (flp->l_start > 0 &&
 			     fp->f_offset > OFF_MAX - flp->l_start)) {
 				FILEDESC_UNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
 			flp->l_start += fp->f_offset;
 		}
 
 		/*
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		vp = fp->f_vnode;
 
 		switch (flp->l_type) {
 		case F_RDLCK:
 			/* A read lock needs the file open for reading. */
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			/* A write lock needs the file open for writing. */
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, F_POSIX);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		/* Check for race with close */
 		FILEDESC_LOCK_FAST(fdp);
 		if ((unsigned) fd >= fdp->fd_nfiles ||
 		    fp != fdp->fd_ofiles[fd]) {
 			/*
 			 * The descriptor no longer refers to this file:
 			 * release every lock held on the whole file.
 			 */
 			FILEDESC_UNLOCK_FAST(fdp);
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 					   F_UNLCK, flp, F_POSIX);
 		} else
 			FILEDESC_UNLOCK_FAST(fdp);
 		fdrop(fp, td);
 		break;
 
 	case F_GETLK:
 		mtx_assert(&Giant, MA_OWNED);
 		if (fp->f_type != DTYPE_VNODE) {
 			FILEDESC_UNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			FILEDESC_UNLOCK(fdp);
 			error = EINVAL;
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			/* Guard the rebased start against overflow both ways. */
 			if ((flp->l_start > 0 &&
 			    fp->f_offset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			     fp->f_offset < OFF_MIN - flp->l_start)) {
 				FILEDESC_UNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
 			flp->l_start += fp->f_offset;
 		}
 		/*
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 	default:
 		FILEDESC_UNLOCK(fdp);
 		error = EINVAL;
 		break;
 	}
 done2:
 	/* Release Giant only if it was taken at the top. */
 	if (giant_locked)
 		mtx_unlock(&Giant);
 	return (error);
 }
 
 /*
  * Common code for dup, dup2, and fcntl(F_DUPFD).
  *
  * Duplicates descriptor `old' of td's process.  With DUP_FIXED the copy
  * is installed at exactly `new', replacing and closing whatever was open
  * there; with DUP_VARIABLE `new' is passed to fdalloc() as the starting
  * point and the descriptor it allocates is used.  The resulting
  * descriptor number is stored in *retval.
  *
  * Returns 0 on success, EBADF if either descriptor is negative or `old'
  * is not open (or changes underneath us), EMFILE if `new' is at or above
  * the descriptor limit, or the error from fdalloc().
  */
 static int
 do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	struct file *fp;
 	struct file *delfp;
 	int error, holdleaders, maxfd;
 
 	KASSERT((type == DUP_VARIABLE || type == DUP_FIXED),
 	    ("invalid dup type %d", type));
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to.
 	 */
 	if (old < 0 || new < 0)
 		return (EBADF);
 	PROC_LOCK(p);
 	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 	if (new >= maxfd)
 		return (EMFILE);
 
 	FILEDESC_LOCK(fdp);
 	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 	/* dup2() of a descriptor onto itself is a successful no-op. */
 	if (type == DUP_FIXED && old == new) {
 		*retval = new;
 		FILEDESC_UNLOCK(fdp);
 		return (0);
 	}
 	fp = fdp->fd_ofiles[old];
 	/* This reference becomes the one owned by the new descriptor. */
 	fhold(fp);
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.  Since the filedesc
 	 * lock may be temporarily dropped in the process, we have to look
 	 * out for a race.
 	 */
 	if (type == DUP_FIXED) {
 		if (new >= fdp->fd_nfiles)
 			fdgrowtable(fdp, new + 1);
 		if (fdp->fd_ofiles[new] == NULL)
 			fdused(fdp, new);
 	} else {
 		if ((error = fdalloc(td, new, &new)) != 0) {
 			FILEDESC_UNLOCK(fdp);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	/*
 	 * If the old file changed out from under us then treat it as a
 	 * bad file descriptor.  Userland should do its own locking to
 	 * avoid this case.
 	 */
 	if (fdp->fd_ofiles[old] != fp) {
 		/* we've allocated a descriptor which we won't use */
 		if (fdp->fd_ofiles[new] == NULL)
 			fdunused(fdp, new);
 		FILEDESC_UNLOCK(fdp);
 		fdrop(fp, td);
 		return (EBADF);
 	}
 	KASSERT(old != new,
 	    ("new fd is same as old"));
 
 	/*
 	 * Save info on the descriptor being overwritten.  We cannot close
 	 * it without introducing an ownership race for the slot, since we
 	 * need to drop the filedesc lock to call closef().
 	 *
 	 * XXX this duplicates parts of close().
 	 */
 	delfp = fdp->fd_ofiles[new];
 	holdleaders = 0;
 	if (delfp != NULL) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 			holdleaders = 1;
 		}
 	}
 
 	/*
 	 * Duplicate the source descriptor
 	 */
 	fdp->fd_ofiles[new] = fp;
 	/* The copy never inherits the close-on-exec flag. */
 	fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
 	if (new > fdp->fd_lastfile)
 		fdp->fd_lastfile = new;
 	*retval = new;
 
 	/*
 	 * If we dup'd over a valid file, we now own the reference to it
 	 * and must dispose of it using closef() semantics (as if a
 	 * close() were performed on it).
 	 *
 	 * XXX this duplicates parts of close().
 	 */
 	if (delfp != NULL) {
 		knote_fdclose(td, new);
 		FILEDESC_UNLOCK(fdp);
 		(void) closef(delfp, td);
 		if (holdleaders) {
 			/* Drop our hold and wake a waiter if we were the last. */
 			FILEDESC_LOCK_FAST(fdp);
 			fdp->fd_holdleaderscount--;
 			if (fdp->fd_holdleaderscount == 0 &&
 			    fdp->fd_holdleaderswakeup != 0) {
 				fdp->fd_holdleaderswakeup = 0;
 				wakeup(&fdp->fd_holdleaderscount);
 			}
 			FILEDESC_UNLOCK_FAST(fdp);
 		}
 	} else {
 		FILEDESC_UNLOCK(fdp);
 	}
 	return (0);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	*(sigio->sio_myref) = NULL;
 	if ((sigio)->sio_pgid < 0) {
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 			     sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
 	crfree(sigio->sio_ucred);
 	FREE(sigio, M_SIGIO);
 }
 
 /*
  * Free a list of sigio structures.
  *
  * 'sigiolst' is the head of a sigio list that belongs to exactly one
  * process or one process group; the owner is derived from the first
  * entry and every further entry is asserted to match it.  An empty
  * list is a no-op.
  *
  * We only need to lock the SIGIO_LOCK because we have made ourselves
  * inaccessible to callers of fsetown and therefore do not need to lock
  * the proc or pgrp struct for the list manipulation.
  *
  * NOTE(review): despite the paragraph above, the loop below does take
  * the proc/pgrp lock around each SLIST_REMOVE().
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio;
 
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 	p = NULL;
 	pg = NULL;
 
 	/*
 	 * Every entry of the list should belong
 	 * to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	}
 
 	SIGIO_LOCK();
 	/* Always take the current head: entries are removed as we go. */
 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 		/* Clear the owner's pointer to this sigio. */
 		*(sigio->sio_myref) = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 			PGRP_LOCK(pg);
 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PGRP_UNLOCK(pg);
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 			PROC_LOCK(p);
 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PROC_UNLOCK(p);
 		}
 		/*
 		 * The sigio lock is dropped while the entry's credential
 		 * and storage are released, then retaken for the next
 		 * pass of the loop.
 		 */
 		SIGIO_UNLOCK();
 		crfree(sigio->sio_ucred);
 		FREE(sigio, M_SIGIO);
 		SIGIO_LOCK();
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  *
  * 'pgid' selects the new owner: a positive value names a process, a
  * negative value names the process group -pgid, and zero just removes
  * the current owner.  'sigiop' is the caller's slot that points at the
  * sigio; it is cleared through funsetown() and then set to the new
  * sigio on success.
  *
  * Returns 0 on success, ESRCH if the target does not exist (or the
  * target process has P_WEXIT set), and EPERM if the target is in a
  * different session than the calling process.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	/* Allocate and fill in the new sigio out of locks. */
 	MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	sx_slock(&proctree_lock);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		/*
 		 * NOTE(review): pfind() presumably returns the process
 		 * locked; the lock is dropped here while proctree_lock
 		 * is still held shared.
 		 */
 		PROC_UNLOCK(proc);
 		if (proc->p_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		pgrp = NULL;
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 		/* NOTE(review): pgfind() presumably returns the pgrp locked. */
 		PGRP_UNLOCK(pgrp);
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		proc = NULL;
 	}
 	/*
 	 * Drop whatever owner was registered before.  Note that this
 	 * happens before the P_WEXIT check below, so the old owner is
 	 * gone even if that check fails.
 	 */
 	funsetown(sigiop);
 	if (pgid > 0) {
 		PROC_LOCK(proc);
 		/*
 		 * Since funsetownlst() is called without the proctree
 		 * locked, we need to check for P_WEXIT.
 		 * XXX: is ESRCH correct?
 		 */
 		if ((proc->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(proc);
 			ret = ESRCH;
 			goto fail;
 		}
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_proc = proc;
 		PROC_UNLOCK(proc);
 	} else {
 		PGRP_LOCK(pgrp);
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	/* Publish the new sigio through the caller's slot. */
 	SIGIO_LOCK();
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	return (0);
 
 fail:
 	/* Common error exit: undo the allocation and credential hold. */
 	sx_sunlock(&proctree_lock);
 	crfree(sigio->sio_ucred);
 	FREE(sigio, M_SIGIO);
 	return (ret);
 }
 
 /*
  * Common code for the FIOGETOWN ioctl, as reached through
  * fcntl(fd, F_GETOWN, arg).
  *
  * Returns the sio_pgid recorded in the sigio that *sigiop points to,
  * or 0 when no sigio is registered.  The slot is read under the sigio
  * lock.
  */
 pid_t
 fgetown(struct sigio **sigiop)
 {
 	struct sigio *sio;
 	pid_t owner;
 
 	SIGIO_LOCK();
 	sio = *sigiop;
 	if (sio == NULL)
 		owner = 0;
 	else
 		owner = sio->sio_pgid;
 	SIGIO_UNLOCK();
 	return (owner);
 }
 
 /*
  * Close a file descriptor.
  *
  * Detaches uap->fd from the calling process's descriptor table and
  * disposes of the table's reference on the file with closef().
  * Returns EBADF if the descriptor is out of range or not open,
  * otherwise the value returned by closef().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 close(td, uap)
 	struct thread *td;
 	struct close_args *uap;
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int fd, error;
 	int holdleaders;
 
 	fd = uap->fd;
 	error = 0;
 	holdleaders = 0;
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_LOCK(fdp);
 	/* The unsigned cast makes a negative fd fail the range check. */
 	if ((unsigned)fd >= fdp->fd_nfiles ||
 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 	/* Empty the slot and mark the descriptor free. */
 	fdp->fd_ofiles[fd] = NULL;
 	fdp->fd_ofileflags[fd] = 0;
 	fdunused(fdp, fd);
 	if (td->td_proc->p_fdtol != NULL) {
 		/*
 		 * Ask fdfree() to sleep to ensure that all relevant
 		 * process leaders can be traversed in closef().
 		 */
 		fdp->fd_holdleaderscount++;
 		holdleaders = 1;
 	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the descriptor
 	 * array.
 	 * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a
 	 * race of the fd getting opened, a knote added, and deleting a knote
 	 * for the new fd.
 	 */
 	knote_fdclose(td, fd);
 	FILEDESC_UNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
 		/*
 		 * Undo the hold taken above and wake a sleeper that asked
 		 * to be notified once the count reaches zero.
 		 */
 		FILEDESC_LOCK_FAST(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_UNLOCK_FAST(fdp);
 	}
 	return (error);
 }
 
 #if defined(COMPAT_43)
 /*
  * Old-format (struct ostat) status query on a file descriptor.
  *
  * The status is obtained with kern_fstat(), converted with cvtstat()
  * and copied out to uap->sb.  Returns the kern_fstat() error if the
  * lookup fails, otherwise the result of copyout().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct stat st;
 	struct ostat ost;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &st);
 	if (error != 0)
 		return (error);
 	cvtstat(&st, &ost);
 	return (copyout(&ost, uap->sb, sizeof(ost)));
 }
 #endif /* COMPAT_43 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG(fd, fd);
 
 	if ((error = fget(td, fd, &fp)) != 0)
 		return (error);
 
 	AUDIT_ARG(file, td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 nfstat(struct thread *td, struct nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 
 /*
  * Return pathconf information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	int error;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 
 	/* If asynchronous I/O is available, it works for all descriptors. */
 	if (uap->name == _PC_ASYNC_IO) {
 		td->td_retval[0] = async_io_version;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		int vfslocked;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
 		VOP_UNLOCK(vp, 0, td);
 		VFS_UNLOCK_GIANT(vfslocked);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (uap->name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
 		error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.  This may
  * block and drop the filedesc lock, but it will reacquire it before
  * returning.
  *
  * Must be called with the filedesc lock held.  The requested size is
  * rounded up to a multiple of NDENTRIES.  If the table is already big
  * enough, either on entry or after the lock was retaken, nothing is
  * changed.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct file **ntable;
 	char *nfileflags;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap;
 
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 
 	KASSERT(fdp->fd_nfiles > 0,
 	    ("zero-length file table"));
 
 	/* compute the size of the new table */
 	onfiles = fdp->fd_nfiles;
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/* allocate a new table and (if required) new bitmaps */
 	FILEDESC_UNLOCK(fdp);
 	MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/*
 	 * The per-descriptor flag bytes live in the same allocation,
 	 * directly behind the nnfiles file pointers.
 	 */
 	nfileflags = (char *)&ntable[nnfiles];
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
 		MALLOC(nmap, NDSLOTTYPE *, NDSLOTS(nnfiles) * NDSLOTSIZE,
 		    M_FILEDESC, M_ZERO | M_WAITOK);
 	else
 		nmap = NULL;
 	FILEDESC_LOCK(fdp);
 
 	/*
 	 * We now have new tables ready to go.  Since we dropped the
 	 * filedesc lock to call malloc(), watch out for a race.
 	 */
 	onfiles = fdp->fd_nfiles;
 	if (onfiles >= nnfiles) {
 		/* we lost the race, but that's OK */
 		free(ntable, M_FILEDESC);
 		if (nmap != NULL)
 			free(nmap, M_FILEDESC);
 		return;
 	}
 	/* Carry the existing entries and flags over to the new arrays. */
 	bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
 	bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
 	/*
 	 * Only a table larger than NDFILE entries is freed here; fdinit()
 	 * starts every filedesc with an NDFILE-entry table embedded in
 	 * struct filedesc0.
 	 */
 	if (onfiles > NDFILE)
 		free(fdp->fd_ofiles, M_FILEDESC);
 	fdp->fd_ofiles = ntable;
 	fdp->fd_ofileflags = nfileflags;
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
 		/* Same rule for the bitmap: the initial one is embedded. */
 		if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 			free(fdp->fd_map, M_FILEDESC);
 		fdp->fd_map = nmap;
 	}
 	fdp->fd_nfiles = nnfiles;
 }
 
 /*
  * Allocate a file descriptor for the process.
  *
  * Must be called with the filedesc lock held.  The search starts at
  * the larger of 'minfd' and fd_freefile.  On success the descriptor is
  * marked used, stored in *result and 0 is returned; the slot in
  * fd_ofiles is left NULL for the caller to fill in.  Returns EMFILE
  * when no descriptor below min(RLIMIT_NOFILE, maxfilesperproc) is
  * available.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd = -1, maxfd;
 
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;	   
 
 	PROC_LOCK(p);
 	maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 
 	/*
 	 * Search the bitmap for a free descriptor.  If none is found, try
 	 * to grow the file table.  Keep at it until we either get a file
 	 * descriptor or run into process or system limits; fdgrowtable()
 	 * may drop the filedesc lock, so we're in a race.
 	 */
 	for (;;) {
 		fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 		if (fd >= maxfd)
 			return (EMFILE);
 		if (fd < fdp->fd_nfiles)
 			break;
 		/* Nothing free in the current table: double it, up to maxfd. */
 		fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd] == NULL,
 	    ("free descriptor isn't"));
 	fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Report whether 'n' more file descriptors are available to the
  * process that owns thread 'td'.
  *
  * Must be called with the filedesc lock held.  Descriptors the table
  * could still grow by (up to min(RLIMIT_NOFILE, maxfilesperproc)) are
  * counted first, then the empty slots from fd_freefile upwards.
  * Returns 1 if at least 'n' descriptors are available, 0 otherwise.
  */
 int
 fdavail(struct thread *td, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int fd, headroom, lim, last;
 
 	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
 
 	PROC_LOCK(p);
 	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
 	PROC_UNLOCK(p);
 
 	/* Room the table may still grow into counts as available. */
 	headroom = lim - fdp->fd_nfiles;
 	if (headroom > 0) {
 		n -= headroom;
 		if (n <= 0)
 			return (1);
 	}
 
 	/* Count empty slots in the existing table, within the limit. */
 	last = min(fdp->fd_nfiles, lim);
 	for (fd = fdp->fd_freefile; fd < last; fd++) {
 		if (fdp->fd_ofiles[fd] == NULL && --n <= 0)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate
  * a file descriptor for the process that refers to it.
  * We add one reference to the file for the descriptor table
  * and one reference for resultfp. This is to prevent us being
  * preempted and the entry in the descriptor table closed after
  * we release the FILEDESC lock.
  *
  * 'resultfp' and 'resultfd' may each be NULL; the extra file reference
  * is only taken when 'resultfp' is non-NULL.  Returns 0 on success,
  * ENFILE when the system-wide open file limit is reached, or the
  * error from fdalloc().
  */
 int
 falloc(struct thread *td, struct file **resultfp, int *resultfd)
 {
 	struct proc *p = td->td_proc;
 	struct file *fp, *fq;
 	int error, i;
 	/* Only a thread that passes suser_cred() may use the last 5%. */
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	static struct timeval lastfail;
 	static int curfail;
 
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	sx_xlock(&filelist_lock);
 
 	if ((openfiles >= maxuserfiles &&
 	     suser_cred(td->td_ucred, SUSER_RUID) != 0) ||
 	    openfiles >= maxfiles) {
 		/* Rate-limit the console message with ppsratecheck(). */
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
 				td->td_ucred->cr_ruid);
 		}
 		sx_xunlock(&filelist_lock);
 		uma_zfree(file_zone, fp);
 		return (ENFILE);
 	}
 	openfiles++;
 
 	/*
 	 * If the process has file descriptor zero open, add the new file
 	 * descriptor to the list of open files at that point, otherwise
 	 * put it at the front of the list of open files.
 	 */
 	fp->f_mtxp = mtx_pool_alloc(mtxpool_sleep);
 	/* One reference for the descriptor table ... */
 	fp->f_count = 1;
 	/* ... and one for the pointer handed back to the caller. */
 	if (resultfp)
 		fp->f_count++;
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fp->f_vnode = NULL;
 	FILEDESC_LOCK(p->p_fd);
 	if ((fq = p->p_fd->fd_ofiles[0])) {
 		LIST_INSERT_AFTER(fq, fp, f_list);
 	} else {
 		LIST_INSERT_HEAD(&filehead, fp, f_list);
 	}
 	sx_xunlock(&filelist_lock);
 	if ((error = fdalloc(td, 0, &i))) {
 		FILEDESC_UNLOCK(p->p_fd);
 		/* Give back every reference that was set up above. */
 		fdrop(fp, td);
 		if (resultfp)
 			fdrop(fp, td);
 		return (error);
 	}
 	p->p_fd->fd_ofiles[i] = fp;
 	FILEDESC_UNLOCK(p->p_fd);
 	if (resultfp)
 		*resultfp = fp;
 	if (resultfd)
 		*resultfd = i;
 	return (0);
 }
 
 /*
  * Build a fresh filedesc structure, optionally seeded from 'fdp'.
  *
  * When 'fdp' is non-NULL its current, root and jail root directory
  * vnodes are copied and each non-NULL one gains a reference.  The new
  * structure starts with one reference, one hold, the default creation
  * mask, and the NDFILE-entry descriptor table, flag array and bitmap
  * that are embedded in struct filedesc0.  No descriptors are copied.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp)
 {
 	struct filedesc0 *fd0;
 	struct filedesc *nfdp;
 
 	fd0 = malloc(sizeof(*fd0), M_FILEDESC, M_WAITOK | M_ZERO);
 	nfdp = &fd0->fd_fd;
 	mtx_init(&nfdp->fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
 	if (fdp != NULL) {
 		/* Inherit the directory vnodes under the source's lock. */
 		FILEDESC_LOCK(fdp);
 		if ((nfdp->fd_cdir = fdp->fd_cdir) != NULL)
 			VREF(nfdp->fd_cdir);
 		if ((nfdp->fd_rdir = fdp->fd_rdir) != NULL)
 			VREF(nfdp->fd_rdir);
 		if ((nfdp->fd_jdir = fdp->fd_jdir) != NULL)
 			VREF(nfdp->fd_jdir);
 		FILEDESC_UNLOCK(fdp);
 	}
 
 	/* Point the descriptor table at the embedded initial arrays. */
 	nfdp->fd_refcnt = 1;
 	nfdp->fd_holdcnt = 1;
 	nfdp->fd_cmask = CMASK;
 	nfdp->fd_ofiles = fd0->fd_dfiles;
 	nfdp->fd_ofileflags = fd0->fd_dfileflags;
 	nfdp->fd_nfiles = NDFILE;
 	nfdp->fd_map = fd0->fd_dmap;
 	nfdp->fd_lastfile = -1;
 	return (nfdp);
 }
 
 /*
  * Take a hold on the filedesc of process 'p'.
  *
  * Under fdesc_mtx, reads p->p_fd and, if it is non-NULL, increments
  * its hold count.  Returns the filedesc pointer, or NULL if the
  * process has none.
  */
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *held;
 
 	mtx_lock(&fdesc_mtx);
 	if ((held = p->p_fd) != NULL)
 		held->fd_holdcnt++;
 	mtx_unlock(&fdesc_mtx);
 	return (held);
 }
 
 /*
  * Release a hold on 'fdp'.
  *
  * The hold count is decremented under fdesc_mtx.  When it is no longer
  * positive, the filedesc mutex is destroyed and the structure freed.
  */
 static void
 fddrop(struct filedesc *fdp)
 {
 	int remaining;
 
 	mtx_lock(&fdesc_mtx);
 	remaining = --fdp->fd_holdcnt;
 	mtx_unlock(&fdesc_mtx);
 
 	if (remaining <= 0) {
 		/* That was the last hold: tear the structure down. */
 		mtx_destroy(&fdp->fd_mtx);
 		FREE(fdp, M_FILEDESC);
 	}
 }
 
 /*
  * Add a sharer to a filedesc structure.
  *
  * Increments the reference count of 'fdp' under its lock and returns
  * the same pointer.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	FILEDESC_LOCK_FAST(fdp);
 	++fdp->fd_refcnt;
 	FILEDESC_UNLOCK_FAST(fdp);
 
 	return (fdp);
 }
 
 /*
  * Give process 'p' a private filedesc structure if its current one is
  * shared.
  *
  * If the reference count is greater than one, the table is copied with
  * fdcopy(), the old one is released with fdfree(td), and the copy is
  * installed in p->p_fd.  Otherwise nothing is changed.
  */
 void
 fdunshare(struct proc *p, struct thread *td)
 {
 	struct filedesc *copy;
 	int shared;
 
 	FILEDESC_LOCK_FAST(p->p_fd);
 	shared = (p->p_fd->fd_refcnt > 1);
 	FILEDESC_UNLOCK_FAST(p->p_fd);
 	if (!shared)
 		return;
 
 	copy = fdcopy(p->p_fd);
 	fdfree(td);
 	p->p_fd = copy;
 }
 
 /*
  * Copy a filedesc structure.
  * A NULL pointer in returns a NULL reference, this is to ease callers,
  * not catch errors.
  *
  * The copy is created with fdinit(), so it inherits the directory
  * vnodes of 'fdp'.  Every in-use descriptor except kqueue descriptors
  * is duplicated, each gaining a file reference.  The creation mask is
  * copied as well.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	int i;
 
 	/* Certain daemons might not have file descriptors. */
 	if (fdp == NULL)
 		return (NULL);
 
 	newfdp = fdinit(fdp);
 	FILEDESC_LOCK_FAST(fdp);
 	/*
 	 * Grow the new table until it can hold fd_lastfile.  The source
 	 * lock is dropped while growing, so the condition is re-checked
 	 * after it has been retaken.
 	 */
 	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 		FILEDESC_UNLOCK_FAST(fdp);
 		FILEDESC_LOCK(newfdp);
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 		FILEDESC_UNLOCK(newfdp);
 		FILEDESC_LOCK_FAST(fdp);
 	}
 	/* copy everything except kqueue descriptors */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
 		if (fdisused(fdp, i) &&
 		    fdp->fd_ofiles[i]->f_type != DTYPE_KQUEUE) {
 			newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
 			newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
 			fhold(newfdp->fd_ofiles[i]);
 			newfdp->fd_lastfile = i;
 		} else {
 			/* Remember the first slot that was not copied. */
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 		}
 	}
 	FILEDESC_UNLOCK_FAST(fdp);
 	/* Mark every copied slot as used in the new table's bitmap. */
 	FILEDESC_LOCK(newfdp);
 	for (i = 0; i <= newfdp->fd_lastfile; ++i)
 		if (newfdp->fd_ofiles[i] != NULL)
 			fdused(newfdp, i);
 	FILEDESC_UNLOCK(newfdp);
 	FILEDESC_LOCK_FAST(fdp);
 	/*
 	 * No gap was found: 'i' is now one past the new table's
 	 * fd_lastfile, which becomes the first free descriptor.
 	 */
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_UNLOCK_FAST(fdp);
 	return (newfdp);
 }
 
 /*
  * Release a filedesc structure.
  *
  * Drops the reference that the process of 'td' holds on its filedesc.
  * The steps are:
  *  1. If the process has a filedesc_to_leader entry, release POSIX
  *     advisory locks where required, wait for in-flight close()/
  *     do_dup()/closef() holders, and drop the entry.
  *  2. Decrement fd_refcnt; if other references remain, stop here.
  *  3. Otherwise close the files in the table, clear p_fd, free any
  *     dynamically sized table and bitmap, release the directory
  *     vnodes and drop the hold with fddrop().
  */
 void
 fdfree(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file **fpp;
 	int i, locked;
 	struct filedesc_to_leader *fdtol;
 	struct file *fp;
 	struct vnode *cdir, *jdir, *rdir, *vp;
 	struct flock lf;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	/* Check for special need to clear POSIX style locks */
 	fdtol = td->td_proc->p_fdtol;
 	if (fdtol != NULL) {
 		FILEDESC_LOCK(fdp);
 		KASSERT(fdtol->fdl_refcount > 0,
 			("filedesc_to_refcount botch: fdl_refcount=%d",
 			 fdtol->fdl_refcount));
 		if (fdtol->fdl_refcount == 1 &&
 		    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/*
 			 * Last user of this leader entry: unlock the whole
 			 * range of every vnode-backed file on behalf of
 			 * the leader.
 			 */
 			for (i = 0, fpp = fdp->fd_ofiles;
 			     i <= fdp->fd_lastfile;
 			     i++, fpp++) {
 				if (*fpp == NULL ||
 				    (*fpp)->f_type != DTYPE_VNODE)
 					continue;
 				fp = *fpp;
 				/* Keep the file alive while unlocked. */
 				fhold(fp);
 				FILEDESC_UNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				locked = VFS_LOCK_GIANT(vp->v_mount);
 				(void) VOP_ADVLOCK(vp,
 						   (caddr_t)td->td_proc->
 						   p_leader,
 						   F_UNLCK,
 						   &lf,
 						   F_POSIX);
 				VFS_UNLOCK_GIANT(locked);
 				FILEDESC_LOCK(fdp);
 				fdrop(fp, td);
 				/*
 				 * Recompute the cursor: fd_ofiles may
 				 * have been replaced while the lock was
 				 * dropped.
 				 */
 				fpp = fdp->fd_ofiles + i;
 			}
 		}
 	retry:
 		if (fdtol->fdl_refcount == 1) {
 			if (fdp->fd_holdleaderscount > 0 &&
 			    (td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 				/*
 				 * close() or do_dup() has cleared a reference
 				 * in a shared file descriptor table.
 				 */
 				fdp->fd_holdleaderswakeup = 1;
 				msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx,
 				       PLOCK, "fdlhold", 0);
 				goto retry;
 			}
 			if (fdtol->fdl_holdcount > 0) {
 				/*
 				 * Ensure that fdtol->fdl_leader
 				 * remains valid in closef().
 				 */
 				fdtol->fdl_wakeup = 1;
 				msleep(fdtol, &fdp->fd_mtx,
 				       PLOCK, "fdlhold", 0);
 				goto retry;
 			}
 		}
 		fdtol->fdl_refcount--;
 		if (fdtol->fdl_refcount == 0 &&
 		    fdtol->fdl_holdcount == 0) {
 			/* Unlink the entry from its doubly linked ring. */
 			fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 			fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 		} else
 			/* Still in use elsewhere: do not free it below. */
 			fdtol = NULL;
 		td->td_proc->p_fdtol = NULL;
 		FILEDESC_UNLOCK(fdp);
 		if (fdtol != NULL)
 			FREE(fdtol, M_FILEDESC_TO_LEADER);
 	}
 	FILEDESC_LOCK_FAST(fdp);
 	i = --fdp->fd_refcnt;
 	FILEDESC_UNLOCK_FAST(fdp);
 	if (i > 0)
 		return;
 	/*
 	 * We are the last reference to the structure, so we can
 	 * safely assume it will not change out from under us.
 	 */
 	fpp = fdp->fd_ofiles;
 	/* Visits slots 0..fd_lastfile, i.e. fd_lastfile + 1 entries. */
 	for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
 		if (*fpp)
 			(void) closef(*fpp, td);
 	}
 	FILEDESC_LOCK(fdp);
 
 	/* XXX This should happen earlier. */
 	mtx_lock(&fdesc_mtx);
 	td->td_proc->p_fd = NULL;
 	mtx_unlock(&fdesc_mtx);
 
 	/*
 	 * Only tables and bitmaps larger than the initial NDFILE-sized
 	 * ones are freed separately; the initial ones are embedded in
 	 * struct filedesc0 (see fdinit()).
 	 */
 	if (fdp->fd_nfiles > NDFILE)
 		FREE(fdp->fd_ofiles, M_FILEDESC);
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		FREE(fdp->fd_map, M_FILEDESC);
 
 	fdp->fd_nfiles = 0;
 
 	/* Detach the directory vnodes under the lock, release them after. */
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
 	FILEDESC_UNLOCK(fdp);
 
 	if (cdir) {
 		locked = VFS_LOCK_GIANT(cdir->v_mount);
 		vrele(cdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 	if (rdir) {
 		locked = VFS_LOCK_GIANT(rdir->v_mount);
 		vrele(rdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 	if (jdir) {
 		locked = VFS_LOCK_GIANT(jdir->v_mount);
 		vrele(jdir);
 		VFS_UNLOCK_GIANT(locked);
 	}
 
 	fddrop(fdp);
 }
 
 /*
  * For setugid programs, we don't want people to use that setugidness
  * to generate error messages which write to a file which would
  * otherwise be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since setugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  *
  * Returns 1 if 'fp' is a vnode-backed file whose vnode has VV_PROCDEP
  * set, 0 otherwise.
  */
 static int
 is_unsafe(struct file *fp)
 {
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (0);
 	return ((fp->f_vnode->v_vflag & VV_PROCDEP) != 0 ? 1 : 0);
 }
 
 /*
  * Make this setugid thing safe, if at all possible.
  *
  * Closes each of descriptors 0, 1 and 2 of the process of 'td' that
  * is open and that is_unsafe() rejects.
  */
 void
 setugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file *victim;
 	int fd;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	/*
 	 * fd_ofiles is indexed afresh on every access because the array
 	 * may be reallocated while we are blocked in a close.
 	 */
 	FILEDESC_LOCK(fdp);
 	for (fd = 0; fd <= 2 && fd <= fdp->fd_lastfile; fd++) {
 		if (fdp->fd_ofiles[fd] == NULL ||
 		    !is_unsafe(fdp->fd_ofiles[fd]))
 			continue;
 
 		knote_fdclose(td, fd);
 		/*
 		 * Empty the slot before closing so that nobody can pick
 		 * the file up from the table while the close blocks.
 		 */
 		victim = fdp->fd_ofiles[fd];
 		fdp->fd_ofiles[fd] = NULL;
 		fdp->fd_ofileflags[fd] = 0;
 		fdunused(fdp, fd);
 		FILEDESC_UNLOCK(fdp);
 		(void) closef(victim, td);
 		FILEDESC_LOCK(fdp);
 	}
 	FILEDESC_UNLOCK(fdp);
 }
 
 /*
  * Remove 'fp' from slot 'idx' of 'fdp', but only if that slot still
  * refers to 'fp'.
  *
  * On a match the slot is emptied, the descriptor is marked unused and
  * one reference on 'fp' is dropped after the filedesc lock has been
  * released.  If the slot holds anything else, nothing is changed.
  */
 void
 fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
 {
 
 	FILEDESC_LOCK(fdp);
 	if (fdp->fd_ofiles[idx] != fp) {
 		/* The slot was reused or already cleared. */
 		FILEDESC_UNLOCK(fdp);
 		return;
 	}
 	fdp->fd_ofiles[idx] = NULL;
 	fdunused(fdp, idx);
 	FILEDESC_UNLOCK(fdp);
 
 	fdrop(fp, td);
 }
 
 /*
  * Close every descriptor of the process of 'td' that has UF_EXCLOSE
  * set in its descriptor flags.
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file *victim;
 	int fd;
 
 	/* Certain daemons might not have file descriptors. */
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return;
 
 	FILEDESC_LOCK(fdp);
 
 	/*
 	 * fd_ofiles and fd_ofileflags are indexed afresh on every access:
 	 * operations may block and rip them out from under us.
 	 */
 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 		if (fdp->fd_ofiles[fd] == NULL ||
 		    (fdp->fd_ofileflags[fd] & UF_EXCLOSE) == 0)
 			continue;
 
 		knote_fdclose(td, fd);
 		/*
 		 * Empty the slot before closing so that nobody can pick
 		 * the file up from the table while the close blocks.
 		 */
 		victim = fdp->fd_ofiles[fd];
 		fdp->fd_ofiles[fd] = NULL;
 		fdp->fd_ofileflags[fd] = 0;
 		fdunused(fdp, fd);
 		FILEDESC_UNLOCK(fdp);
 		(void) closef(victim, td);
 		FILEDESC_LOCK(fdp);
 	}
 	FILEDESC_UNLOCK(fdp);
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  *
  * The first missing descriptor is created by opening /dev/null; any
  * further missing one is made with do_dup() from that descriptor.
  * Returns 0 on success (or when the process has no filedesc), else
  * the first error from falloc(), vn_open() or do_dup().
  *
  * NOTE(review): the declaration lines of this function carry patch
  * change markers while the body below still uses nd, fp, fd and
  * flags -- confirm the body matches the intended revision.
  */
 int
 fdcheckstd(struct thread *td)
 {
-	struct nameidata nd;
 	struct filedesc *fdp;
-	struct file *fp;
-	register_t retval;
-	int fd, i, error, flags, devnull;
+	register_t retval, save;
+	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
 		return (0);
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	/* devnull < 0 means /dev/null has not been opened yet. */
 	devnull = -1;
 	error = 0;
 	for (i = 0; i < 3; i++) {
 		if (fdp->fd_ofiles[i] != NULL)
 			continue;
 		if (devnull < 0) {
 			int vfslocked;
 			error = falloc(td, &fp, &fd);
 			if (error != 0)
 				break;
 			/* Note extra ref on `fp' held for us by falloc(). */
 			KASSERT(fd == i, ("oof, we didn't get our fd"));
 			NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE,
 			    "/dev/null", td);
 			flags = FREAD | FWRITE;
 			error = vn_open(&nd, &flags, 0, fd);
 			if (error != 0) {
 				/*
 				 * Someone may have closed the entry in the
 				 * file descriptor table, so check it hasn't
 				 * changed before dropping the reference count.
 				 */
 				FILEDESC_LOCK(fdp);
 				KASSERT(fdp->fd_ofiles[fd] == fp,
 				    ("table not shared, how did it change?"));
 				fdp->fd_ofiles[fd] = NULL;
 				fdunused(fdp, fd);
 				FILEDESC_UNLOCK(fdp);
 				/*
 				 * Two drops: the descriptor table's
 				 * reference and the extra one from falloc().
 				 */
 				fdrop(fp, td);
 				fdrop(fp, td);
 				break;
 			}
 			vfslocked = NDHASGIANT(&nd);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			/* Turn the placeholder file into a vnode-backed one. */
 			fp->f_flag = flags;
 			fp->f_vnode = nd.ni_vp;
 			if (fp->f_data == NULL)
 				fp->f_data = nd.ni_vp;
 			if (fp->f_ops == &badfileops)
 				fp->f_ops = &vnops;
 			fp->f_type = DTYPE_VNODE;
 			VOP_UNLOCK(nd.ni_vp, 0, td);
 			VFS_UNLOCK_GIANT(vfslocked);
 			devnull = fd;
 			/* Drop the extra reference from falloc(). */
 			fdrop(fp, td);
 		} else {
 			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
 			if (error != 0)
 				break;
 		}
 	}
 	return (error);
 }
 
 /*
  * Internal form of close.
  * Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
  * For a vnode-backed file and a non-NULL 'td', POSIX advisory locks
  * are released first: on behalf of the process leader if it has
  * P_ADVLOCK set and, when the process has a filedesc_to_leader entry,
  * on behalf of every other leader in that ring that has P_ADVLOCK set.
  * Returns the result of fdrop().
  *
  * XXXRW: Giant is not required for the caller, but often will be held; this
  * makes it moderately likely the Giant will be recursed in the VFS case.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
 		int vfslocked;
 
 		vp = fp->f_vnode;
 		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/* Whole-file unlock on behalf of our own leader. */
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 					   F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table
 			 * is shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_LOCK(fdp);
 			/* Walk the ring, skipping our own entry. */
 			for (fdtol = fdtol->fdl_next;
 			     fdtol != td->td_proc->p_fdtol;
 			     fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				     P_ADVLOCK) == 0)
 					continue;
 				/*
 				 * Hold the entry while the filedesc lock
 				 * is dropped for the unlock call; fdfree()
 				 * sleeps while fdl_holdcount is non-zero.
 				 */
 				fdtol->fdl_holdcount++;
 				FILEDESC_UNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 						   (caddr_t)fdtol->fdl_leader,
 						   F_UNLCK, &lf, F_POSIX);
 				FILEDESC_LOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_UNLOCK(fdp);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	return (fdrop(fp, td));
 }
 
 /*
  * Extract the file pointer associated with the specified descriptor for
  * the current user process.
  *
  * If the descriptor doesn't exist, EBADF is returned.
  *
  * If the descriptor exists but doesn't match 'flags' then
  * return EBADF for read attempts and EINVAL for write attempts.
  *
  * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
  * It should be dropped with fdrop().
  * If it is not set, then the refcount will not be bumped however the
  * thread's filedesc struct will be returned locked (for fgetsock).
  *
  * If an error occured the non-zero error is returned and *fpp is set to NULL.
  * Otherwise *fpp is set and zero is returned.
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
 	FILEDESC_LOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
 	 * Note: FREAD failure returns EBADF to maintain backwards
 	 * compatibility with what routines returned before.
 	 *
 	 * Only one flag, or 0, may be specified.
 	 */
 	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
 		FILEDESC_UNLOCK(fdp);
 		return (EINVAL);
 	}
 	if (hold) {
 		fhold(fp);
 		FILEDESC_UNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (0);
 }
 
 /*
  * Look up descriptor 'fd' of thread 'td' without an access-mode check
  * and return the file in *fpp with an extra reference held.
  */
 int
 fget(struct thread *td, int fd, struct file **fpp)
 {
 	int error;
 
 	error = _fget(td, fd, fpp, 0, 1);
 	return (error);
 }
 
 int
 fget_read(struct thread *td, int fd, struct file **fpp)
 {
 
 	return(_fget(td, fd, fpp, FREAD, 1));
 }
 
 int
 fget_write(struct thread *td, int fd, struct file **fpp)
 {
 
 	return(_fget(td, fd, fpp, FWRITE, 1));
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if
  * the descriptor does not represent a vnode.  Note that pipes use vnodes
  * but never have VM objects.  The returned vnode will be vref()d.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, struct vnode **vpp, int flags)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
 	if ((error = _fget(td, fd, &fp, 0, 0)) != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 	} else {
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
 	FILEDESC_UNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
 /*
  * Fetch the vnode behind descriptor 'fd'; passes flags 0 to _fgetvp().
  */
 int
 fgetvp(struct thread *td, int fd, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, vpp, 0);
 	return (error);
 }
 
 int
 fgetvp_read(struct thread *td, int fd, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, vpp, FREAD));
 }
 
 #ifdef notyet
 /*
  * Fetch the vnode behind descriptor 'fd'; passes FWRITE to _fgetvp().
  * Only compiled when 'notyet' is defined.
  */
 int
 fgetvp_write(struct thread *td, int fd, struct vnode **vpp)
 {
 	int error;
 
 	error = _fgetvp(td, fd, vpp, FWRITE);
 	return (error);
 }
 #endif
 
 /*
  * Like fget() but loads the underlying socket, or returns an error if
  * the descriptor does not represent a socket.
  *
  * We bump the ref count on the returned socket.  XXX Also obtain the SX
  * lock in the future.
  */
 int
 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
 {
 	struct file *fp;
 	struct socket *so;
 	int error;
 
 	NET_ASSERT_GIANT();
 
 	/* Give the outputs defined values for the error paths. */
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
 
 	/* hold == 0: the filedesc lock is dropped at the end of this function. */
 	error = _fget(td, fd, &fp, 0, 0);
 	if (error != 0)
 		return (error);
 
 	if (fp->f_type == DTYPE_SOCKET) {
 		so = fp->f_data;
 		*spp = so;
 		if (fflagp != NULL)
 			*fflagp = fp->f_flag;
 		/* Take the socket reference under the socket lock. */
 		SOCK_LOCK(so);
 		soref(so);
 		SOCK_UNLOCK(so);
 	} else
 		error = ENOTSOCK;
 
 	FILEDESC_UNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
 /*
  * Drop the reference count on the socket and XXX release the SX lock in
  * the future.  The last reference closes the socket.
  */
 void
 fputsock(struct socket *so)
 {
 
 	NET_ASSERT_GIANT();
 	/*
 	 * Take the accept lock first, then the socket lock, and hand both
 	 * to sorele().  Neither is released in this function.
 	 * NOTE(review): presumably sorele() drops them -- confirm against
 	 * its definition.
 	 */
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	sorele(so);
 }
 
 /*
  * Release one reference on 'fp'.  Takes the file lock and defers to
  * fdrop_locked(), whose return value is passed through.
  */
 int
 fdrop(struct file *fp, struct thread *td)
 {
 	int error;
 
 	FILE_LOCK(fp);
 	error = fdrop_locked(fp, td);
 	return (error);
 }
 
 /*
  * Drop reference on struct file passed in, may call closef if the
  * reference hits zero.
  * Expects struct file locked, and will unlock it.
  */
 static int
 fdrop_locked(struct file *fp, struct thread *td)
 {
 	int error;
 
 	FILE_LOCK_ASSERT(fp, MA_OWNED);
 
 	/* Other references remain: just unlock and report success. */
 	if (--fp->f_count > 0) {
 		FILE_UNLOCK(fp);
 		return (0);
 	}
 
 	/*
 	 * We might have just dropped the last reference to a file
 	 * object that is for a UNIX domain socket whose message
 	 * buffers are being examined in unp_gc().  If that is the
 	 * case, FWAIT will be set in f_gcflag and we need to wait for
 	 * unp_gc() to finish its scan.
 	 */
 	while (fp->f_gcflag & FWAIT)
 		msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0);
 
 	/* We have the last ref so we can proceed without the file lock. */
 	FILE_UNLOCK(fp);
 	if (fp->f_count < 0)
 		panic("fdrop: count < 0");
 	/*
 	 * Invoke the close method unless the file is still wired to
 	 * badfileops; its result becomes this function's return value.
 	 */
 	if (fp->f_ops != &badfileops)
 		error = fo_close(fp, td);
 	else
 		error = 0;
 
 	/*
 	 * Tear the file down: unlink it from the global file list and
 	 * adjust openfiles under filelist_lock, then release the
 	 * credential and return the structure to file_zone.
 	 */
 	sx_xlock(&filelist_lock);
 	LIST_REMOVE(fp, f_list);
 	openfiles--;
 	sx_xunlock(&filelist_lock);
 	crfree(fp->f_cred);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on
  * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;	/* descriptor to lock or unlock */
 	int	how;	/* LOCK_SH, LOCK_EX or LOCK_UN, optionally with LOCK_NB */
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	int error;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
 		return (error);
 	/* Only vnode-backed descriptors are accepted. */
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	mtx_lock(&Giant);
 	vp = fp->f_vnode;
 	/* Describe the whole file: offset 0 from SEEK_SET, length 0. */
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	/* LOCK_UN is tested first and wins over any other bits in 'how'. */
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		FILE_LOCK(fp);
 		fp->f_flag &= ~FHASLOCK;
 		FILE_UNLOCK(fp);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done2;
 	}
 	/* LOCK_EX takes precedence over LOCK_SH; neither set is EBADF. */
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done2;
 	}
 	/*
 	 * FHASLOCK is set before the lock is requested and is not cleared
 	 * in this function if VOP_ADVLOCK() returns an error.
 	 */
 	FILE_LOCK(fp);
 	fp->f_flag |= FHASLOCK;
 	FILE_UNLOCK(fp);
 	/* F_WAIT is added unless the caller passed LOCK_NB. */
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	/* Drop the reference taken by fget() before releasing Giant. */
 	fdrop(fp, td);
 	mtx_unlock(&Giant);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  */
 /*
  * Parameters:
  *	fdp	descriptor table being modified
  *	indx	slot that receives the file
  *	dfd	descriptor being duplicated (ENODEV) or moved (ENXIO)
  *	mode	open mode requested for the new descriptor
  *	error	code selecting the action; see the switch below
  *
  * Returns 0 on success, EBADF if 'dfd' is out of range or not open,
  * EACCES if 'mode' is not a subset of the existing descriptor's mode
  * (ENODEV case only), and otherwise 'error' unchanged.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
 {
 	struct file *wfp;
 	struct file *fp;
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_LOCK(fdp);
 	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
 	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
 		FILEDESC_UNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor
 	 * (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and
 	 * store it in (indx).  (dfd) is effectively closed by
 	 * this operation.
 	 *
 	 * Any other error code is just returned.
 	 */
 	switch (error) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		FILE_LOCK(wfp);
 		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
 			FILE_UNLOCK(wfp);
 			FILEDESC_UNLOCK(fdp);
 			return (EACCES);
 		}
 		/* Remember the previous occupant of indx, if any. */
 		fp = fdp->fd_ofiles[indx];
 		fdp->fd_ofiles[indx] = wfp;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		/* Mark the slot used only if it was empty before. */
 		if (fp == NULL)
 			fdused(fdp, indx);
 		/* The new table entry needs its own reference on wfp. */
 		fhold_locked(wfp);
 		FILE_UNLOCK(wfp);
 		FILEDESC_UNLOCK(fdp);
 		if (fp != NULL) {
 			/*
 			 * We now own the reference to fp that the ofiles[]
 			 * array used to own.  Release it.
 			 */
 			FILE_LOCK(fp);
 			fdrop_locked(fp, td);
 		}
 		return (0);
 
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		fp = fdp->fd_ofiles[indx];
 		fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
 		fdp->fd_ofiles[dfd] = NULL;
 		fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
 		fdp->fd_ofileflags[dfd] = 0;
 		fdunused(fdp, dfd);
 		if (fp == NULL)
 			fdused(fdp, indx);
 		if (fp != NULL)
 			FILE_LOCK(fp);
 
 		/*
 		 * We now own the reference to fp that the ofiles[] array
 		 * used to own.  Release it.
 		 *
 		 * NOTE(review): unlike the ENODEV case, the release here
 		 * happens while the filedesc lock is still held -- confirm
 		 * that is intended.
 		 */
 		if (fp != NULL)
 			fdrop_locked(fp, td);
 
 		FILEDESC_UNLOCK(fdp);
 
 		return (0);
 
 	default:
 		FILEDESC_UNLOCK(fdp);
 		return (error);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Scan all active processes to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new
  * mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int nrele;
 
 	/* A reference count of exactly one means there is nothing to scan. */
 	if (vrefcnt(olddp) == 1)
 		return;
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(p, &allproc, p_list) {
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		/* Counts how many references to olddp this table gave up. */
 		nrele = 0;
 		FILEDESC_LOCK_FAST(fdp);
 		if (fdp->fd_cdir == olddp) {
 			vref(newdp);
 			fdp->fd_cdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_rdir == olddp) {
 			vref(newdp);
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
 		FILEDESC_UNLOCK_FAST(fdp);
 		fddrop(fdp);
 		/* Release the displaced references after unlocking the table. */
 		while (nrele--)
 			vrele(olddp);
 	}
 	sx_sunlock(&allproc_lock);
 	/* The global root vnode is switched over in the same way. */
 	if (rootvnode == olddp) {
 		vrele(rootvnode);
 		vref(newdp);
 		rootvnode = newdp;
 	}
 }
 
 /*
  * Allocate and initialise a filedesc_to_leader for 'leader'.  With a
  * non-NULL 'old' the new entry is linked into old's circular list
  * directly after it, under the filedesc lock of 'fdp'; otherwise the
  * new entry forms a one-element ring.  The allocation uses M_WAITOK.
  */
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *fresh;
 
 	MALLOC(fresh, struct filedesc_to_leader *, sizeof(*fresh),
 	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	fresh->fdl_leader = leader;
 	fresh->fdl_wakeup = 0;
 	fresh->fdl_holdcount = 0;
 	fresh->fdl_refcount = 1;
 
 	if (old == NULL) {
 		/* No existing list: point the entry at itself. */
 		fresh->fdl_prev = fresh;
 		fresh->fdl_next = fresh;
 		return (fresh);
 	}
 
 	/* Splice the entry in between 'old' and its successor. */
 	FILEDESC_LOCK(fdp);
 	fresh->fdl_next = old->fdl_next;
 	fresh->fdl_prev = old;
 	old->fdl_next = fresh;
 	fresh->fdl_next->fdl_prev = fresh;
 	FILEDESC_UNLOCK(fdp);
 	return (fresh);
 }
 
 /*
  * Get file structures.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n;
 
 	/*
 	 * Note: because the number of file descriptors is calculated
 	 * in different ways for sizing vs returning the data,
 	 * there is information leakage from the first loop.  However,
 	 * it is of a similar order of magnitude to the leakage from
 	 * global system statistics such as kern.openfiles.
 	 */
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	/*
 	 * Sizing request (no output buffer): report an estimate built
 	 * from the sum of all file reference counts plus some slack.
 	 */
 	if (req->oldptr == NULL) {
 		n = 16;		/* A slight overestimate. */
 		sx_slock(&filelist_lock);
 		LIST_FOREACH(fp, &filehead, f_list) {
 			/*
 			 * We should grab the lock, but this is an
 			 * estimate, so does it really matter?
 			 */
 			/* mtx_lock(fp->f_mtxp); */
 			n += fp->f_count;
 			/* mtx_unlock(fp->f_mtxp); */
 		}
 		sx_sunlock(&filelist_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	/*
 	 * Data request: emit one xfile record per open descriptor of
 	 * every process the requesting thread is allowed to see.
 	 */
 	error = 0;
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	LIST_FOREACH(p, &allproc, p_list) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		PROC_LOCK(p);
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		/* The per-process fields are captured under the proc lock. */
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		PROC_UNLOCK(p);
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_LOCK_FAST(fdp);
 		/* The walk stops early if fd_refcnt is no longer positive. */
 		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
 			if ((fp = fdp->fd_ofiles[n]) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
 			xf.xf_data = fp->f_data;
 			xf.xf_vnode = fp->f_vnode;
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = fp->f_msgcount;
 			xf.xf_offset = fp->f_offset;
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_UNLOCK_FAST(fdp);
 		fddrop(fdp);
 		/* A copy-out failure ends the whole scan. */
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 /* Read-only opaque node kern.file, served by sysctl_kern_file(). */
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef DDB
 /*
  * For the purposes of debugging, generate a human-readable string for the
  * file type.
  */
 static const char *
 file_type_to_name(short type)
 {
 	/* Known file types and the four-character tag printed for each. */
 	static const struct {
 		short		 ft_type;
 		const char	*ft_name;
 	} known[] = {
 		{ 0,		"zero" },
 		{ DTYPE_VNODE,	"vnod" },
 		{ DTYPE_SOCKET,	"sock" },
 		{ DTYPE_PIPE,	"pipe" },
 		{ DTYPE_FIFO,	"fifo" },
 		{ DTYPE_CRYPTO,	"crpt" },
 	};
 	unsigned int i;
 
 	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
 		if (known[i].ft_type == type)
 			return (known[i].ft_name);
 	}
 	/* Anything not in the table. */
 	return ("unkn");
 }
 
 /*
  * For the purposes of debugging, identify a process (if any, perhaps one of
  * many) that references the passed file in its file descriptor array. Return
  * NULL if none.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *table;
 	struct proc *cand;
 	int slot;
 
 	LIST_FOREACH(cand, &allproc, p_list) {
 		/* Skip PRS_NEW processes and processes with no table. */
 		if (cand->p_state == PRS_NEW || (table = cand->p_fd) == NULL)
 			continue;
 		slot = 0;
 		while (slot < table->fd_nfiles) {
 			if (table->fd_ofiles[slot] == fp)
 				return (cand);
 			slot++;
 		}
 	}
 	/* No process references this file. */
 	return (NULL);
 }
 
 /*
  * DDB "show files": print a header line, then one line per entry on the
  * global file list, including the pid and command of one process that
  * references the file (or -1 and "-" when none does).
  */
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct file *fp;
 	struct proc *owner;
 	const char *cmd;
 	int pid;
 
 	db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File",
 	    "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode",
 	    "FPID", "FCmd");
 	LIST_FOREACH(fp, &filehead, f_list) {
 		owner = file_to_first_proc(fp);
 		if (owner != NULL) {
 			pid = owner->p_pid;
 			cmd = owner->p_comm;
 		} else {
 			pid = -1;
 			cmd = "-";
 		}
 		db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 		    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 		    fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,
 		    pid, cmd);
 	}
 }
 #endif
 
 /* kern.maxfilesperproc: read-write view of maxfilesperproc. */
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 /* kern.maxfiles: read-write view of maxfiles. */
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 /* kern.openfiles: read-only view of the openfiles counter. */
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     &openfiles, 0, "System-wide number of open files");
 
 /*
  * One-time setup: create the UMA zone that struct file objects are
  * allocated from and initialise filelist_lock, sigio_lock and fdesc_mtx.
  * The argument is unused.
  */
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	sx_init(&filelist_lock, "filelist lock");
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 	mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
 }
 /* Registered under the name "select" at SI_SUB_LOCK, SI_ORDER_FIRST. */
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL)
 
 /*-------------------------------------------------------------------*/
 
 /*
  * Read/write method of badfileops: every argument is ignored and the
  * call fails with EBADF.
  */
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	const int error = EBADF;
 
 	return (error);
 }
 
 /*
  * Ioctl method of badfileops: every argument is ignored and the call
  * fails with EBADF.
  */
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 	const int error = EBADF;
 
 	return (error);
 }
 
 /*
  * Poll method of badfileops: every argument is ignored and 0 is
  * returned.
  */
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	const int revents = 0;
 
 	return (revents);
 }
 
 /*
  * Kqfilter method of badfileops: both arguments are ignored and the
  * call fails with EBADF.
  */
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 	const int error = EBADF;
 
 	return (error);
 }
 
 /*
  * Stat method of badfileops: every argument is ignored and the call
  * fails with EBADF.
  */
 static int
 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	const int error = EBADF;
 
 	return (error);
 }
 
 /*
  * Close method of badfileops: both arguments are ignored and the call
  * fails with EBADF.
  */
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 	const int error = EBADF;
 
 	return (error);
 }
 
 /*
  * Method table wired to the badfo_*() stubs above.  Every method fails
  * with EBADF except poll, which returns 0.
  */
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 };
 
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /*
  * Open routine for the fd/N devices.  Records the device's unit number
  * in td->td_dupfd and always returns ENODEV; 'mode' and 'type' are
  * unused.
  */
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of
 	 * the file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 /*
  * Character device switch for the "FD" driver.  Only the open method is
  * provided; D_NEEDGIANT is set in the flags.
  */
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	D_NEEDGIANT,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 /*
  * Create the device nodes fd/0, fd/1 and fd/2 (root:wheel, mode 0666)
  * and give them the aliases stdin, stdout and stderr respectively.
  * The argument is unused.
  */
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *in, *out, *err;
 
 	in = make_dev(&fildesc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(in, "stdin");
 
 	out = make_dev(&fildesc_cdevsw, 1, UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(out, "stdout");
 
 	err = make_dev(&fildesc_cdevsw, 2, UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(err, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL)
Index: stable/6/sys/kern/kern_exec.c
===================================================================
--- stable/6/sys/kern/kern_exec.c	(revision 169910)
+++ stable/6/sys/kern/kern_exec.c	(revision 169911)
@@ -1,1312 +1,1303 @@
 /*-
  * Copyright (c) 1993, David Greenman
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
 #include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
 #include <sys/acct.h>
 #include <sys/exec.h>
 #include <sys/imgact.h>
 #include <sys/imgact_elf.h>
 #include <sys/wait.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/namei.h>
 #include <sys/resourcevar.h>
 #include <sys/sf_buf.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/shm.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #ifdef	HWPMC_HOOKS
 #include <sys/pmckern.h>
 #endif
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
 
 /*
  * kern.ps_strings: read-only, served by sysctl_kern_ps_strings().
  * XXX This should be vm_size_t.
  */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_ps_strings, "LU", "");
 
 /*
  * kern.usrstack: read-only, served by sysctl_kern_usrstack().
  * XXX This should be vm_size_t.
  */
 SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD,
     NULL, 0, sysctl_kern_usrstack, "LU", "");
 
 /* kern.stackprot: read-only, served by sysctl_kern_stackprot(). */
 SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
     NULL, 0, sysctl_kern_stackprot, "I", "");
 
 /*
  * Byte limit on the cached argument strings plus sizeof(struct pargs);
  * do_execve() only caches the arguments when they fit.  Tunable at run
  * time through the read-write kern.ps_arg_cache_limit sysctl.
  */
 u_long ps_arg_cache_limit = PAGE_SIZE / 16;
 SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW, 
     &ps_arg_cache_limit, 0, "");
 
 static int
 sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_psstrings;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
 		   sizeof(p->p_sysent->sv_psstrings));
 	return error;
 }
 
 static int
 sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 	int error;
 
 	p = curproc;
 #ifdef SCTL_MASK32
 	if (req->flags & SCTL_MASK32) {
 		unsigned int val;
 		val = (unsigned int)p->p_sysent->sv_usrstack;
 		error = SYSCTL_OUT(req, &val, sizeof(val));
 	} else
 #endif
 		error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
 		    sizeof(p->p_sysent->sv_usrstack));
 	return error;
 }
 
 static int
 sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
 {
 	struct proc *p;
 
 	p = curproc;
 	return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
 	    sizeof(p->p_sysent->sv_stackprot)));
 }
 
 /*
  * Each of the items is a pointer to a `const struct execsw', hence the
  * double pointer here.  do_execve() walks this array until it reaches a
  * NULL entry, calling each entry's ex_imgact in turn.
  */
 static const struct execsw **execsw;
 
 #ifndef _SYS_SYSPROTO_H_
 struct execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 };
 #endif
 
 /*
  * execve() system call: copy the path, argument and environment strings
  * in from user space, hand them to kern_execve() with no MAC label, and
  * release the argument buffer whether or not the exec succeeded.
  *
  * MPSAFE
  */
 int
 execve(struct thread *td, struct execve_args *uap)
 {
 	struct image_args args;
 	int error;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error != 0)
 		goto out;
 
 	error = kern_execve(td, &args, NULL);
 out:
 	exec_free_args(&args);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __mac_execve_args {
 	char	*fname;
 	char	**argv;
 	char	**envv;
 	struct mac	*mac_p;
 };
 #endif
 
 /*
  * __mac_execve() system call: as execve(), but passes the caller's
  * mac_p through to kern_execve().  Returns ENOSYS when the kernel is
  * built without MAC.
  *
  * MPSAFE
  */
 int
 __mac_execve(struct thread *td, struct __mac_execve_args *uap)
 {
 #ifndef MAC
 	return (ENOSYS);
 #else
 	struct image_args args;
 	int error;
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
 	if (error != 0)
 		goto out;
 
 	error = kern_execve(td, &args, uap->mac_p);
 out:
 	exec_free_args(&args);
 	return (error);
 #endif
 }
 
 /*
  * XXX: kern_execve has the astonishing property of not always
  * returning to the caller.  If sufficiently bad things happen during
  * the call to do_execve(), it can end up calling exit1(); as a result,
  * callers must avoid doing anything which they might need to undo
  * (e.g., allocating memory).
  */
 int
 kern_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	int error;
 
 	/* Record the argument and environment strings for audit. */
 	AUDIT_ARG(argv, args->begin_argv, args->argc,
 	    args->begin_envv - args->begin_argv);
 	AUDIT_ARG(envv, args->begin_envv, args->envc,
 	    args->endp - args->begin_envv);
 	/*
 	 * For a process flagged P_HADTHREADS, request SINGLE_BOUNDARY
 	 * single-threading before the exec; a non-zero result from
 	 * thread_single() is turned into ERESTART.
 	 */
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		if (thread_single(SINGLE_BOUNDARY)) {
 			PROC_UNLOCK(p);
 			return (ERESTART);	/* Try again later. */
 		}
 		PROC_UNLOCK(p);
 	}
 
 	error = do_execve(td, args, mac_p);
 
 	/* P_HADTHREADS is tested again here, after do_execve() has run. */
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		/*
 		 * If success, we upgrade to SINGLE_EXIT state to
 		 * force other threads to suicide.
 		 */
 		if (error == 0)
 			thread_single(SINGLE_EXIT);
 		else
 			thread_single_end();
 		PROC_UNLOCK(p);
 	}
 
 	return (error);
 }
 
 /*
  * In-kernel implementation of execve().  All arguments are assumed to be
  * userspace pointers from the passed thread.
  *
  * MPSAFE
  */
 static int
 do_execve(td, args, mac_p)
 	struct thread *td;
 	struct image_args *args;
 	struct mac *mac_p;
 {
 	struct proc *p = td->td_proc;
 	struct nameidata nd, *ndp;
 	struct ucred *newcred = NULL, *oldcred;
 	struct uidinfo *euip;
 	register_t *stack_base;
 	int error, len, i;
 	struct image_params image_params, *imgp;
 	struct vattr attr;
 	int (*img_first)(struct image_params *);
 	struct pargs *oldargs = NULL, *newargs = NULL;
 	struct sigacts *oldsigacts, *newsigacts;
 #ifdef KTRACE
 	struct vnode *tracevp = NULL;
 	struct ucred *tracecred = NULL;
 #endif
 	struct vnode *textvp = NULL;
 	int credential_changing;
 	int vfslocked;
 	int textset;
 #ifdef MAC
 	struct label *interplabel = NULL;
 	int will_transition;
 #endif
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
 
 	vfslocked = 0;
 	imgp = &image_params;
 
 	/*
 	 * Lock the process and set the P_INEXEC flag to indicate that
 	 * it should be left alone until we're done here.  This is
 	 * necessary to avoid race conditions - e.g. in ptrace() -
 	 * that might allow a local user to illicitly obtain elevated
 	 * privileges.
 	 */
 	PROC_LOCK(p);
 	KASSERT((p->p_flag & P_INEXEC) == 0,
 	    ("%s(): process already has P_INEXEC flag", __func__));
 	p->p_flag |= P_INEXEC;
 	PROC_UNLOCK(p);
 
 	/*
 	 * Initialize part of the common data
 	 */
 	imgp->proc = p;
 	imgp->execlabel = NULL;
 	imgp->attr = &attr;
 	imgp->entry_addr = 0;
 	imgp->vmspace_destroyed = 0;
 	imgp->interpreted = 0;
 	imgp->interpreter_name = args->buf + PATH_MAX + ARG_MAX;
 	imgp->auxargs = NULL;
 	imgp->vp = NULL;
 	imgp->object = NULL;
 	imgp->firstpage = NULL;
 	imgp->ps_strings = 0;
 	imgp->auxarg_size = 0;
 	imgp->args = args;
 
 #ifdef MAC
 	error = mac_execve_enter(imgp, mac_p);
 	if (error)
 		goto exec_fail;
 #endif
 
 	imgp->image_header = NULL;
 
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
 	 *
 	 * XXXAUDIT: It would be desirable to also audit the name of the
 	 * interpreter if this is an interpreted binary.
 	 */
 	ndp = &nd;
 	NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE |
 	    AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 
 interpret:
 	error = namei(ndp);
 	if (error)
 		goto exec_fail;
 
 	vfslocked = NDHASGIANT(ndp);
 	imgp->vp = ndp->ni_vp;
 
 	/*
 	 * Check file permissions (also 'opens' file)
 	 */
 	error = exec_check_permissions(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	imgp->object = imgp->vp->v_object;
 	if (imgp->object != NULL)
 		vm_object_reference(imgp->object);
 
 	/*
 	 * Set VV_TEXT now so no one can write to the executable while we're
 	 * activating it.
 	 *
 	 * Remember if this was set before and unset it in case this is not
 	 * actually an executable image.
 	 */
 	textset = imgp->vp->v_vflag & VV_TEXT;
 	imgp->vp->v_vflag |= VV_TEXT;
 
 	error = exec_map_first_page(imgp);
 	if (error)
 		goto exec_fail_dealloc;
 
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
 	 *	scripts differently.
 	 */
 	error = -1;
 	if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
 		error = img_first(imgp);
 
 	/*
 	 *	Loop through the list of image activators, calling each one.
 	 *	An activator returns -1 if there is no match, 0 on success,
 	 *	and an error otherwise.
 	 */
 	for (i = 0; error == -1 && execsw[i]; ++i) {
 		if (execsw[i]->ex_imgact == NULL ||
 		    execsw[i]->ex_imgact == img_first) {
 			continue;
 		}
 		error = (*execsw[i]->ex_imgact)(imgp);
 	}
 
 	if (error) {
 		if (error == -1) {
 			if (textset == 0)
 				imgp->vp->v_vflag &= ~VV_TEXT;
 			error = ENOEXEC;
 		}
 		goto exec_fail_dealloc;
 	}
 
 	/*
 	 * Special interpreter operation, cleanup and loop up to try to
 	 * activate the interpreter.
 	 */
 	if (imgp->interpreted) {
 		exec_unmap_first_page(imgp);
 		/*
 		 * VV_TEXT needs to be unset for scripts.  There is a short
 		 * period before we determine that something is a script where
 		 * VV_TEXT will be set. The vnode lock is held over this
 		 * entire period so nothing should illegitimately be blocked.
 		 */
 		imgp->vp->v_vflag &= ~VV_TEXT;
 		/* free name buffer and old vnode */
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 #ifdef MAC
 		interplabel = mac_vnode_label_alloc();
 		mac_copy_vnode_label(ndp->ni_vp->v_label, interplabel);
 #endif
 		vput(ndp->ni_vp);
 		vm_object_deallocate(imgp->object);
 		imgp->object = NULL;
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = 0;
 		/* set new name to that of the interpreter */
 		NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
 		    UIO_SYSSPACE, imgp->interpreter_name, td);
 		goto interpret;
 	}
 
 	/*
 	 * Copy out strings (args and env) and initialize stack base
 	 */
 	if (p->p_sysent->sv_copyout_strings)
 		stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
 	else
 		stack_base = exec_copyout_strings(imgp);
 
 	/*
 	 * If custom stack fixup routine present for this process
 	 * let it do the stack setup.
 	 * Else stuff argument count as first item on stack
 	 */
 	if (p->p_sysent->sv_fixup != NULL)
 		(*p->p_sysent->sv_fixup)(&stack_base, imgp);
 	else
 		suword(--stack_base, imgp->args->argc);
 
 	/*
 	 * For security and other reasons, the file descriptor table cannot
 	 * be shared after an exec.
 	 */
 	fdunshare(p, td);
 
 	/*
 	 * Malloc things before we need locks.
 	 */
 	newcred = crget();
 	euip = uifind(attr.va_uid);
 	i = imgp->args->begin_envv - imgp->args->begin_argv;
 	/* Cache arguments if they fit inside our allowance */
 	if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
 		newargs = pargs_alloc(i);
 		bcopy(imgp->args->begin_argv, newargs->ar_args, i);
 	}
 
 	/* close files on exec */
 	VOP_UNLOCK(imgp->vp, 0, td);
 	fdcloseexec(td);
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	/* Get a reference to the vnode prior to locking the proc */
 	VREF(ndp->ni_vp);
 
 	/*
 	 * For security and other reasons, signal handlers cannot
 	 * be shared after an exec. The new process gets a copy of the old
 	 * handlers. In execsigs(), the new process will have its signals
 	 * reset.
 	 */
 	PROC_LOCK(p);
 	if (sigacts_shared(p->p_sigacts)) {
 		oldsigacts = p->p_sigacts;
 		PROC_UNLOCK(p);
 		newsigacts = sigacts_alloc();
 		sigacts_copy(newsigacts, oldsigacts);
 		PROC_LOCK(p);
 		p->p_sigacts = newsigacts;
 	} else
 		oldsigacts = NULL;
 
 	/* Stop profiling */
 	stopprofclock(p);
 
 	/* reset caught signals */
 	execsigs(p);
 
 	/* name this process - nameiexec(p, ndp) */
 	len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
 	bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
 	p->p_comm[len] = 0;
 
 	/*
 	 * mark as execed, wakeup the process that vforked (if any) and tell
 	 * it that it now has its own resources back
 	 */
 	p->p_flag |= P_EXEC;
 	if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
 		p->p_flag &= ~P_PPWAIT;
 		wakeup(p->p_pptr);
 	}
 
 	/*
 	 * Implement image setuid/setgid.
 	 *
 	 * Don't honor setuid/setgid if the filesystem prohibits it or if
 	 * the process is being traced.
 	 *
 	 * XXXMAC: For the time being, use NOSUID to also prohibit
 	 * transitions on the file system.
 	 */
 	oldcred = p->p_ucred;
 	credential_changing = 0;
 	credential_changing |= (attr.va_mode & VSUID) && oldcred->cr_uid !=
 	    attr.va_uid;
 	credential_changing |= (attr.va_mode & VSGID) && oldcred->cr_gid !=
 	    attr.va_gid;
 #ifdef MAC
 	will_transition = mac_execve_will_transition(oldcred, imgp->vp,
 	    interplabel, imgp);
 	credential_changing |= will_transition;
 #endif
 
 	if (credential_changing &&
 	    (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
 	    (p->p_flag & P_TRACED) == 0) {
 		/*
 		 * Turn off syscall tracing for set-id programs, except for
 		 * root.  Record any set-id flags first to make sure that
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
 #ifdef KTRACE
 		if (p->p_tracevp != NULL && suser_cred(oldcred, SUSER_ALLOWJAIL)) {
 			mtx_lock(&ktrace_mtx);
 			p->p_traceflag = 0;
 			tracevp = p->p_tracevp;
 			p->p_tracevp = NULL;
 			tracecred = p->p_tracecred;
 			p->p_tracecred = NULL;
 			mtx_unlock(&ktrace_mtx);
 		}
 #endif
 		/*
 		 * Close any file descriptors 0..2 that reference procfs,
 		 * then make sure file descriptors 0..2 are in use.
 		 *
 		 * setugidsafety() may call closef() and then pfind()
 		 * which may grab the process lock.
 		 * fdcheckstd() may call falloc() which may block to
 		 * allocate memory, so temporarily drop the process lock.
 		 */
 		PROC_UNLOCK(p);
 		setugidsafety(td);
 		VOP_UNLOCK(imgp->vp, 0, td);
 		error = fdcheckstd(td);
 		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 		if (error != 0)
 			goto done1;
 		PROC_LOCK(p);
 		/*
 		 * Set the new credentials.
 		 */
 		crcopy(newcred, oldcred);
 		if (attr.va_mode & VSUID)
 			change_euid(newcred, euip);
 		if (attr.va_mode & VSGID)
 			change_egid(newcred, attr.va_gid);
 #ifdef MAC
 		if (will_transition) {
 			mac_execve_transition(oldcred, newcred, imgp->vp,
 			    interplabel, imgp);
 		}
 #endif
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXXMAC: Note that the current logic will save the
 		 * uid and gid if a MAC domain transition occurs, even
 		 * though maybe it shouldn't.
 		 */
 		change_svuid(newcred, newcred->cr_uid);
 		change_svgid(newcred, newcred->cr_gid);
 		p->p_ucred = newcred;
 		newcred = NULL;
 	} else {
 		if (oldcred->cr_uid == oldcred->cr_ruid &&
 		    oldcred->cr_gid == oldcred->cr_rgid)
 			p->p_flag &= ~P_SUGID;
 		/*
 		 * Implement correct POSIX saved-id behavior.
 		 *
 		 * XXX: It's not clear that the existing behavior is
 		 * POSIX-compliant.  A number of sources indicate that the
 		 * saved uid/gid should only be updated if the new ruid is
 		 * not equal to the old ruid, or the new euid is not equal
 		 * to the old euid and the new euid is not equal to the old
 		 * ruid.  The FreeBSD code always updates the saved uid/gid.
 		 * Also, this code uses the new (replaced) euid and egid as
 		 * the source, which may or may not be the right ones to use.
 		 */
 		if (oldcred->cr_svuid != oldcred->cr_uid ||
 		    oldcred->cr_svgid != oldcred->cr_gid) {
 			crcopy(newcred, oldcred);
 			change_svuid(newcred, newcred->cr_uid);
 			change_svgid(newcred, newcred->cr_gid);
 			p->p_ucred = newcred;
 			newcred = NULL;
 		}
 	}
 
 	/*
 	 * Store the vp for use in procfs.  This vnode was referenced prior
 	 * to locking the proc lock.
 	 */
 	textvp = p->p_textvp;
 	p->p_textvp = ndp->ni_vp;
 
 	/*
 	 * Notify others that we exec'd, and clear the P_INEXEC flag
 	 * as we're now a bona fide freshly-execed process.
 	 */
 	KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
 	p->p_flag &= ~P_INEXEC;
 
 	/*
 	 * If tracing the process, trap to debugger so breakpoints
 	 * can be set before the program executes.
 	 * Use tdsignal to deliver signal to current thread, use
 	 * psignal may cause the signal to be delivered to wrong thread
 	 * because that thread will exit, remember we are going to enter
 	 * single thread mode.
 	 */
 	if (p->p_flag & P_TRACED)
 		tdsignal(td, SIGTRAP, SIGTARGET_TD);
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
 
 	/*
 	 * Free any previous argument cache and replace it with
 	 * the new argument cache, if any.
 	 */
 	oldargs = p->p_args;
 	p->p_args = newargs;
 	newargs = NULL;
 
 #ifdef	HWPMC_HOOKS
 	/*
 	 * Check if system-wide sampling is in effect or if the
 	 * current process is using PMCs.  If so, do exec() time
 	 * processing.  This processing needs to happen AFTER the
 	 * P_INEXEC flag is cleared.
 	 *
 	 * The proc lock needs to be released before taking the PMC
 	 * SX.
 	 */
 	if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
 		PROC_UNLOCK(p);
 		pe.pm_credentialschanged = credential_changing;
 		pe.pm_entryaddr = imgp->entry_addr;
 
 		PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
 	} else
 		PROC_UNLOCK(p);
 #else  /* !HWPMC_HOOKS */
 	PROC_UNLOCK(p);
 #endif
 
 	/* Set values passed into the program in registers. */
 	if (p->p_sysent->sv_setregs)
 		(*p->p_sysent->sv_setregs)(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 	else
 		exec_setregs(td, imgp->entry_addr,
 		    (u_long)(uintptr_t)stack_base, imgp->ps_strings);
 
 	vfs_mark_atime(imgp->vp, td);
 
 done1:
 	/*
 	 * Free any resources malloc'd earlier that we didn't use.
 	 */
 	uifree(euip);
 	if (newcred == NULL)
 		crfree(oldcred);
 	else
 		crfree(newcred);
 	VOP_UNLOCK(imgp->vp, 0, td);
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
 	if (textvp != NULL) {
 		int tvfslocked;
 
 		tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
 		vrele(textvp);
 		VFS_UNLOCK_GIANT(tvfslocked);
 	}
 	if (ndp->ni_vp && error != 0)
 		vrele(ndp->ni_vp);
 #ifdef KTRACE
 	if (tracevp != NULL)
 		vrele(tracevp);
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (oldargs != NULL)
 		pargs_drop(oldargs);
 	if (newargs != NULL)
 		pargs_drop(newargs);
 	if (oldsigacts != NULL)
 		sigacts_free(oldsigacts);
 
 exec_fail_dealloc:
 
 	/*
 	 * free various allocated resources
 	 */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	if (imgp->vp != NULL) {
 		NDFREE(ndp, NDF_ONLY_PNBUF);
 		vput(imgp->vp);
 	}
 
 	if (imgp->object != NULL)
 		vm_object_deallocate(imgp->object);
 
 	if (error == 0) {
 		/*
 		 * Stop the process here if its stop event mask has
 		 * the S_EXEC bit set.
 		 */
 		STOPEVENT(p, S_EXEC, 0);
 		goto done2;
 	}
 
 exec_fail:
 	/* we're done here, clear P_INEXEC */
 	PROC_LOCK(p);
 	p->p_flag &= ~P_INEXEC;
 	PROC_UNLOCK(p);
 
 	if (imgp->vmspace_destroyed) {
 		/* sorry, no more process anymore. exit gracefully */
 #ifdef MAC
 		mac_execve_exit(imgp);
 		if (interplabel != NULL)
 			mac_vnode_label_free(interplabel);
 #endif
 		VFS_UNLOCK_GIANT(vfslocked);
 		exec_free_args(args);
 		exit1(td, W_EXITCODE(0, SIGABRT));
 		/* NOT REACHED */
 		error = 0;
 	}
 done2:
 #ifdef MAC
 	mac_execve_exit(imgp);
 	if (interplabel != NULL)
 		mac_vnode_label_free(interplabel);
 #endif
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Map page 0 of the image file's VM object into kernel address space.
  *
  * Any first page already mapped for this image is released first.  If
  * page 0 is not fully valid it is read in through the pager, together
  * with up to VM_INITIAL_PAGEIN - 1 of the following pages.  On success
  * the page is held, mapped through an sf_buf, and both imgp->firstpage
  * and imgp->image_header are set.
  *
  * Returns 0 on success, EACCES if the vnode has no VM object, or EIO if
  * the first page could not be brought in.
  */
 int
 exec_map_first_page(imgp)
 	struct image_params *imgp;
 {
 	int rv, i;
 	int initial_pagein;
 	vm_page_t ma[VM_INITIAL_PAGEIN];
 	vm_object_t object;
 
 	/* Drop a mapping left over from an earlier call on this image. */
 	if (imgp->firstpage != NULL)
 		exec_unmap_first_page(imgp);
 
 	object = imgp->vp->v_object;
 	if (object == NULL)
 		return (EACCES);
 	VM_OBJECT_LOCK(object);
 	ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
 	if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
 		/* Never ask for more pages than the object contains. */
 		initial_pagein = VM_INITIAL_PAGEIN;
 		if (initial_pagein > object->size)
 			initial_pagein = object->size;
 		/*
 		 * Collect a contiguous run of pages following page 0.  The
 		 * run ends at the first page that already holds valid data,
 		 * is busy, or cannot be allocated.
 		 */
 		for (i = 1; i < initial_pagein; i++) {
 			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
 				if (ma[i]->valid)
 					break;
 				vm_page_lock_queues();
 				if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) {
 					vm_page_unlock_queues();
 					break;
 				}
 				vm_page_busy(ma[i]);
 				vm_page_unlock_queues();
 			} else {
 				ma[i] = vm_page_alloc(object, i,
 				    VM_ALLOC_NORMAL);
 				if (ma[i] == NULL)
 					break;
 			}
 		}
 		/* i is now the number of pages collected, page 0 included. */
 		initial_pagein = i;
 		rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
 		/* Look page 0 up again rather than trusting the old pointer. */
 		ma[0] = vm_page_lookup(object, 0);
 		if ((rv != VM_PAGER_OK) || (ma[0] == NULL) ||
 		    (ma[0]->valid == 0)) {
 			if (ma[0]) {
 				vm_page_lock_queues();
 				pmap_remove_all(ma[0]);
 				vm_page_free(ma[0]);
 				vm_page_unlock_queues();
 			}
 			VM_OBJECT_UNLOCK(object);
 			return (EIO);
 		}
 	}
 	/* Hold the page before waking it so it stays put while mapped. */
 	vm_page_lock_queues();
 	vm_page_hold(ma[0]);
 	vm_page_wakeup(ma[0]);
 	vm_page_unlock_queues();
 	VM_OBJECT_UNLOCK(object);
 
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
 	imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
 
 	return (0);
 }
 
 /*
  * Release the first-page mapping of an image, if one exists: free the
  * sf_buf, clear imgp->firstpage and drop the hold on the backing page.
  */
 void
 exec_unmap_first_page(imgp)
 	struct image_params *imgp;
 {
 	vm_page_t page;
 
 	/* Nothing is mapped, so there is nothing to undo. */
 	if (imgp->firstpage == NULL)
 		return;
 
 	/* Fetch the backing page before the sf_buf goes away. */
 	page = sf_buf_page(imgp->firstpage);
 	sf_buf_free(imgp->firstpage);
 	imgp->firstpage = NULL;
 
 	vm_page_lock_queues();
 	vm_page_unhold(page);
 	vm_page_unlock_queues();
 }
 
 /*
  * Destroy old address space, and allocate a new stack
  *	The new stack is only SGROWSIZ large because it is grown
  *	automatically in trap.c.
  *
  *	imgp identifies the process being exec'd (imgp->proc); sv supplies
  *	the address-space bounds, stack top and stack protection of the
  *	new image's ABI.
  *
  *	Returns 0 on success or the error from vm_map_stack().
  *	imgp->vmspace_destroyed is set before anything is torn down, so
  *	it is set even when an error is returned.
  */
 int
 exec_new_vmspace(imgp, sv)
 	struct image_params *imgp;
 	struct sysentvec *sv;
 {
 	int error;
 	struct proc *p = imgp->proc;
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_offset_t stack_addr;
 	vm_map_t map;
 
 	/* From here on the old image can no longer be returned to. */
 	imgp->vmspace_destroyed = 1;
 
 	/* Called with Giant held, do not depend on it! */
 	EVENTHANDLER_INVOKE(process_exec, p);
 
 	/*
-	 * Here is as good a place as any to do any resource limit cleanups.
-	 * This is needed if a 64 bit binary exec's a 32 bit binary - the
-	 * data size limit may need to be changed to a value that makes
-	 * sense for the 32 bit binary.
-	 */
-	if (sv->sv_fixlimits != NULL)
-		sv->sv_fixlimits(imgp);
-
-	/*
 	 * Blow away entire process VM, if address space not shared,
 	 * otherwise, create a new VM space so that other threads are
 	 * not disrupted
 	 */
 	map = &vmspace->vm_map;
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		/* Sole user and same bounds: empty the map and reuse it. */
 		shmexit(vmspace);
 		pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
 		    vm_map_max(map));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 	} else {
 		/* Shared or differently sized: switch to a fresh vmspace. */
 		vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Allocate a new stack */
 	stack_addr = sv->sv_usrstack - maxssiz;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error)
 		return (error);
 
 #ifdef __ia64__
 	/* Allocate a new register stack */
 	stack_addr = IA64_BACKINGSTORE;
 	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 	if (error)
 		return (error);
 #endif
 
 	/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
 	 * VM_STACK case, but they are still used to monitor the size of the
 	 * process stack so we can check the stack rlimit.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
 	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
 
 	return (0);
 }
 
 /*
  * Copy out argument and environment strings from the old process
  *	address space into the temporary string buffer.
  *
  *	args	is zeroed and then filled in (buf, fname, begin_argv,
  *		begin_envv, endp, stringspace, argc, envc).
  *	fname	path of the image; read from kernel or user space
  *		according to segflg.
  *	argv	user-space argument vector; NULL yields EFAULT.
  *	envv	user-space environment vector; may be NULL.
  *
  *	Returns 0 on success, EFAULT for a bad vector pointer, E2BIG when
  *	the strings do not fit in ARG_MAX bytes, ENOMEM if no buffer could
  *	be obtained, or the error from copystr()/copyinstr().
  *
  *	NOTE(review): on the error returns taken after the buffer has been
  *	allocated, args->buf is not released here; presumably the caller
  *	is expected to call exec_free_args() -- confirm against callers.
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
     enum uio_seg segflg, char **argv, char **envv)
 {
 	char *argp, *envp;
 	int error;
 	size_t length;
 
 	error = 0;
 
 	bzero(args, sizeof(*args));
 	if (argv == NULL)
 		return (EFAULT);
 	/*
 	 * Allocate temporary demand zeroed space for argument and
 	 *	environment strings:
 	 *
 	 * o ARG_MAX for argument and environment;
 	 * o PATH_MAX for the file name, stored right after the strings;
 	 * o MAXSHELLCMDLEN for the name of interpreters.
 	 */
 	args->buf = (char *) kmem_alloc_wait(exec_map,
 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 	if (args->buf == NULL)
 		return (ENOMEM);
 	args->begin_argv = args->buf;
 	args->endp = args->begin_argv;
 	args->stringspace = ARG_MAX;
 
 	args->fname = args->buf + ARG_MAX;
 
 	/*
 	 * Copy the file name.
 	 */
 	error = (segflg == UIO_SYSSPACE) ?
 	    copystr(fname, args->fname, PATH_MAX, &length) :
 	    copyinstr(fname, args->fname, PATH_MAX, &length);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * extract arguments first
 	 *
 	 * Each vector slot is fetched with fuword(); a zero word ends the
 	 * vector and -1 is treated as a fault while fetching the slot.
 	 */
 	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
 		if (argp == (caddr_t) -1)
 			return (EFAULT);
 		if ((error = copyinstr(argp, args->endp,
 		    args->stringspace, &length))) {
 			/* String does not fit in the remaining space. */
 			if (error == ENAMETOOLONG)
 				return (E2BIG);
 			return (error);
 		}
 		args->stringspace -= length;
 		args->endp += length;
 		args->argc++;
 	}
 
 	/* Environment strings are packed directly after the arguments. */
 	args->begin_envv = args->endp;
 
 	/*
 	 * extract environment strings
 	 */
 	if (envv) {
 		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
 			if (envp == (caddr_t)-1)
 				return (EFAULT);
 			if ((error = copyinstr(envp, args->endp,
 			    args->stringspace, &length))) {
 				if (error == ENAMETOOLONG)
 					return (E2BIG);
 				return (error);
 			}
 			args->stringspace -= length;
 			args->endp += length;
 			args->envc++;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Return the string buffer of an image_args to exec_map and clear
  * args->buf, so that calling this again on the same args is harmless.
  */
 void
 exec_free_args(struct image_args *args)
 {
 
 	/* No buffer attached: nothing to give back. */
 	if (args->buf == NULL)
 		return;
 
 	kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
 	    PATH_MAX + ARG_MAX + MAXSHELLCMDLEN);
 	args->buf = NULL;
 }
 
 /*
  * Copy strings out to the new process address space, constructing
  *	new arg and env vector tables. Return a pointer to the base
  *	so that it can be used as the initial stack pointer.
  *
  *	Layout written below the ps_strings structure, from high to low
  *	addresses: signal trampoline code (if the ABI has one),
  *	SPARE_USRSPACE, the packed argument/environment strings, and the
  *	pointer vectors (with room for the ELF auxargs when present).
  *
  *	NOTE(review): the results of copyout() and suword() are not
  *	checked here, so a fault while writing to the new stack goes
  *	unreported by this function.
  */
 register_t *
 exec_copyout_strings(imgp)
 	struct image_params *imgp;
 {
 	int argc, envc;
 	char **vectp;
 	char *stringp, *destp;
 	register_t *stack_base;
 	struct ps_strings *arginfo;
 	struct proc *p;
 	int szsigcode;
 
 	/*
 	 * Calculate string base and vector table pointers.
 	 * Also deal with signal trampoline code for this exec type.
 	 */
 	p = imgp->proc;
 	szsigcode = 0;
 	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
 	if (p->p_sysent->sv_szsigcode != NULL)
 		szsigcode = *(p->p_sysent->sv_szsigcode);
 	/* ARG_MAX - stringspace is the number of string bytes in use. */
 	destp =	(caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
 	    roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
 
 	/*
 	 * install sigcode
 	 */
 	if (szsigcode)
 		copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
 		    szsigcode), szsigcode);
 
 	/*
 	 * If we have a valid auxargs ptr, prepare some room
 	 * on the stack.
 	 */
 	if (imgp->auxargs) {
 		/*
 		 * 'AT_COUNT*2' is size for the ELF Auxargs data. This is for
 		 * lower compatibility.
 		 */
 		imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
 		    (AT_COUNT * 2);
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets, and imgp->auxarg_size is room
 		 * for the arguments of the runtime loader.
 		 */
 		vectp = (char **)(destp - (imgp->args->argc +
 		    imgp->args->envc + 2 + imgp->auxarg_size) *
 		    sizeof(char *));
 
 	} else {
 		/*
 		 * The '+ 2' is for the null pointers at the end of each of
 		 * the arg and env vector sets
 		 */
 		vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
 		    sizeof(char *));
 	}
 
 	/*
 	 * vectp also becomes our initial stack base
 	 */
 	stack_base = (register_t *)vectp;
 
 	stringp = imgp->args->begin_argv;
 	argc = imgp->args->argc;
 	envc = imgp->args->envc;
 
 	/*
 	 * Copy out strings - arguments and environment.
 	 */
 	copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
 
 	/*
 	 * Fill in "ps_strings" struct for ps, w, etc.
 	 */
 	suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nargvstr, argc);
 
 	/*
 	 * Fill in argument portion of vector table.
 	 *
 	 * stringp walks the kernel copy of the strings while destp tracks
 	 * the matching user address; both advance past each terminating
 	 * NUL so they stay in step.
 	 */
 	for (; argc > 0; --argc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* a null vector table pointer separates the argp's from the envp's */
 	suword(vectp++, 0);
 
 	suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
 	suword(&arginfo->ps_nenvstr, envc);
 
 	/*
 	 * Fill in environment portion of vector table.
 	 */
 	for (; envc > 0; --envc) {
 		suword(vectp++, (long)(intptr_t)destp);
 		while (*stringp++ != 0)
 			destp++;
 		destp++;
 	}
 
 	/* end of vector table is a null pointer */
 	suword(vectp, 0);
 
 	return (stack_base);
 }
 
 /*
  * Decide whether the current thread may execute the file in imgp->vp.
  *	Called with imgp->vp locked.
  *	The file attributes are fetched into *imgp->attr as a side effect.
  *	Returns 0 when execution is allowed, otherwise an errno value.
  */
 int
 exec_check_permissions(imgp)
 	struct image_params *imgp;
 {
 	struct thread *td = curthread;		/* XXXKSE */
 	struct vnode *vp = imgp->vp;
 	struct vattr *attr = imgp->attr;
 	int error;
 
 	/* Every later test relies on the file attributes. */
 	error = VOP_GETATTR(vp, attr, td->td_ucred, td);
 	if (error != 0)
 		return (error);
 
 #ifdef MAC
 	error = mac_check_vnode_exec(td->td_ucred, vp, imgp);
 	if (error != 0)
 		return (error);
 #endif
 
 	/* Execution may be disabled for the whole filesystem. */
 	if (vp->v_mount->mnt_flag & MNT_NOEXEC)
 		return (EACCES);
 
 	/*
 	 * Require at least one execute bit.  Without this test root would
 	 * always succeed, and that must not happen unless the file really
 	 * is executable.
 	 */
 	if ((attr->va_mode & 0111) == 0)
 		return (EACCES);
 
 	/* Only regular files can be executed. */
 	if (attr->va_type != VREG)
 		return (EACCES);
 
 	/* Zero length files can't be exec'd. */
 	if (attr->va_size == 0)
 		return (ENOEXEC);
 
 	/* Execute permission for the current credentials. */
 	error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
 	if (error != 0)
 		return (error);
 
 	/* Refuse to run a file that somebody has open for writing. */
 	if (vp->v_writecount != 0)
 		return (ETXTBSY);
 
 	/*
 	 * Finally give the filesystem its open hook (which does nothing in
 	 * the general case) and report its verdict.
 	 */
 	return (VOP_OPEN(vp, FREAD, td->td_ucred, td, -1));
 }
 
 /*
  * Exec handler registration
  *
  * Append execsw_arg to the NULL-terminated execsw table.  A new table
  * with one more slot is allocated, the existing entries are copied
  * into it, and the old table (if any) is freed.
  *
  * Always returns 0: the allocation uses M_WAITOK, which sleeps until
  * memory is available and never yields NULL.
  */
 int
 exec_register(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **es, **xs, **newexecsw;
 	int count = 2;	/* New slot and trailing NULL */
 
 	if (execsw)
 		for (es = execsw; *es; es++)
 			count++;
 	/* M_WAITOK cannot fail, so no NULL check is needed. */
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 	xs = newexecsw;
 	if (execsw)
 		for (es = execsw; *es; es++)
 			*xs++ = *es;
 	*xs++ = execsw_arg;
 	*xs = NULL;
 	if (execsw)
 		free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
 
 /*
  * Remove execsw_arg from the NULL-terminated execsw table.
  *
  * A new table without the entry is allocated, the remaining entries
  * are copied into it, and the old table is freed.
  *
  * Returns 0 on success or ENOENT if execsw_arg is not registered.
  * Panics if no table exists at all.  The allocation uses M_WAITOK,
  * which never yields NULL, so there is no ENOMEM path.
  */
 int
 exec_unregister(execsw_arg)
 	const struct execsw *execsw_arg;
 {
 	const struct execsw **es, **xs, **newexecsw;
 	int count = 1;	/* Trailing NULL */
 
 	if (execsw == NULL)
 		panic("unregister with no handlers left?\n");
 
 	/* Make sure the handler is actually registered. */
 	for (es = execsw; *es; es++) {
 		if (*es == execsw_arg)
 			break;
 	}
 	if (*es == NULL)
 		return (ENOENT);
 
 	/* Count the entries that survive. */
 	for (es = execsw; *es; es++)
 		if (*es != execsw_arg)
 			count++;
 	/* M_WAITOK cannot fail, so no NULL check is needed. */
 	newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
 	xs = newexecsw;
 	for (es = execsw; *es; es++)
 		if (*es != execsw_arg)
 			*xs++ = *es;
 	*xs = NULL;
 	/* execsw is known to be non-NULL here (checked above). */
 	free(execsw, M_TEMP);
 	execsw = newexecsw;
 	return (0);
 }
Index: stable/6/sys/kern/kern_resource.c
===================================================================
--- stable/6/sys/kern/kern_resource.c	(revision 169910)
+++ stable/6/sys/kern/kern_resource.c	(revision 169911)
@@ -1,1186 +1,1184 @@
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_resource.c	8.5 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_compat.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/file.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/time.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 
 
 static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
 static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
 #define	UIHASH(uid)	(&uihashtbl[(uid) & uihash])
 static struct mtx uihashtbl_mtx;
 static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
 static u_long uihash;		/* size of hash table - 1 */
 
 static void	calcru1(struct proc *p, struct rusage_ext *ruxp,
 		    struct timeval *up, struct timeval *sp);
 static int	donice(struct thread *td, struct proc *chgp, int n);
 static struct uidinfo *uilookup(uid_t uid);
 
 /*
  * Resource controls and accounting.
  */
 
 #ifndef _SYS_SYSPROTO_H_
 struct getpriority_args {
 	int	which;
 	int	who;
 };
 #endif
 /*
  * Report the lowest nice value among the processes selected by
  * uap->which (PRIO_PROCESS, PRIO_PGRP or PRIO_USER) and uap->who that
  * the caller is allowed to see.  A who of 0 selects the caller's own
  * process, process group or effective uid respectively.
  *
  * The result is stored in td->td_retval[0].  Returns 0 on success,
  * EINVAL for an unknown "which", or ESRCH when no visible process
  * matched.
  *
  * MPSAFE
  */
 int
 getpriority(td, uap)
 	struct thread *td;
 	register struct getpriority_args *uap;
 {
 	struct proc *p;
 	struct pgrp *pg;
 	int error, low;
 
 	error = 0;
 	/* Sentinel above any legal nice value; still set means "not found". */
 	low = PRIO_MAX + 1;
 	switch (uap->which) {
 
 	case PRIO_PROCESS:
 		if (uap->who == 0)
 			low = td->td_proc->p_nice;
 		else {
 			p = pfind(uap->who);
 			if (p == NULL)
 				break;
 			if (p_cansee(td, p) == 0)
 				low = p->p_nice;
 			PROC_UNLOCK(p);
 		}
 		break;
 
 	case PRIO_PGRP:
 		sx_slock(&proctree_lock);
 		if (uap->who == 0) {
 			pg = td->td_proc->p_pgrp;
 			PGRP_LOCK(pg);
 		} else {
 			/*
 			 * NOTE(review): no PGRP_LOCK here, yet PGRP_UNLOCK
 			 * is called below; presumably pgfind() returns the
 			 * group locked -- confirm.
 			 */
 			pg = pgfind(uap->who);
 			if (pg == NULL) {
 				sx_sunlock(&proctree_lock);
 				break;
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p)) {
 				if (p->p_nice < low)
 					low = p->p_nice;
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pg);
 		break;
 
 	case PRIO_USER:
 		if (uap->who == 0)
 			uap->who = td->td_ucred->cr_uid;
 		sx_slock(&allproc_lock);
 		LIST_FOREACH(p, &allproc, p_list) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p) &&
 			    p->p_ucred->cr_uid == uap->who) {
 				if (p->p_nice < low)
 					low = p->p_nice;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (low == PRIO_MAX + 1 && error == 0)
 		error = ESRCH;
 	td->td_retval[0] = low;
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct setpriority_args {
 	int	which;
 	int	who;
 	int	prio;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 setpriority(td, uap)
 	struct thread *td;
 	struct setpriority_args *uap;
 {
 	struct proc *curp, *p;
 	struct pgrp *pg;
 	int found = 0, error = 0;
 
 	curp = td->td_proc;
 	switch (uap->which) {
 	case PRIO_PROCESS:
 		if (uap->who == 0) {
 			PROC_LOCK(curp);
 			error = donice(td, curp, uap->prio);
 			PROC_UNLOCK(curp);
 		} else {
 			p = pfind(uap->who);
 			if (p == 0)
 				break;
 			if (p_cansee(td, p) == 0)
 				error = donice(td, p, uap->prio);
 			PROC_UNLOCK(p);
 		}
 		found++;
 		break;
 
 	case PRIO_PGRP:
 		sx_slock(&proctree_lock);
 		if (uap->who == 0) {
 			pg = curp->p_pgrp;
 			PGRP_LOCK(pg);
 		} else {
 			pg = pgfind(uap->who);
 			if (pg == NULL) {
 				sx_sunlock(&proctree_lock);
 				break;
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (!p_cansee(td, p)) {
 				error = donice(td, p, uap->prio);
 				found++;
 			}
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pg);
 		break;
 
 	case PRIO_USER:
 		if (uap->who == 0)
 			uap->who = td->td_ucred->cr_uid;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_ucred->cr_uid == uap->who &&
 			    !p_cansee(td, p)) {
 				error = donice(td, p, uap->prio);
 				found++;
 			}
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (found == 0 && error == 0)
 		error = ESRCH;
 	return (error);
 }
 
 /*
  * Set "nice" for a (whole) process.
  */
 static int
 donice(struct thread *td, struct proc *p, int n)
 {
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((error = p_cansched(td, p)))
 		return (error);
 	if (n > PRIO_MAX)
 		n = PRIO_MAX;
 	if (n < PRIO_MIN)
 		n = PRIO_MIN;
  	if (n < p->p_nice && suser(td) != 0)
 		return (EACCES);
 	mtx_lock_spin(&sched_lock);
 	sched_nice(p, n);
 	mtx_unlock_spin(&sched_lock);
 	return (0);
 }
 
 /*
  * Set realtime priority.
  *
  * uap->function selects RTP_LOOKUP (copy the priority out to uap->rtp)
  * or RTP_SET (copy the priority in from uap->rtp and apply it).
  * uap->pid names the target process; 0 means the calling thread.
  *
  * Returns 0 on success, ESRCH if the pid is not found, EINVAL for an
  * unknown function, EPERM when an unprivileged caller targets another
  * pid or requests a type other than RTP_PRIO_NORMAL, or the error
  * from p_cansee()/p_cansched()/copyin()/copyout()/rtp_to_pri().
  *
  * MPSAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rtprio_args {
 	int		function;
 	pid_t		pid;
 	struct rtprio	*rtp;
 };
 #endif
 
 int
 rtprio(td, uap)
 	struct thread *td;		/* curthread */
 	register struct rtprio_args *uap;
 {
 	struct proc *curp;
 	struct proc *p;
 	struct ksegrp *kg;
 	struct rtprio rtp;
 	int cierror, error;
 
 	/*
 	 * Perform copyin before acquiring locks if needed.  The result is
 	 * kept in cierror and only reported later, after the permission
 	 * check in the RTP_SET case.
 	 */
 	if (uap->function == RTP_SET)
 		cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
 	else
 		cierror = 0;
 
 	curp = td->td_proc;
 	if (uap->pid == 0) {
 		p = curp;
 		PROC_LOCK(p);
 	} else {
 		/*
 		 * NOTE(review): no PROC_LOCK here, yet PROC_UNLOCK(p) is
 		 * called below; presumably pfind() returns the process
 		 * locked -- confirm.
 		 */
 		p = pfind(uap->pid);
 		if (p == NULL)
 			return (ESRCH);
 	}
 
 	switch (uap->function) {
 	case RTP_LOOKUP:
 		if ((error = p_cansee(td, p)))
 			break;
 		mtx_lock_spin(&sched_lock);
 		/*
 		 * Return OUR priority if no pid specified,
 		 * or if one is, report the highest priority
 		 * in the process.  There isn't much more you can do as
 		 * there is only room to return a single priority.
 		 * XXXKSE: maybe need a new interface to report
 		 * priorities of multiple system scope threads.
 		 * Note: specifying our own pid is not the same
 		 * as leaving it zero.
 		 */
 		if (uap->pid == 0) {
 			pri_to_rtp(td->td_ksegrp, &rtp);
 		} else {
 			struct rtprio rtp2;
 
 			/* Start from the weakest value and keep the best. */
 			rtp.type = RTP_PRIO_IDLE;
 			rtp.prio = RTP_PRIO_MAX;
 			FOREACH_KSEGRP_IN_PROC(p, kg) {
 				pri_to_rtp(kg, &rtp2);
 				if (rtp2.type <  rtp.type ||
 				    (rtp2.type == rtp.type &&
 				    rtp2.prio < rtp.prio)) {
 					rtp.type = rtp2.type;
 					rtp.prio = rtp2.prio;
 				}
 			}
 		}
 		mtx_unlock_spin(&sched_lock);
 		/* Drop the proc lock before copying out to user space. */
 		PROC_UNLOCK(p);
 		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 	case RTP_SET:
 		if ((error = p_cansched(td, p)) || (error = cierror))
 			break;
 
 		/* Disallow setting rtprio in most cases if not superuser. */
 		if (suser(td) != 0) {
 			/* can't set someone else's */
 			if (uap->pid) {
 				error = EPERM;
 				break;
 			}
 			/* can't set realtime priority */
 /*
  * Realtime priority has to be restricted for reasons which should be
  * obvious.  However, for idle priority, there is a potential for
  * system deadlock if an idleprio process gains a lock on a resource
  * that other processes need (and the idleprio process can't run
  * due to a CPU-bound normal process).  Fix me!  XXX
  */
 #if 0
  			if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 #else
 			if (rtp.type != RTP_PRIO_NORMAL) {
 #endif
 				error = EPERM;
 				break;
 			}
 		}
 
 		/*
 		 * If we are setting our own priority, set just our
 		 * KSEGRP but if we are doing another process,
 		 * do all the groups on that process. If we
 		 * specify our own pid we do the latter.
 		 */
 		mtx_lock_spin(&sched_lock);
 		if (uap->pid == 0) {
 			error = rtp_to_pri(&rtp, td->td_ksegrp);
 		} else {
 			FOREACH_KSEGRP_IN_PROC(p, kg) {
 				if ((error = rtp_to_pri(&rtp, kg)) != 0) {
 					break;
 				}
 			}
 		}
 		mtx_unlock_spin(&sched_lock);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 /*
  * Translate an rtprio request into the user priority and scheduling
  * class of ksegrp "kg".  sched_lock must be held by the caller.
  *
  * Returns EINVAL if rtp->prio exceeds RTP_PRIO_MAX or the base type is
  * not one of realtime, normal or idle; otherwise 0.  When kg belongs
  * to the current thread, that thread's priority is updated as well.
  */
 int
 rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
 {
 	int base;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 	if (rtp->prio > RTP_PRIO_MAX)
 		return (EINVAL);
 
 	/* Pick the bottom of the priority band for the requested type. */
 	switch (RTP_PRIO_BASE(rtp->type)) {
 	case RTP_PRIO_REALTIME:
 		base = PRI_MIN_REALTIME;
 		break;
 	case RTP_PRIO_NORMAL:
 		base = PRI_MIN_TIMESHARE;
 		break;
 	case RTP_PRIO_IDLE:
 		base = PRI_MIN_IDLE;
 		break;
 	default:
 		return (EINVAL);
 	}
 	kg->kg_user_pri = base + rtp->prio;
 
 	sched_class(kg, rtp->type);
 	if (curthread->td_ksegrp == kg)
 		sched_prio(curthread, kg->kg_user_pri); /* XXX dubious */
 
 	return (0);
 }
 
 /*
  * Fill in *rtp from the scheduling class and user priority of ksegrp
  * "kg".  sched_lock must be held by the caller.  rtp->prio is left
  * untouched when the class base is not realtime, timeshare or idle;
  * rtp->type is always set to kg->kg_pri_class.
  */
 void
 pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
 {
 	int class_base;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
 	/* Express the priority relative to the bottom of its band. */
 	class_base = PRI_BASE(kg->kg_pri_class);
 	if (class_base == PRI_REALTIME)
 		rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
 	else if (class_base == PRI_TIMESHARE)
 		rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
 	else if (class_base == PRI_IDLE)
 		rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
 
 	rtp->type = kg->kg_pri_class;
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osetrlimit_args {
 	u_int	which;
 	struct	orlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 osetrlimit(td, uap)
 	struct thread *td;
 	register struct osetrlimit_args *uap;
 {
 	struct orlimit olim;
 	struct rlimit lim;
 	int error;
 
 	if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
 		return (error);
 	lim.rlim_cur = olim.rlim_cur;
 	lim.rlim_max = olim.rlim_max;
 	error = kern_setrlimit(td, uap->which, &lim);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct ogetrlimit_args {
 	u_int	which;
 	struct	orlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 ogetrlimit(td, uap)
 	struct thread *td;
 	register struct ogetrlimit_args *uap;
 {
 	struct orlimit olim;
 	struct rlimit rl;
 	struct proc *p;
 	int error;
 
 	if (uap->which >= RLIM_NLIMITS)
 		return (EINVAL);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	lim_rlimit(p, uap->which, &rl);
 	PROC_UNLOCK(p);
 
 	/*
 	 * XXX would be more correct to convert only RLIM_INFINITY to the
 	 * old RLIM_INFINITY and fail with EOVERFLOW for other larger
 	 * values.  Most 64->32 and 32->16 conversions, including not
 	 * unimportant ones of uids are even more broken than what we
 	 * do here (they blindly truncate).  We don't do this correctly
 	 * here since we have little experience with EOVERFLOW yet.
 	 * Elsewhere, getuid() can't fail...
 	 */
 	olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
 	olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
 	error = copyout(&olim, uap->rlp, sizeof(olim));
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct __setrlimit_args {
 	u_int	which;
 	struct	rlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 setrlimit(td, uap)
 	struct thread *td;
 	register struct __setrlimit_args *uap;
 {
 	struct rlimit alim;
 	int error;
 
 	if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
 		return (error);
 	error = kern_setrlimit(td, uap->which, &alim);
 	return (error);
 }
 
 /*
  * Install *limp as resource limit "which" of the calling thread's
  * process.
  *
  * Negative values are treated as RLIM_INFINITY, the soft limit is
  * capped at the hard limit, and several resources are further capped
  * by system-wide maxima.  *limp is modified in place to the values
  * actually installed.  Raising either value above the current hard
  * limit requires suser_cred() to succeed.  For RLIMIT_STACK the
  * protection of the stack region is adjusted to the new soft limit.
  *
  * Returns 0 on success, EINVAL if "which" is out of range, or the
  * error from suser_cred().
  */
 int
 kern_setrlimit(td, which, limp)
 	struct thread *td;
 	u_int which;
 	struct rlimit *limp;
 {
 	struct plimit *newlim, *oldlim;
 	struct proc *p;
 	register struct rlimit *alimp;
 	rlim_t oldssiz;
 	int error;
 
 	if (which >= RLIM_NLIMITS)
 		return (EINVAL);
 
 	/*
 	 * Preserve historical bugs by treating negative limits as unsigned.
 	 */
 	if (limp->rlim_cur < 0)
 		limp->rlim_cur = RLIM_INFINITY;
 	if (limp->rlim_max < 0)
 		limp->rlim_max = RLIM_INFINITY;
 
 	oldssiz = 0;
 	p = td->td_proc;
 	/* Allocate the replacement before taking the proc lock. */
 	newlim = lim_alloc();
 	PROC_LOCK(p);
 	oldlim = p->p_limit;
 	alimp = &oldlim->pl_rlimit[which];
 	/* Going above the current hard limit needs privilege. */
 	if (limp->rlim_cur > alimp->rlim_max ||
 	    limp->rlim_max > alimp->rlim_max)
 		if ((error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL))) {
 			PROC_UNLOCK(p);
 			lim_free(newlim);
 			return (error);
 		}
 	if (limp->rlim_cur > limp->rlim_max)
 		limp->rlim_cur = limp->rlim_max;
 	/* From here on alimp refers to the slot in the new copy. */
 	lim_copy(newlim, oldlim);
 	alimp = &newlim->pl_rlimit[which];
 
 	switch (which) {
 
 	case RLIMIT_CPU:
 		mtx_lock_spin(&sched_lock);
 		p->p_cpulimit = limp->rlim_cur;
 		mtx_unlock_spin(&sched_lock);
 		break;
 	case RLIMIT_DATA:
 		if (limp->rlim_cur > maxdsiz)
 			limp->rlim_cur = maxdsiz;
 		if (limp->rlim_max > maxdsiz)
 			limp->rlim_max = maxdsiz;
 		break;
 
 	case RLIMIT_STACK:
 		if (limp->rlim_cur > maxssiz)
 			limp->rlim_cur = maxssiz;
 		if (limp->rlim_max > maxssiz)
 			limp->rlim_max = maxssiz;
 		/* Remember the previous soft limit for the fixup below. */
 		oldssiz = alimp->rlim_cur;
 		break;
 
 	case RLIMIT_NOFILE:
 		if (limp->rlim_cur > maxfilesperproc)
 			limp->rlim_cur = maxfilesperproc;
 		if (limp->rlim_max > maxfilesperproc)
 			limp->rlim_max = maxfilesperproc;
 		break;
 
 	case RLIMIT_NPROC:
 		if (limp->rlim_cur > maxprocperuid)
 			limp->rlim_cur = maxprocperuid;
 		if (limp->rlim_max > maxprocperuid)
 			limp->rlim_max = maxprocperuid;
 		if (limp->rlim_cur < 1)
 			limp->rlim_cur = 1;
 		if (limp->rlim_max < 1)
 			limp->rlim_max = 1;
 		break;
 	}
+	if (td->td_proc->p_sysent->sv_fixlimit != NULL)
+		td->td_proc->p_sysent->sv_fixlimit(limp, which);
 	*alimp = *limp;
 	p->p_limit = newlim;
 	PROC_UNLOCK(p);
 	lim_free(oldlim);
 
 	if (which == RLIMIT_STACK) {
 		/*
 		 * Stack is allocated to the max at exec time with only
 		 * "rlim_cur" bytes accessible.  If stack limit is going
 		 * up make more accessible, if going down make inaccessible.
 		 */
 		if (limp->rlim_cur != oldssiz) {
 			vm_offset_t addr;
 			vm_size_t size;
 			vm_prot_t prot;
 
 			if (limp->rlim_cur > oldssiz) {
 				prot = p->p_sysent->sv_stackprot;
 				size = limp->rlim_cur - oldssiz;
 				addr = p->p_sysent->sv_usrstack -
 				    limp->rlim_cur;
 			} else {
 				prot = VM_PROT_NONE;
 				size = oldssiz - limp->rlim_cur;
 				addr = p->p_sysent->sv_usrstack - oldssiz;
 			}
 			addr = trunc_page(addr);
 			size = round_page(size);
 			/* The result of vm_map_protect() is ignored. */
 			(void)vm_map_protect(&p->p_vmspace->vm_map,
 			    addr, addr + size, prot, FALSE);
 		}
 	}
 
-	if (td->td_proc->p_sysent->sv_fixlimits != NULL) {
-		struct image_params imgp;
-
-		imgp.proc = td->td_proc;
-		td->td_proc->p_sysent->sv_fixlimits(&imgp);
-	}
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct __getrlimit_args {
 	u_int	which;
 	struct	rlimit *rlp;
 };
 #endif
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 getrlimit(td, uap)
 	struct thread *td;
 	register struct __getrlimit_args *uap;
 {
 	struct rlimit rlim;
 	struct proc *p;
 	int error;
 
 	if (uap->which >= RLIM_NLIMITS)
 		return (EINVAL);
 	p = td->td_proc;
 	PROC_LOCK(p);
 	lim_rlimit(p, uap->which, &rlim);
 	PROC_UNLOCK(p);
 	error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
 	return (error);
 }
 
 /*
  * Transform the running time and tick information in proc p into user,
  * system, and interrupt time usage.
  */
 void
 calcru(p, up, sp)
 	struct proc *p;
 	struct timeval *up;
 	struct timeval *sp;
 {
 	struct bintime bt;
 	struct rusage_ext rux;
 	struct thread *td;
 	int bt_valid;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&sched_lock, MA_NOTOWNED);
 	bt_valid = 0;
 	mtx_lock_spin(&sched_lock);
 	rux = p->p_rux;
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (TD_IS_RUNNING(td)) {
 			/*
 			 * Adjust for the current time slice.  This is
 			 * actually fairly important since the error here is
 			 * on the order of a time quantum which is much
 			 * greater than the precision of binuptime().
 			 */
 			KASSERT(td->td_oncpu != NOCPU,
 			    ("%s: running thread has no CPU", __func__));
 			if (!bt_valid) {
 				binuptime(&bt);
 				bt_valid = 1;
 			}
 			bintime_add(&rux.rux_runtime, &bt);
 			bintime_sub(&rux.rux_runtime,
 			    &pcpu_find(td->td_oncpu)->pc_switchtime);
 		}
 	}
 	mtx_unlock_spin(&sched_lock);
 	calcru1(p, &rux, up, sp);
 	p->p_rux.rux_uu = rux.rux_uu;
 	p->p_rux.rux_su = rux.rux_su;
 	p->p_rux.rux_iu = rux.rux_iu;
 }
 
 /*
  * Same as calcru(), but for the usage data kept in p->p_crux.  The proc
  * lock must be held.
  */
 void
 calccru(p, up, sp)
 	struct proc *p;
 	struct timeval *up;
 	struct timeval *sp;
 {
 	struct rusage_ext *childrux;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	childrux = &p->p_crux;
 	calcru1(p, childrux, up, sp);
 }
 
 /*
  * Split the total run time in ruxp->rux_runtime into user, system and
  * interrupt microseconds in proportion to the tick counts in *ruxp, never
  * letting any of the three drop below the value previously stored in
  * *ruxp.  The results are stored in ruxp->rux_uu, rux_su and rux_iu, and
  * the user and system times are returned through up and sp.  p is used
  * only to identify the process in the diagnostic messages.
  */
 static void
 calcru1(p, ruxp, up, sp)
 	struct proc *p;
 	struct rusage_ext *ruxp;
 	struct timeval *up;
 	struct timeval *sp;
 {
 	struct timeval tv;
 	/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
 	u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
 
 	ut = ruxp->rux_uticks;
 	st = ruxp->rux_sticks;
 	it = ruxp->rux_iticks;
 	tt = ut + st + it;
 	/*
 	 * With no ticks recorded, pretend there was one system tick so the
 	 * divisions below are defined; all time is then counted as system.
 	 */
 	if (tt == 0) {
 		st = 1;
 		tt = 1;
 	}
 	bintime2timeval(&ruxp->rux_runtime, &tv);
 	tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 	/* ptu is the total that was reported the previous time. */
 	ptu = ruxp->rux_uu + ruxp->rux_su + ruxp->rux_iu;
 	if (tu < ptu) {
 		printf(
 "calcru: runtime went backwards from %ju usec to %ju usec for pid %d (%s)\n",
 		    (uintmax_t)ptu, (uintmax_t)tu, p->p_pid, p->p_comm);
 		tu = ptu;
 	}
 	if ((int64_t)tu < 0) {
 		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
 		    (intmax_t)tu, p->p_pid, p->p_comm);
 		tu = ptu;
 	}
 
 	/* Subdivide tu. */
 	uu = (tu * ut) / tt;
 	su = (tu * st) / tt;
 	iu = tu - uu - su;
 
 	/* Enforce monotonicity. */
 	if (uu < ruxp->rux_uu || su < ruxp->rux_su || iu < ruxp->rux_iu) {
 		/*
 		 * Keep uu at or above its old value, but leave room in tu
 		 * for the old system and interrupt values.
 		 */
 		if (uu < ruxp->rux_uu)
 			uu = ruxp->rux_uu;
 		else if (uu + ruxp->rux_su + ruxp->rux_iu > tu)
 			uu = tu - ruxp->rux_su - ruxp->rux_iu;
 		if (st == 0)
 			su = ruxp->rux_su;
 		else {
 			/* Re-split what is left after uu between system and interrupt. */
 			su = ((tu - uu) * st) / (st + it);
 			if (su < ruxp->rux_su)
 				su = ruxp->rux_su;
 			else if (uu + su + ruxp->rux_iu > tu)
 				su = tu - uu - ruxp->rux_iu;
 		}
 		KASSERT(uu + su + ruxp->rux_iu <= tu,
 		    ("calcru: monotonisation botch 1"));
 		iu = tu - uu - su;
 		KASSERT(iu >= ruxp->rux_iu,
 		    ("calcru: monotonisation botch 2"));
 	}
 	ruxp->rux_uu = uu;
 	ruxp->rux_su = su;
 	ruxp->rux_iu = iu;
 
 	/* Convert the microsecond totals into timevals for the caller. */
 	up->tv_sec = uu / 1000000;
 	up->tv_usec = uu % 1000000;
 	sp->tv_sec = su / 1000000;
 	sp->tv_usec = su % 1000000;
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct getrusage_args {
 	int	who;
 	struct	rusage *rusage;
 };
 #endif
 /*
  * MPSAFE
  */
 int
 getrusage(td, uap)
 	register struct thread *td;
 	register struct getrusage_args *uap;
 {
 	struct rusage ru;
 	int error;
 
 	error = kern_getrusage(td, uap->who, &ru);
 	if (error == 0)
 		error = copyout(&ru, uap->rusage, sizeof(struct rusage));
 	return (error);
 }
 
 int
 kern_getrusage(td, who, rup)
 	struct thread *td;
 	int who;
 	struct rusage *rup;
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	PROC_LOCK(p);
 	switch (who) {
 
 	case RUSAGE_SELF:
 		*rup = p->p_stats->p_ru;
 		calcru(p, &rup->ru_utime, &rup->ru_stime);
 		break;
 
 	case RUSAGE_CHILDREN:
 		*rup = p->p_stats->p_cru;
 		calccru(p, &rup->ru_utime, &rup->ru_stime);
 		break;
 
 	default:
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 /*
  * Accumulate the usage in ru2/rux2 into ru/rux.  Run time, tick counts
  * and microsecond totals are summed, ru_maxrss becomes the larger of the
  * two values, and every long field from ru_first through ru_last is
  * summed.
  */
 void
 ruadd(ru, rux, ru2, rux2)
 	struct rusage *ru;
 	struct rusage_ext *rux;
 	struct rusage *ru2;
 	struct rusage_ext *rux2;
 {
 	long *dst, *src;
 	int idx, nfields;
 
 	bintime_add(&rux->rux_runtime, &rux2->rux_runtime);
 	rux->rux_uticks += rux2->rux_uticks;
 	rux->rux_sticks += rux2->rux_sticks;
 	rux->rux_iticks += rux2->rux_iticks;
 	rux->rux_uu += rux2->rux_uu;
 	rux->rux_su += rux2->rux_su;
 	rux->rux_iu += rux2->rux_iu;
 
 	if (ru2->ru_maxrss > ru->ru_maxrss)
 		ru->ru_maxrss = ru2->ru_maxrss;
 
 	/* ru_first .. ru_last inclusive are walked as an array of longs. */
 	dst = &ru->ru_first;
 	src = &ru2->ru_first;
 	nfields = &ru->ru_last - &ru->ru_first;
 	for (idx = 0; idx <= nfields; idx++)
 		dst[idx] += src[idx];
 }
 
 /*
  * Allocate a resource limits structure.  The new structure starts out
  * with a reference count of one and a mutex taken from the sleep mutex
  * pool; the limit values themselves are not set here.
  */
 struct plimit *
 lim_alloc()
 {
 	struct plimit *newlim;
 
 	newlim = malloc(sizeof(*newlim), M_PLIMIT, M_WAITOK);
 	newlim->pl_mtx = mtx_pool_alloc(mtxpool_sleep);
 	newlim->pl_refcnt = 1;
 	return (newlim);
 }
 
 /*
  * Take an additional reference on limp and return it.
  */
 struct plimit *
 lim_hold(limp)
 	struct plimit *limp;
 {
 
 	LIM_LOCK(limp);
 	limp->pl_refcnt += 1;
 	LIM_UNLOCK(limp);
 
 	return (limp);
 }
 
 /*
  * Drop one reference on limp and free the structure once the last
  * reference is gone.
  */
 void
 lim_free(limp)
 	struct plimit *limp;
 {
 	int last;
 
 	LIM_LOCK(limp);
 	KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
 	last = (--limp->pl_refcnt == 0);
 	LIM_UNLOCK(limp);
 
 	/* The structure is released only after its lock has been dropped. */
 	if (last)
 		free((void *)limp, M_PLIMIT);
 }
 
 /*
  * Copy the limit values of src into dst.  Only the pl_rlimit array is
  * copied; dst must not be shared (its reference count must be one).
  * These structures are shared copy-on-write after fork.
  */
 void
 lim_copy(dst, src)
 	struct plimit *dst, *src;
 {
 
 	KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
 
 	bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(dst->pl_rlimit));
 }
 
 /*
  * Return the hard limit for a particular system resource.  The
  * which parameter specifies the index into the rlimit array.
  */
 rlim_t
 lim_max(struct proc *p, int which)
 {
 	struct rlimit rl;
 
 	lim_rlimit(p, which, &rl);
 	return (rl.rlim_max);
 }
 
 /*
  * Return the current (soft) limit for a particular system resource.
  * The which parameter which specifies the index into the rlimit array
  */
 rlim_t
 lim_cur(struct proc *p, int which)
 {
 	struct rlimit rl;
 
 	lim_rlimit(p, which, &rl);
 	return (rl.rlim_cur);
 }
 
 /*
  * Return a copy of the entire rlimit structure for the system limit
  * specified by 'which' in the rlimit structure pointed to by 'rlp'.
  */
 void
 lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
 {
 
 	/* The caller must hold the proc lock and pass a valid limit index. */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(which >= 0 && which < RLIM_NLIMITS,
 	    ("request for invalid resource limit"));
 	*rlp = p->p_limit->pl_rlimit[which];
 	/*
 	 * The ABI hook, when present, is handed the caller's copy only; the
 	 * limit stored in p->p_limit is not modified here.
 	 */
+	if (p->p_sysent->sv_fixlimit != NULL)
+		p->p_sysent->sv_fixlimit(rlp, which);
 }
 
 /*
  * Initialize the uidinfo hash table and the mutex that protects it.
  * The uidinfo structure found through this table is used to track the
  * total resource consumption (process count, socket buffer size, etc.)
  * for a uid and impose limits.
  */
 void
 uihashinit()
 {
 
 	/* The table is sized from maxproc; hashinit() also sets uihash. */
 	uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
 	mtx_init(&uihashtbl_mtx, "uidinfo hash", NULL, MTX_DEF);
 }
 
 /*
  * Look up a uidinfo struct for the parameter uid.
  * uihashtbl_mtx must be locked.
  */
 static struct uidinfo *
 uilookup(uid)
 	uid_t uid;
 {
 	struct uihashhead *uipp;
 	struct uidinfo *uip;
 
 	mtx_assert(&uihashtbl_mtx, MA_OWNED);
 	uipp = UIHASH(uid);
 	LIST_FOREACH(uip, uipp, ui_hash)
 		if (uip->ui_uid == uid)
 			break;
 
 	return (uip);
 }
 
 /*
  * Return the struct uidinfo for uid, creating and inserting one into the
  * hash if none exists yet.  The returned structure carries an extra
  * reference; release it with uifree().
  */
 struct uidinfo *
 uifind(uid)
 	uid_t uid;
 {
 	struct uidinfo *fresh, *uip;
 
 	mtx_lock(&uihashtbl_mtx);
 	uip = uilookup(uid);
 	if (uip == NULL) {
 		/* The hash lock is not held across the M_WAITOK allocation. */
 		mtx_unlock(&uihashtbl_mtx);
 		fresh = malloc(sizeof(*fresh), M_UIDINFO, M_WAITOK | M_ZERO);
 		mtx_lock(&uihashtbl_mtx);
 
 		/*
 		 * Another thread may have inserted an entry for this uid
 		 * while the lock was dropped, so look again before
 		 * inserting to avoid a duplicate.
 		 */
 		uip = uilookup(uid);
 		if (uip == NULL) {
 			fresh->ui_mtxp = mtx_pool_alloc(mtxpool_sleep);
 			fresh->ui_uid = uid;
 			LIST_INSERT_HEAD(UIHASH(uid), fresh, ui_hash);
 			uip = fresh;
 		} else {
 			/* Lost the race; discard our allocation. */
 			free(fresh, M_UIDINFO);
 		}
 	}
 	uihold(uip);
 	mtx_unlock(&uihashtbl_mtx);
 	return (uip);
 }
 
 /*
  * Add one reference to a uidinfo struct, under the struct's own lock.
  */
 void
 uihold(uip)
 	struct uidinfo *uip;
 {
 
 	UIDINFO_LOCK(uip);
 	uip->ui_ref += 1;
 	UIDINFO_UNLOCK(uip);
 }
 
 /*-
  * Since uidinfo structs have a long lifetime, we use an
  * opportunistic refcounting scheme to avoid locking the lookup hash
  * for each release.
  *
  * If the refcount hits 0, we need to free the structure,
  * which means we need to lock the hash.
  * Optimal case:
  *   After locking the struct and lowering the refcount, if we find
  *   that we don't need to free, simply unlock and return.
  * Suboptimal case:
  *   If refcount lowering results in need to free, bump the count
  *   back up, lose the lock and acquire the locks in the proper
  *   order to try again.
  */
 void
 uifree(uip)
 	struct uidinfo *uip;
 {
 
 	/* Prepare for optimal case. */
 	UIDINFO_LOCK(uip);
 
 	/* Not the last reference: nothing to free, the hash is not touched. */
 	if (--uip->ui_ref != 0) {
 		UIDINFO_UNLOCK(uip);
 		return;
 	}
 
 	/* Prepare for suboptimal case. */
 	/* Restore the count, then retake the locks hash-first. */
 	uip->ui_ref++;
 	UIDINFO_UNLOCK(uip);
 	mtx_lock(&uihashtbl_mtx);
 	UIDINFO_LOCK(uip);
 
 	/*
 	 * We must subtract one from the count again because we backed out
 	 * our initial subtraction before dropping the lock.
 	 * Since another thread may have added a reference after we dropped the
 	 * initial lock we have to test for zero again.
 	 */
 	if (--uip->ui_ref == 0) {
 		/* Once unhashed, the entry can no longer be found by uilookup(). */
 		LIST_REMOVE(uip, ui_hash);
 		mtx_unlock(&uihashtbl_mtx);
 		/* Report counters that are still non-zero at free time. */
 		if (uip->ui_sbsize != 0)
 			printf("freeing uidinfo: uid = %d, sbsize = %jd\n",
 			    uip->ui_uid, (intmax_t)uip->ui_sbsize);
 		if (uip->ui_proccnt != 0)
 			printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
 			    uip->ui_uid, uip->ui_proccnt);
 		UIDINFO_UNLOCK(uip);
 		FREE(uip, M_UIDINFO);
 		return;
 	}
 
 	/* Someone else took a reference in the meantime; keep the entry. */
 	mtx_unlock(&uihashtbl_mtx);
 	UIDINFO_UNLOCK(uip);
 }
 
 /*
  * Adjust the number of processes charged to a user by 'diff'.
  * Returns 0, leaving the count unchanged, when an increase would push the
  * count above 'max'; returns 1 otherwise.  A 'max' of 0 disables the
  * check, and decreases are always allowed.
  */
 int
 chgproccnt(uip, diff, max)
 	struct	uidinfo	*uip;
 	int	diff;
 	int	max;
 {
 	int ok = 1;
 
 	UIDINFO_LOCK(uip);
 	if (diff > 0 && uip->ui_proccnt + diff > max && max != 0) {
 		/* Over the limit: refuse the increase. */
 		ok = 0;
 	} else {
 		uip->ui_proccnt += diff;
 		if (uip->ui_proccnt < 0)
 			printf("negative proccnt for uid = %d\n", uip->ui_uid);
 	}
 	UIDINFO_UNLOCK(uip);
 	return (ok);
 }
 
 /*
  * Adjust the total socket buffer size charged to a user when one buffer's
  * high-water mark changes from *hiwat to 'to'.  Returns 0, changing
  * nothing, when the buffer is growing and the new total would exceed
  * 'max'; otherwise records the new total, stores 'to' in *hiwat and
  * returns 1.
  */
 int
 chgsbsize(uip, hiwat, to, max)
 	struct	uidinfo	*uip;
 	u_int  *hiwat;
 	u_int	to;
 	rlim_t	max;
 {
 	rlim_t total;
 
 	UIDINFO_LOCK(uip);
 	total = uip->ui_sbsize + to - *hiwat;
 	/* Growth beyond max is refused; shrinking is always permitted. */
 	if (to > *hiwat && total > max) {
 		UIDINFO_UNLOCK(uip);
 		return (0);
 	}
 	uip->ui_sbsize = total;
 	UIDINFO_UNLOCK(uip);
 
 	*hiwat = to;
 	if (total < 0)
 		printf("negative sbsize for uid = %d\n", uip->ui_uid);
 	return (1);
 }
Index: stable/6/sys/sys/sysent.h
===================================================================
--- stable/6/sys/sys/sysent.h	(revision 169910)
+++ stable/6/sys/sys/sysent.h	(revision 169911)
@@ -1,137 +1,138 @@
 /*-
  * Copyright (c) 1982, 1988, 1991 The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_SYSENT_H_
 #define	_SYS_SYSENT_H_
 
 #include <bsm/audit.h>
 
+struct rlimit;
 struct thread;
 
 /* Signature shared by all system call implementation functions. */
 typedef	int	sy_call_t(struct thread *, void *);
 
 /* One entry of a system call table. */
 struct sysent {		/* system call table */
 	int	sy_narg;	/* number of arguments */
 	sy_call_t *sy_call;	/* implementing function */
 	au_event_t sy_auevent;	/* audit event associated with syscall */
 };
 
 #define SYF_ARGMASK	0x0000FFFF
 #define SYF_MPSAFE	0x00010000
 
 struct image_params;
 struct __sigset;
 struct trapframe;
 struct vnode;
 
 /*
  * Per-ABI vector of tables, constants and hooks.  A process reaches its
  * vector through p->p_sysent.  Function pointer members may be NULL where
  * noted; sv_fixlimit in particular is called by the resource limit code
  * only when it is non-NULL, with the limit to adjust and its index.
  */
 struct sysentvec {
 	int		sv_size;	/* number of entries */
 	struct sysent	*sv_table;	/* pointer to sysent */
 	u_int		sv_mask;	/* optional mask to index */
 	int		sv_sigsize;	/* size of signal translation table */
 	int		*sv_sigtbl;	/* signal translation table */
 	int		sv_errsize;	/* size of errno translation table */
 	int 		*sv_errtbl;	/* errno translation table */
 	int		(*sv_transtrap)(int, int);
 					/* translate trap-to-signal mapping */
 	int		(*sv_fixup)(register_t **, struct image_params *);
 					/* stack fixup function */
 	void		(*sv_sendsig)(void (*)(int), int, struct __sigset *,
 			    u_long);	/* send signal */
 	char 		*sv_sigcode;	/* start of sigtramp code */
 	int 		*sv_szsigcode;	/* size of sigtramp code */
 	void		(*sv_prepsyscall)(struct trapframe *, int *, u_int *,
 			    caddr_t *);
 	char		*sv_name;	/* name of binary type */
 	int		(*sv_coredump)(struct thread *, struct vnode *, off_t);
 					/* function to dump core, or NULL */
 	int		(*sv_imgact_try)(struct image_params *);
 	int		sv_minsigstksz;	/* minimum signal stack size */
 	int		sv_pagesize;	/* pagesize */
 	vm_offset_t	sv_minuser;	/* VM_MIN_ADDRESS */
 	vm_offset_t	sv_maxuser;	/* VM_MAXUSER_ADDRESS */
 	vm_offset_t	sv_usrstack;	/* USRSTACK */
 	vm_offset_t	sv_psstrings;	/* PS_STRINGS */
 	int		sv_stackprot;	/* vm protection for stack */
 	register_t	*(*sv_copyout_strings)(struct image_params *);
 	void		(*sv_setregs)(struct thread *, u_long, u_long, u_long);
-	void		(*sv_fixlimits)(struct image_params *);
+	void		(*sv_fixlimit)(struct rlimit *, int);
 };
 
 #ifdef _KERNEL
 extern struct sysentvec aout_sysvec;
 extern struct sysentvec elf_freebsd_sysvec;
 extern struct sysentvec null_sysvec;
 extern struct sysent sysent[];
 
 #define NO_SYSCALL (-1)
 
 struct module;
 
 /*
  * Per-module data for a system call module.  SYSCALL_MODULE() below
  * defines one of these and hands its address to the module's
  * moduledata_t alongside syscall_module_handler.
  */
 struct syscall_module_data {
        int     (*chainevh)(struct module *, int, void *); /* next handler */
        void    *chainarg;      /* arg for next event handler */
        int     *offset;         /* offset into sysent */
        struct  sysent *new_sysent; /* new sysent */
        struct  sysent old_sysent; /* old sysent */
 };
 
 /*
  * Declare a system call module called 'name'.  Expands to a static
  * struct syscall_module_data (name_syscall_mod) initialized from evh, arg,
  * offset and new_sysent with an empty old_sysent, a moduledata_t
  * (name_mod) that uses syscall_module_handler as its event handler, and a
  * DECLARE_MODULE() at SI_SUB_SYSCALLS / SI_ORDER_MIDDLE.
  */
 #define SYSCALL_MODULE(name, offset, new_sysent, evh, arg)     \
 static struct syscall_module_data name##_syscall_mod = {       \
        evh, arg, offset, new_sysent, { 0, NULL }               \
 };                                                             \
                                                                \
 static moduledata_t name##_mod = {                             \
        #name,                                                  \
        syscall_module_handler,                                 \
        &name##_syscall_mod                                     \
 };                                                             \
 DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE)
 
 /*
  * Convenience wrapper around SYSCALL_MODULE() for a system call function
  * 'syscallname' that has a SYS_syscallname number and a
  * struct syscallname_args.  It defines the offset variable
  * (initialized to SYS_syscallname) and a sysent whose argument count is
  * the args struct size in units of register_t, then invokes
  * SYSCALL_MODULE() with no chained event handler.
  */
 #define SYSCALL_MODULE_HELPER(syscallname)              \
 static int syscallname##_syscall = SYS_##syscallname;   \
 static struct sysent syscallname##_sysent = {           \
     (sizeof(struct syscallname ## _args )               \
      / sizeof(register_t)),                             \
     (sy_call_t *)& syscallname                          \
 };                                                      \
 SYSCALL_MODULE(syscallname,                             \
     & syscallname##_syscall, & syscallname##_sysent,    \
     NULL, NULL);
 
 int    syscall_register(int *offset, struct sysent *new_sysent,
 	    struct sysent *old_sysent);
 int    syscall_deregister(int *offset, struct sysent *old_sysent);
 int    syscall_module_handler(struct module *mod, int what, void *arg);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_SYSENT_H_ */