Index: head/sys/amd64/amd64/machdep.c
===================================================================
--- head/sys/amd64/amd64/machdep.c (revision 225616)
+++ head/sys/amd64/amd64/machdep.c (revision 225617)
@@ -1,2423 +1,2423 @@
/*-
* Copyright (c) 2003 Peter Wemm.
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atalk.h"
#include "opt_atpic.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_perfmon.h"
#include "opt_sched.h"
#include "opt_kdtrace.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <net/netisr.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#include <machine/tss.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <machine/apicvar.h>
#endif
#include <isa/isareg.h>
#include <isa/rtc.h>
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
extern u_int64_t hammer_time(u_int64_t, u_int64_t);
extern void printcpuinfo(void); /* XXX header file */
extern void identify_cpu(void);
extern void panicifcpuunsupported(void);
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
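/*
 * Illustrative reading of the two checks above: CS_SECURE() accepts a
 * %cs selector only if its requested privilege level is SEL_UPL (user),
 * and EFL_SECURE() accepts a new rflags value only if every bit that
 * differs from the old value lies within PSL_USERCHANGE.  A user
 * toggling an arithmetic flag such as PSL_C therefore passes, while an
 * attempt to raise the IOPL field does not.
 */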
static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
/*
* The file "conf/ldscript.amd64" defines the symbol "kernphys". Its value is
* the physical address at which the kernel is loaded.
*/
extern char kernphys[];
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
struct msgbuf *msgbufp;
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN (ICH_PMBASE + 0x30)
int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
int cold = 1;
long Maxmem = 0;
long realmem = 0;
/*
* The number of PHYSMAP entries must be one less than the number of
* PHYSSEG entries because the PHYSMAP entry that spans the largest
* physical address that is accessible by ISA DMA is split into two
* PHYSSEG entries.
*/
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
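/*
 * For illustration, phys_avail[] and dump_avail[] hold base/bound pairs
 * terminated by a 0, 0 pair, e.g. { start0, end0, start1, end1, 0, 0 };
 * consumers walk them until phys_avail[i + 1] == 0, which is why the
 * *_ARRAY_END macros above stop two entries short of the array size.
 */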
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct region_descriptor r_gdt, r_idt;
struct pcpu __pcpu[MAXCPU];
struct mtx icu_lock;
struct mtx dt_lock; /* lock for GDT and LDT */
static void
cpu_startup(dummy)
void *dummy;
{
uintmax_t memsize;
char *sysenv;
/*
* On MacBooks, we need to prevent the legacy USB circuit from
* generating an SMI# because this can cause several problems,
* namely: incorrect CPU frequency detection and failure to
* start the APs.
* We do this by disabling a bit in the SMI_EN (SMI Control and
* Enable register) of the Intel ICH LPC Interface Bridge.
*/
sysenv = getenv("smbios.system.product");
if (sysenv != NULL) {
if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
strncmp(sysenv, "MacBook3,1", 10) == 0 ||
strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
strncmp(sysenv, "Macmini1,1", 10) == 0) {
if (bootverbose)
printf("Disabling LEGACY_USB_EN bit on "
"Intel ICH.\n");
outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
}
freeenv(sysenv);
}
/*
* Good {morning,afternoon,evening,night}.
*/
startrtclock();
printcpuinfo();
panicifcpuunsupported();
#ifdef PERFMON
perfmon_init();
#endif
realmem = Maxmem;
/*
* Display physical memory if SMBIOS reports a reasonable amount.
*/
memsize = 0;
sysenv = getenv("smbios.memory.enabled");
if (sysenv != NULL) {
memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
freeenv(sysenv);
}
if (memsize < ptoa((uintmax_t)cnt.v_free_count))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf(
"0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
ptoa((uintmax_t)cnt.v_free_count),
ptoa((uintmax_t)cnt.v_free_count) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
cpu_setregs();
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by call
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe sf, *sfp;
struct pcb *pcb;
struct proc *p;
struct thread *td;
struct sigacts *psp;
char *sp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
pcb = td->td_pcb;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_rsp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
get_fpcontext(td, &sf.sf_uc.uc_mcontext);
fpstate_drop(td);
sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
bzero(sf.sf_uc.uc_mcontext.mc_spare,
sizeof(sf.sf_uc.uc_mcontext.mc_spare));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sp = td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
regs->tf_rdi = sig; /* arg 1 in %rdi */
regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
} else {
/* Old FreeBSD-style arguments. */
regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_rsp = (long)sfp;
regs->tf_rip = p->p_sysent->sv_sigcode_base;
regs->tf_rflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _ufssel;
regs->tf_gs = _ugssel;
regs->tf_flags = TF_HASSEGS;
set_pcb_flags(pcb, PCB_FULL_IRET);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
ucontext_t uc;
struct pcb *pcb;
struct proc *p;
struct trapframe *regs;
ucontext_t *ucp;
long rflags;
int cs, error, ret;
ksiginfo_t ksi;
pcb = td->td_pcb;
p = td->td_proc;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0) {
uprintf("pid %d (%s): sigreturn copyin failed\n",
p->p_pid, td->td_name);
return (error);
}
ucp = &uc;
if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
td->td_name, ucp->uc_mcontext.mc_flags);
return (EINVAL);
}
regs = td->td_frame;
rflags = ucp->uc_mcontext.mc_rflags;
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_rflags for faults. Debuggers
* should sometimes set it there too. tf_rflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
td->td_name, rflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_rip;
trapsignal(td, &ksi);
return (EINVAL);
}
ret = set_fpcontext(td, &ucp->uc_mcontext);
if (ret != 0) {
uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
p->p_pid, td->td_name, ret);
return (ret);
}
bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
set_pcb_flags(pcb, PCB_FULL_IRET);
return (EJUSTRETURN);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{
- return sigreturn(td, (struct sigreturn_args *)uap);
+ return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
/*
* Machine dependent boot() routine
*
* I haven't seen anything to put here yet
* Possibly some stuff might be grafted back here from boot()
*/
void
cpu_boot(int howto)
{
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* Not applicable */
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
uint64_t tsc1, tsc2;
uint64_t acnt, mcnt, perf;
register_t reg;
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
/*
* If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
* DELAY(9) based logic fails.
*/
if (tsc_is_invariant && !tsc_perf_stat)
return (EOPNOTSUPP);
#ifdef SMP
if (smp_cpus > 1) {
/* Schedule ourselves on the indicated cpu. */
thread_lock(curthread);
sched_bind(curthread, cpu_id);
thread_unlock(curthread);
}
#endif
/* Calibrate by measuring a short delay. */
reg = intr_disable();
if (tsc_is_invariant) {
wrmsr(MSR_MPERF, 0);
wrmsr(MSR_APERF, 0);
tsc1 = rdtsc();
DELAY(1000);
mcnt = rdmsr(MSR_MPERF);
acnt = rdmsr(MSR_APERF);
tsc2 = rdtsc();
intr_restore(reg);
perf = 1000 * acnt / mcnt;
*rate = (tsc2 - tsc1) * perf;
} else {
tsc1 = rdtsc();
DELAY(1000);
tsc2 = rdtsc();
intr_restore(reg);
*rate = (tsc2 - tsc1) * 1000;
}
#ifdef SMP
if (smp_cpus > 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
#endif
return (0);
}
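/*
 * Rough worked example of the estimate above, with assumed numbers: a
 * TSC delta of 2,400,000 over the 1000 us DELAY() window with
 * acnt == mcnt gives perf = 1000 and *rate = 2,400,000 * 1000 = 2.4e9,
 * i.e. an estimated clock of roughly 2.4 GHz.
 */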
/*
* Shutdown the CPU as much as possible
*/
void
cpu_halt(void)
{
for (;;)
__asm__ ("hlt");
}
void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
0, "Use MONITOR/MWAIT for short idle");
#define STATE_RUNNING 0x0
#define STATE_MWAIT 0x1
#define STATE_SLEEPING 0x2
static void
cpu_idle_acpi(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
cpu_idle_hook();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
static void
cpu_idle_hlt(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
/*
* We must absolutely guarantee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
/*
* MWAIT cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
static void
cpu_idle_mwait(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
if (!sched_runnable()) {
cpu_monitor(state, 0, 0);
if (*state == STATE_MWAIT)
cpu_mwait(0, MWAIT_C1);
}
*state = STATE_RUNNING;
}
static void
cpu_idle_spin(int busy)
{
int *state;
int i;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
cpu_spinwait();
}
}
/*
* C1E renders the local APIC timer dead, so we disable it by
* reading the Interrupt Pending Message register and clearing
* both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
*
* Reference:
* "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
* #32559 revision 3.00+
*/
#define MSR_AMDK8_IPM 0xc0010055
#define AMDK8_SMIONCMPHALT (1ULL << 27)
#define AMDK8_C1EONCMPHALT (1ULL << 28)
#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
static void
cpu_probe_amdc1e(void)
{
/*
* Detect the presence of C1E capability mostly on latest
* dual-cores (or future) k8 family.
*/
if (cpu_vendor_id == CPU_VENDOR_AMD &&
(cpu_id & 0x00000f00) == 0x00000f00 &&
(cpu_id & 0x0fff0000) >= 0x00040000) {
cpu_ident_amdc1e = 1;
}
}
void (*cpu_idle_fn)(int) = cpu_idle_acpi;
void
cpu_idle(int busy)
{
uint64_t msr;
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
#ifdef MP_WATCHDOG
ap_watchdog(PCPU_GET(cpuid));
#endif
/* If we are busy - try to use fast methods. */
if (busy) {
if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
cpu_idle_mwait(busy);
goto out;
}
}
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
cpu_idleclock();
}
/* Apply AMD APIC timer C1E workaround. */
if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
msr = rdmsr(MSR_AMDK8_IPM);
if (msr & AMDK8_CMPHALT)
wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
}
/* Call main idle method. */
cpu_idle_fn(busy);
/* Switch timers back into active mode. */
if (!busy) {
cpu_activeclock();
critical_exit();
}
out:
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
busy, curcpu);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *state;
pcpu = pcpu_find(cpu);
state = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
if (*state == STATE_SLEEPING)
return (0);
if (*state == STATE_MWAIT)
*state = STATE_RUNNING;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_hlt, "hlt" },
{ cpu_idle_acpi, "acpi" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
p += sprintf(p, "%s%s", p != avail ? ", " : "",
idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
/*
* Reset registers to default values on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *regs = td->td_frame;
struct pcb *pcb = td->td_pcb;
mtx_lock(&dt_lock);
if (td->td_proc->p_md.md_ldt != NULL)
user_ldt_free(td);
else
mtx_unlock(&dt_lock);
pcb->pcb_fsbase = 0;
pcb->pcb_gsbase = 0;
clear_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT);
pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
set_pcb_flags(pcb, PCB_FULL_IRET);
bzero((char *)regs, sizeof(struct trapframe));
regs->tf_rip = imgp->entry_addr;
regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
regs->tf_rdi = stack; /* argv */
regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
regs->tf_ss = _udatasel;
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _ufssel;
regs->tf_gs = _ugssel;
regs->tf_flags = TF_HASSEGS;
td->td_retval[1] = 0;
/*
* Reset the hardware debug registers if they were in use.
* They won't have any meaning for the newly exec'd process.
*/
if (pcb->pcb_flags & PCB_DBREGS) {
pcb->pcb_dr0 = 0;
pcb->pcb_dr1 = 0;
pcb->pcb_dr2 = 0;
pcb->pcb_dr3 = 0;
pcb->pcb_dr6 = 0;
pcb->pcb_dr7 = 0;
if (pcb == PCPU_GET(curpcb)) {
/*
* Clear the debug registers on the running
* CPU, otherwise they will end up affecting
* the next process we switch to.
*/
reset_dbregs();
}
clear_pcb_flags(pcb, PCB_DBREGS);
}
/*
* Drop the FP state if we hold it, so that the process gets a
* clean FP state if it uses the FPU again.
*/
fpstate_drop(td);
}
void
cpu_setregs(void)
{
register_t cr0;
cr0 = rcr0();
/*
* CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
* BSP. See the comments there about why we set them.
*/
cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
load_cr0(cr0);
}
/*
* Initialize amd64 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
static char dblfault_stack[PAGE_SIZE] __aligned(16);
static char nmi0_stack[PAGE_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);
struct amd64tss common_tss[MAXCPU];
/*
* Software prototypes -- in more palatable form.
*
* Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
* slots as corresponding segments for i386 kernel.
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GNULL2_SEL 1 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUFS32_SEL 2 32 bit %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS32_SEL 3 32 bit %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 8 64 bit Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_long = 1,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = sizeof(struct amd64tss) + IOPAGES * PAGE_SIZE - 1,
.ssd_type = SDT_SYSTSS,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Actually, the TSS is a system descriptor which is double size */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 LDT Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 12 LDT Descriptor, double size */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_long = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
void
setidt(idx, func, typ, dpl, ist)
int idx;
inthand_t *func;
int typ;
int dpl;
int ist;
{
struct gate_descriptor *ip;
ip = idt + idx;
ip->gd_looffset = (uintptr_t)func;
ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
ip->gd_ist = ist;
ip->gd_xx = 0;
ip->gd_type = typ;
ip->gd_dpl = dpl;
ip->gd_p = 1;
ip->gd_hioffset = ((uintptr_t)func)>>16 ;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(dblfault),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
DB_SHOW_COMMAND(idt, db_show_idt)
{
struct gate_descriptor *ip;
int idx;
uintptr_t func;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
if (func != (uintptr_t)&IDTVEC(rsvd)) {
db_printf("%3d\t", idx);
db_printsym(func, DB_STGY_PROC);
db_printf("\n");
}
ip++;
}
}
#endif
void
sdtossd(sd, ssd)
struct user_segment_descriptor *sd;
struct soft_segment_descriptor *ssd;
{
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
ssd->ssd_type = sd->sd_type;
ssd->ssd_dpl = sd->sd_dpl;
ssd->ssd_p = sd->sd_p;
ssd->ssd_long = sd->sd_long;
ssd->ssd_def32 = sd->sd_def32;
ssd->ssd_gran = sd->sd_gran;
}
void
ssdtosd(ssd, sd)
struct soft_segment_descriptor *ssd;
struct user_segment_descriptor *sd;
{
sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
sd->sd_type = ssd->ssd_type;
sd->sd_dpl = ssd->ssd_dpl;
sd->sd_p = ssd->ssd_p;
sd->sd_long = ssd->ssd_long;
sd->sd_def32 = ssd->ssd_def32;
sd->sd_gran = ssd->ssd_gran;
}
void
ssdtosyssd(ssd, sd)
struct soft_segment_descriptor *ssd;
struct system_segment_descriptor *sd;
{
sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
sd->sd_type = ssd->ssd_type;
sd->sd_dpl = ssd->ssd_dpl;
sd->sd_p = ssd->ssd_p;
sd->sd_gran = ssd->ssd_gran;
}
#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
* Return a bitmap of the current interrupt requests. This is 8259-specific
* and is only suitable for use at probe time.
* This is only here to pacify sio. It is NOT FATAL if this doesn't work.
* It shouldn't be here. There should probably be an APIC centric
* implementation in the apic driver code, if at all.
*/
intrmask_t
isa_irq_pending(void)
{
u_char irr1;
u_char irr2;
irr1 = inb(IO_ICU1);
irr2 = inb(IO_ICU2);
return ((irr2 << 8) | irr1);
}
#endif
u_int basemem;
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
int i, insert_idx, physmap_idx;
physmap_idx = *physmap_idxp;
if (boothowto & RB_VERBOSE)
printf("SMAP type=%02x base=%016lx len=%016lx\n",
smap->type, smap->base, smap->length);
if (smap->type != SMAP_TYPE_MEMORY)
return (1);
if (smap->length == 0)
return (0);
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
*/
insert_idx = physmap_idx + 2;
for (i = 0; i <= physmap_idx; i += 2) {
if (smap->base < physmap[i + 1]) {
if (smap->base + smap->length <= physmap[i]) {
insert_idx = i;
break;
}
if (boothowto & RB_VERBOSE)
printf(
"Overlapping memory regions, ignoring second region\n");
return (1);
}
}
/* See if we can prepend to the next entry. */
if (insert_idx <= physmap_idx &&
smap->base + smap->length == physmap[insert_idx]) {
physmap[insert_idx] = smap->base;
return (1);
}
/* See if we can append to the previous entry. */
if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
physmap[insert_idx - 1] += smap->length;
return (1);
}
physmap_idx += 2;
*physmap_idxp = physmap_idx;
if (physmap_idx == PHYSMAP_SIZE) {
printf(
"Too many segments in the physical address map, giving up\n");
return (0);
}
/*
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
for (i = physmap_idx; i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
physmap[insert_idx] = smap->base;
physmap[insert_idx + 1] = smap->base + smap->length;
return (1);
}
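/*
 * To illustrate the cases above with hypothetical addresses: if physmap
 * already holds the pair { 0x100000, 0x200000 }, an SMAP entry covering
 * 0x200000-0x240000 extends that pair's bound (append to the previous
 * entry), one covering 0xc0000-0x100000 lowers its base (prepend to the
 * next entry), and a disjoint entry such as 0x400000-0x500000 is
 * inserted as a new base/bound pair of its own.
 */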
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
* build the phys_avail array describing the actually-available memory.
*
* Total memory size may be set by the kernel environment variable
* hw.physmem or the compile-time define MAXMEM.
*
* XXX first should be vm_paddr_t.
*/
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
int i, physmap_idx, pa_indx, da_indx;
vm_paddr_t pa, physmap[PHYSMAP_SIZE];
u_long physmem_tunable, memtest;
pt_entry_t *pte;
struct bios_smap *smapbase, *smap, *smapend;
u_int32_t smapsize;
quad_t dcons_addr, dcons_size;
bzero(physmap, sizeof(physmap));
basemem = 0;
physmap_idx = 0;
/*
* get memory map from INT 15:E820, kindly supplied by the loader.
*
* subr_module.c says:
* "Consumer may safely assume that size value precedes data."
* ie: an int32_t immediately precedes smap.
*/
smapbase = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
if (smapbase == NULL)
panic("No BIOS smap info from loader!");
smapsize = *((u_int32_t *)smapbase - 1);
smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
for (smap = smapbase; smap < smapend; smap++)
if (!add_smap_entry(smap, physmap, &physmap_idx))
break;
/*
* Find the 'base memory' segment for SMP
*/
basemem = 0;
for (i = 0; i <= physmap_idx; i += 2) {
if (physmap[i] == 0x00000000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
if (basemem == 0)
panic("BIOS smap did not include a basemem segment!");
#ifdef SMP
/* make hole for AP bootstrap code */
physmap[1] = mp_bootaddress(physmap[1] / 1024);
#endif
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage". We may adjust this
* based on ``hw.physmem'' and the results of the memory test.
*/
Maxmem = atop(physmap[physmap_idx + 1]);
#ifdef MAXMEM
Maxmem = MAXMEM / 4;
#endif
if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);
/*
* By default keep the memtest enabled. Use a general name so that
* one could eventually do more with the code than just disable it.
*/
memtest = 1;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
/*
* Don't allow MAXMEM or hw.physmem to extend the amount of memory
* in the system.
*/
if (Maxmem > atop(physmap[physmap_idx + 1]))
Maxmem = atop(physmap[physmap_idx + 1]);
if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(&first);
/*
* Size up each available chunk of physical memory.
*/
physmap[0] = PAGE_SIZE; /* mask off page 0 */
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
pte = CMAP1;
/*
* Get dcons buffer address
*/
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;
/*
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr = (int *)CADDR1;
full = FALSE;
/*
* block out kernel memory as not available.
*/
if (pa >= (vm_paddr_t)kernphys && pa < first)
goto do_dump_avail;
/*
* block out dcons buffer
*/
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;
page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;
/*
* map page into kernel: valid, read/write, non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
invltlb();
tmp = *(int *)ptr;
/*
* Test for alternating 1's and 0's
*/
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
* Test for alternating 0's and 1's
*/
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
* Test for all 1's
*/
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
* Test for all 0's
*/
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
* Restore original value.
*/
*(int *)ptr = tmp;
skip_memtest:
/*
* Adjust array of valid/good pages.
*/
if (page_bad == TRUE)
continue;
/*
* If this good page is a continuation of the
* previous set of good pages, then just increase
* the end pointer. Otherwise start a new chunk.
* Note that "end" points one higher than end,
* making the range >= start and < end.
* If we're also doing a speculative memory
* test and we at or past the end, bump up Maxmem
* so that we keep going. The first bad page
* will terminate the loop.
*/
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ARRAY_END) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == DUMP_AVAIL_ARRAY_END) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
if (full)
break;
}
}
*pte = 0;
invltlb();
/*
* XXX
* The last chunk must contain at least one page plus the message
* buffer to avoid complicating other code (message buffer address
* calculation, etc.).
*/
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}
Maxmem = atop(phys_avail[pa_indx]);
/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);
/* Map the message buffer. */
msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}
u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
caddr_t kmdp;
int gsel_tss, x;
struct pcpu *pc;
struct nmi_pcpu *np;
u_int64_t msr;
char *env;
size_t kstack0_sz;
thread0.td_kstack = physfree + KERNBASE;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
bzero((void *)thread0.td_kstack, kstack0_sz);
physfree += kstack0_sz;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
preload_bootstrap_relocate(KERNBASE);
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf64 kernel");
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + KERNBASE;
#ifdef DDB
ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
#endif
/* Init basic tunables, hz etc */
init_param1();
/*
* make gdt memory segments
*/
for (x = 0; x < NGDT; x++) {
if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
ssdtosd(&gdt_segs[x], &gdt[x]);
}
gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
ssdtosyssd(&gdt_segs[GPROC0_SEL],
(struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
r_gdt.rd_base = (long) gdt;
lgdt(&r_gdt);
pc = &__pcpu[0];
wrmsr(MSR_FSBASE, 0); /* User value */
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
pcpu_init(pc, 0, sizeof(struct pcpu));
dpcpu_init((void *)(physfree + KERNBASE), 0);
physfree += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
PCPU_SET(tssp, &common_tss[0]);
PCPU_SET(commontssp, &common_tss[0]);
PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
/* exceptions */
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DE, &IDTVEC(div), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYSIGT, SEL_UPL, 0);
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_UD, &IDTVEC(ill), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_NM, &IDTVEC(dna), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_TS, &IDTVEC(tss), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_NP, &IDTVEC(missing), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_SS, &IDTVEC(stk), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_GP, &IDTVEC(prot), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_PF, &IDTVEC(page), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (long) idt;
lidt(&r_idt);
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
/*
* Initialize the console before we print anything out.
*/
cninit();
#ifdef DEV_ISA
#ifdef DEV_ATPIC
elcr_probe();
atpic_startup();
#else
/* Reset and mask the atpics and leave them shut down. */
atpic_reset();
/*
* Point the ICU spurious interrupt vectors at the APIC spurious
* interrupt handler.
*/
setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?";
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS,
"Boot flags requested debugger");
#endif
identify_cpu(); /* Final stage of CPU initialization */
initializecpu(); /* Initialize CPU registers */
initializecpucache();
/* make an initial tss so cpu can get interrupt stack on syscall! */
common_tss[0].tss_rsp0 = thread0.td_kstack +
kstack0_sz - sizeof(struct pcb);
/* Ensure the stack is aligned to 16 bytes */
common_tss[0].tss_rsp0 &= ~0xFul;
PCPU_SET(rsp0, common_tss[0].tss_rsp0);
/* doublefault stack space, runs on ist1 */
common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
/*
* NMI stack, runs on ist2. The pcpu pointer is stored just
* above the start of the ist2 stack.
*/
np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
np->np_pcpu = (register_t) pc;
common_tss[0].tss_ist2 = (long) np;
/* Set the IO permission bitmap (empty due to tss seg limit) */
common_tss[0].tss_iobase = sizeof(struct amd64tss) +
IOPAGES * PAGE_SIZE;
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
ltr(gsel_tss);
/* Set up the fast syscall stuff */
msr = rdmsr(MSR_EFER) | EFER_SCE;
wrmsr(MSR_EFER, msr);
wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
wrmsr(MSR_STAR, msr);
wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D);
getmemsize(kmdp, physfree);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
fpuinit();
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
load_ds(_udatasel);
load_es(_udatasel);
load_fs(_ufssel);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
thread0.td_pcb->pcb_cr3 = KPML4phys;
thread0.td_frame = &proc0_tf;
env = getenv("kernelname");
if (env != NULL)
strlcpy(kernelname, env, sizeof(kernelname));
#ifdef XENHVM
if (inw(0x10) == 0x49d2) {
if (bootverbose)
printf("Xen detected: disabling emulated block and network devices\n");
outw(0x10, 3);
}
#endif
cpu_probe_amdc1e();
/* Location of kernel stack for locore */
return ((u_int64_t)thread0.td_pcb);
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
void
spinlock_enter(void)
{
struct thread *td;
register_t flags;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
flags = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = flags;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t flags;
td = curthread;
critical_exit();
flags = td->td_md.md_saved_flags;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(flags);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_r12 = tf->tf_r12;
pcb->pcb_r13 = tf->tf_r13;
pcb->pcb_r14 = tf->tf_r14;
pcb->pcb_r15 = tf->tf_r15;
pcb->pcb_rbp = tf->tf_rbp;
pcb->pcb_rbx = tf->tf_rbx;
pcb->pcb_rip = tf->tf_rip;
pcb->pcb_rsp = tf->tf_rsp;
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
td->td_frame->tf_rip = addr;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_rflags |= PSL_T;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_rflags &= ~PSL_T;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tp;
tp = td->td_frame;
return (fill_frame_regs(tp, regs));
}
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
regs->r_r15 = tp->tf_r15;
regs->r_r14 = tp->tf_r14;
regs->r_r13 = tp->tf_r13;
regs->r_r12 = tp->tf_r12;
regs->r_r11 = tp->tf_r11;
regs->r_r10 = tp->tf_r10;
regs->r_r9 = tp->tf_r9;
regs->r_r8 = tp->tf_r8;
regs->r_rdi = tp->tf_rdi;
regs->r_rsi = tp->tf_rsi;
regs->r_rbp = tp->tf_rbp;
regs->r_rbx = tp->tf_rbx;
regs->r_rdx = tp->tf_rdx;
regs->r_rcx = tp->tf_rcx;
regs->r_rax = tp->tf_rax;
regs->r_rip = tp->tf_rip;
regs->r_cs = tp->tf_cs;
regs->r_rflags = tp->tf_rflags;
regs->r_rsp = tp->tf_rsp;
regs->r_ss = tp->tf_ss;
if (tp->tf_flags & TF_HASSEGS) {
regs->r_ds = tp->tf_ds;
regs->r_es = tp->tf_es;
regs->r_fs = tp->tf_fs;
regs->r_gs = tp->tf_gs;
} else {
regs->r_ds = 0;
regs->r_es = 0;
regs->r_fs = 0;
regs->r_gs = 0;
}
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tp;
register_t rflags;
tp = td->td_frame;
rflags = regs->r_rflags & 0xffffffff;
if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
return (EINVAL);
tp->tf_r15 = regs->r_r15;
tp->tf_r14 = regs->r_r14;
tp->tf_r13 = regs->r_r13;
tp->tf_r12 = regs->r_r12;
tp->tf_r11 = regs->r_r11;
tp->tf_r10 = regs->r_r10;
tp->tf_r9 = regs->r_r9;
tp->tf_r8 = regs->r_r8;
tp->tf_rdi = regs->r_rdi;
tp->tf_rsi = regs->r_rsi;
tp->tf_rbp = regs->r_rbp;
tp->tf_rbx = regs->r_rbx;
tp->tf_rdx = regs->r_rdx;
tp->tf_rcx = regs->r_rcx;
tp->tf_rax = regs->r_rax;
tp->tf_rip = regs->r_rip;
tp->tf_cs = regs->r_cs;
tp->tf_rflags = rflags;
tp->tf_rsp = regs->r_rsp;
tp->tf_ss = regs->r_ss;
if (0) { /* XXXKIB */
tp->tf_ds = regs->r_ds;
tp->tf_es = regs->r_es;
tp->tf_fs = regs->r_fs;
tp->tf_gs = regs->r_gs;
tp->tf_flags = TF_HASSEGS;
set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
}
return (0);
}
/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
/* pcb -> fpregs */
bzero(fpregs, sizeof(*fpregs));
/* FPU control/status */
penv_fpreg->en_cw = penv_xmm->en_cw;
penv_fpreg->en_sw = penv_xmm->en_sw;
penv_fpreg->en_tw = penv_xmm->en_tw;
penv_fpreg->en_opcode = penv_xmm->en_opcode;
penv_fpreg->en_rip = penv_xmm->en_rip;
penv_fpreg->en_rdp = penv_xmm->en_rdp;
penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
/* FPU registers */
for (i = 0; i < 8; ++i)
bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
/* SSE registers */
for (i = 0; i < 16; ++i)
bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}
/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
struct envxmm *penv_xmm = &sv_xmm->sv_env;
struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
int i;
/* fpregs -> pcb */
/* FPU control/status */
penv_xmm->en_cw = penv_fpreg->en_cw;
penv_xmm->en_sw = penv_fpreg->en_sw;
penv_xmm->en_tw = penv_fpreg->en_tw;
penv_xmm->en_opcode = penv_fpreg->en_opcode;
penv_xmm->en_rip = penv_fpreg->en_rip;
penv_xmm->en_rdp = penv_fpreg->en_rdp;
penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
/* FPU registers */
for (i = 0; i < 8; ++i)
bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
/* SSE registers */
for (i = 0; i < 16; ++i)
bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}
/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
KASSERT(td == curthread || TD_IS_SUSPENDED(td),
("not suspended thread %p", td));
fpugetregs(td);
fill_fpregs_xmm(&td->td_pcb->pcb_user_save, fpregs);
return (0);
}
/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
set_fpregs_xmm(fpregs, &td->td_pcb->pcb_user_save);
fpuuserinited(td);
return (0);
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct pcb *pcb;
struct trapframe *tp;
pcb = td->td_pcb;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->tf_rsp);
PROC_UNLOCK(curthread->td_proc);
mcp->mc_r15 = tp->tf_r15;
mcp->mc_r14 = tp->tf_r14;
mcp->mc_r13 = tp->tf_r13;
mcp->mc_r12 = tp->tf_r12;
mcp->mc_r11 = tp->tf_r11;
mcp->mc_r10 = tp->tf_r10;
mcp->mc_r9 = tp->tf_r9;
mcp->mc_r8 = tp->tf_r8;
mcp->mc_rdi = tp->tf_rdi;
mcp->mc_rsi = tp->tf_rsi;
mcp->mc_rbp = tp->tf_rbp;
mcp->mc_rbx = tp->tf_rbx;
mcp->mc_rcx = tp->tf_rcx;
mcp->mc_rflags = tp->tf_rflags;
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_rax = 0;
mcp->mc_rdx = 0;
mcp->mc_rflags &= ~PSL_C;
} else {
mcp->mc_rax = tp->tf_rax;
mcp->mc_rdx = tp->tf_rdx;
}
mcp->mc_rip = tp->tf_rip;
mcp->mc_cs = tp->tf_cs;
mcp->mc_rsp = tp->tf_rsp;
mcp->mc_ss = tp->tf_ss;
mcp->mc_ds = tp->tf_ds;
mcp->mc_es = tp->tf_es;
mcp->mc_fs = tp->tf_fs;
mcp->mc_gs = tp->tf_gs;
mcp->mc_flags = tp->tf_flags;
mcp->mc_len = sizeof(*mcp);
get_fpcontext(td, mcp);
mcp->mc_fsbase = pcb->pcb_fsbase;
mcp->mc_gsbase = pcb->pcb_gsbase;
bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct pcb *pcb;
struct trapframe *tp;
long rflags;
int ret;
pcb = td->td_pcb;
tp = td->td_frame;
if (mcp->mc_len != sizeof(*mcp) ||
(mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
return (EINVAL);
rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
(tp->tf_rflags & ~PSL_USERCHANGE);
ret = set_fpcontext(td, mcp);
if (ret != 0)
return (ret);
tp->tf_r15 = mcp->mc_r15;
tp->tf_r14 = mcp->mc_r14;
tp->tf_r13 = mcp->mc_r13;
tp->tf_r12 = mcp->mc_r12;
tp->tf_r11 = mcp->mc_r11;
tp->tf_r10 = mcp->mc_r10;
tp->tf_r9 = mcp->mc_r9;
tp->tf_r8 = mcp->mc_r8;
tp->tf_rdi = mcp->mc_rdi;
tp->tf_rsi = mcp->mc_rsi;
tp->tf_rbp = mcp->mc_rbp;
tp->tf_rbx = mcp->mc_rbx;
tp->tf_rdx = mcp->mc_rdx;
tp->tf_rcx = mcp->mc_rcx;
tp->tf_rax = mcp->mc_rax;
tp->tf_rip = mcp->mc_rip;
tp->tf_rflags = rflags;
tp->tf_rsp = mcp->mc_rsp;
tp->tf_ss = mcp->mc_ss;
tp->tf_flags = mcp->mc_flags;
if (tp->tf_flags & TF_HASSEGS) {
tp->tf_ds = mcp->mc_ds;
tp->tf_es = mcp->mc_es;
tp->tf_fs = mcp->mc_fs;
tp->tf_gs = mcp->mc_gs;
}
if (mcp->mc_flags & _MC_HASBASES) {
pcb->pcb_fsbase = mcp->mc_fsbase;
pcb->pcb_gsbase = mcp->mc_gsbase;
}
set_pcb_flags(pcb, PCB_FULL_IRET);
return (0);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
mcp->mc_ownedfp = fpugetregs(td);
bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
sizeof(mcp->mc_fpstate));
mcp->mc_fpformat = fpuformat();
}
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
struct savefpu *fpstate;
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
return (0);
else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
return (EINVAL);
else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
/* We don't care what state is left in the FPU or PCB. */
fpstate_drop(td);
else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
fpstate = (struct savefpu *)&mcp->mc_fpstate;
fpstate->sv_env.en_mxcsr &= cpu_mxcsr_mask;
fpusetregs(td, fpstate);
} else
return (EINVAL);
return (0);
}
void
fpstate_drop(struct thread *td)
{
KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
critical_enter();
if (PCPU_GET(fpcurthread) == td)
fpudrop();
/*
* XXX force a full drop of the fpu. The above only drops it if we
* owned it.
*
* XXX I don't much like fpugetuserregs()'s semantics of doing a full
* drop. Dropping only to the pcb matches fnsave's behaviour.
* We only need to drop to !PCB_INITDONE in sendsig(). But
* sendsig() is the only caller of fpugetuserregs()... perhaps we just
* have too many layers.
*/
clear_pcb_flags(curthread->td_pcb,
PCB_FPUINITDONE | PCB_USERFPUINITDONE);
critical_exit();
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
if (td == NULL) {
dbregs->dr[0] = rdr0();
dbregs->dr[1] = rdr1();
dbregs->dr[2] = rdr2();
dbregs->dr[3] = rdr3();
dbregs->dr[6] = rdr6();
dbregs->dr[7] = rdr7();
} else {
pcb = td->td_pcb;
dbregs->dr[0] = pcb->pcb_dr0;
dbregs->dr[1] = pcb->pcb_dr1;
dbregs->dr[2] = pcb->pcb_dr2;
dbregs->dr[3] = pcb->pcb_dr3;
dbregs->dr[6] = pcb->pcb_dr6;
dbregs->dr[7] = pcb->pcb_dr7;
}
dbregs->dr[4] = 0;
dbregs->dr[5] = 0;
dbregs->dr[8] = 0;
dbregs->dr[9] = 0;
dbregs->dr[10] = 0;
dbregs->dr[11] = 0;
dbregs->dr[12] = 0;
dbregs->dr[13] = 0;
dbregs->dr[14] = 0;
dbregs->dr[15] = 0;
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
int i;
if (td == NULL) {
load_dr0(dbregs->dr[0]);
load_dr1(dbregs->dr[1]);
load_dr2(dbregs->dr[2]);
load_dr3(dbregs->dr[3]);
load_dr6(dbregs->dr[6]);
load_dr7(dbregs->dr[7]);
} else {
/*
* Don't let an illegal value for dr7 get set. Specifically,
* check for undefined settings. Setting these bit patterns
* results in undefined behaviour and can lead to an unexpected
* TRCTRAP or a general protection fault right here.
* Upper bits of dr6 and dr7 must not be set.
*/
for (i = 0; i < 4; i++) {
if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
return (EINVAL);
if (td->td_frame->tf_cs == _ucode32sel &&
DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
return (EINVAL);
}
if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
(dbregs->dr[7] & 0xffffffff00000000ul) != 0)
return (EINVAL);
pcb = td->td_pcb;
/*
* Don't let a process set a breakpoint that is not within the
* process's address space. If a process could do this, it
* could halt the system by setting a breakpoint in the kernel
* (if ddb was enabled). Thus, we need to check to make sure
* that no breakpoints are being enabled for addresses outside
* process's address space.
*
* XXX - what about when the watched area of the user's
* address space is written into from within the kernel
* ... wouldn't that still cause a breakpoint to be generated
* from within kernel mode?
*/
if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
/* dr0 is enabled */
if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
/* dr1 is enabled */
if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
/* dr2 is enabled */
if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
/* dr3 is enabled */
if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
pcb->pcb_dr0 = dbregs->dr[0];
pcb->pcb_dr1 = dbregs->dr[1];
pcb->pcb_dr2 = dbregs->dr[2];
pcb->pcb_dr3 = dbregs->dr[3];
pcb->pcb_dr6 = dbregs->dr[6];
pcb->pcb_dr7 = dbregs->dr[7];
set_pcb_flags(pcb, PCB_DBREGS);
}
return (0);
}
void
reset_dbregs(void)
{
load_dr7(0); /* Turn off the control bits first */
load_dr0(0);
load_dr1(0);
load_dr2(0);
load_dr3(0);
load_dr6(0);
}
/*
* Return > 0 if a hardware breakpoint has been hit, and the
* breakpoint was in user space. Return 0, otherwise.
*/
int
user_dbreg_trap(void)
{
u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
u_int64_t bp; /* breakpoint bits extracted from dr6 */
int nbp; /* number of breakpoints that triggered */
caddr_t addr[4]; /* breakpoint addresses */
int i;
dr7 = rdr7();
if ((dr7 & 0x000000ff) == 0) {
/*
* all L and G enable bits in the dr7 register are zero,
* thus the trap couldn't have been caused by the
* hardware debug registers.
*/
return 0;
}
nbp = 0;
dr6 = rdr6();
bp = dr6 & 0x0000000f;
if (!bp) {
/*
* None of the breakpoint bits are set, meaning this
* trap was not caused by any of the debug registers.
*/
return 0;
}
/*
* At least one of the breakpoints was hit; check to see
* which ones, and whether any of them are user space addresses.
*/
if (bp & 0x01) {
addr[nbp++] = (caddr_t)rdr0();
}
if (bp & 0x02) {
addr[nbp++] = (caddr_t)rdr1();
}
if (bp & 0x04) {
addr[nbp++] = (caddr_t)rdr2();
}
if (bp & 0x08) {
addr[nbp++] = (caddr_t)rdr3();
}
for (i = 0; i < nbp; i++) {
if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
/*
* addr[i] is in user space
*/
return nbp;
}
}
/*
* None of the breakpoints are in user space.
*/
return 0;
}
#ifdef KDB
/*
* Provide inb() and outb() as functions. They are normally only available as
* inline functions and thus cannot be called from the debugger.
*/
/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);
u_char
inb_(u_short port)
{
return inb(port);
}
void
outb_(u_short port, u_char data)
{
outb(port, data);
}
#endif /* KDB */
Index: head/sys/amd64/linux32/linux32_machdep.c
===================================================================
--- head/sys/amd64/linux32/linux32_machdep.c (revision 225616)
+++ head/sys/amd64/linux32/linux32_machdep.c (revision 225617)
@@ -1,1086 +1,1086 @@
/*-
* Copyright (c) 2004 Tim J. Robbins
* Copyright (c) 2002 Doug Rabson
* Copyright (c) 2000 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/clock.h>
#include <sys/imgact.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include <machine/frame.h>
#include <machine/pcb.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <amd64/linux32/linux.h>
#include <amd64/linux32/linux32_proto.h>
#include <compat/linux/linux_ipc.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_emul.h>
struct l_old_select_argv {
l_int nfds;
l_uintptr_t readfds;
l_uintptr_t writefds;
l_uintptr_t exceptfds;
l_uintptr_t timeout;
} __packed;
int
linux_to_bsd_sigaltstack(int lsa)
{
int bsa = 0;
if (lsa & LINUX_SS_DISABLE)
bsa |= SS_DISABLE;
if (lsa & LINUX_SS_ONSTACK)
bsa |= SS_ONSTACK;
return (bsa);
}
static int linux_mmap_common(struct thread *td, l_uintptr_t addr,
l_size_t len, l_int prot, l_int flags, l_int fd,
l_loff_t pos);
int
bsd_to_linux_sigaltstack(int bsa)
{
int lsa = 0;
if (bsa & SS_DISABLE)
lsa |= LINUX_SS_DISABLE;
if (bsa & SS_ONSTACK)
lsa |= LINUX_SS_ONSTACK;
return (lsa);
}
static void
bsd_to_linux_rusage(struct rusage *ru, struct l_rusage *lru)
{
lru->ru_utime.tv_sec = ru->ru_utime.tv_sec;
lru->ru_utime.tv_usec = ru->ru_utime.tv_usec;
lru->ru_stime.tv_sec = ru->ru_stime.tv_sec;
lru->ru_stime.tv_usec = ru->ru_stime.tv_usec;
lru->ru_maxrss = ru->ru_maxrss;
lru->ru_ixrss = ru->ru_ixrss;
lru->ru_idrss = ru->ru_idrss;
lru->ru_isrss = ru->ru_isrss;
lru->ru_minflt = ru->ru_minflt;
lru->ru_majflt = ru->ru_majflt;
lru->ru_nswap = ru->ru_nswap;
lru->ru_inblock = ru->ru_inblock;
lru->ru_oublock = ru->ru_oublock;
lru->ru_msgsnd = ru->ru_msgsnd;
lru->ru_msgrcv = ru->ru_msgrcv;
lru->ru_nsignals = ru->ru_nsignals;
lru->ru_nvcsw = ru->ru_nvcsw;
lru->ru_nivcsw = ru->ru_nivcsw;
}
int
linux_execve(struct thread *td, struct linux_execve_args *args)
{
struct image_args eargs;
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(execve))
printf(ARGS(execve, "%s"), path);
#endif
error = freebsd32_exec_copyin_args(&eargs, path, UIO_SYSSPACE,
args->argp, args->envp);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
if (error == 0)
/* A Linux process can execute a FreeBSD one; do not attempt
* to create emuldata for such a process using
* linux_proc_init(), as this leads to a KASSERT panic
* because such a process has p->p_emuldata == NULL.
*/
if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX)
error = linux_proc_init(td, 0, 0);
return (error);
}
CTASSERT(sizeof(struct l_iovec32) == 8);
static int
linux32_copyinuio(struct l_iovec32 *iovp, l_ulong iovcnt, struct uio **uiop)
{
struct l_iovec32 iov32;
struct iovec *iov;
struct uio *uio;
uint32_t iovlen;
int error, i;
*uiop = NULL;
if (iovcnt > UIO_MAXIOV)
return (EINVAL);
iovlen = iovcnt * sizeof(struct iovec);
uio = malloc(iovlen + sizeof(*uio), M_IOV, M_WAITOK);
iov = (struct iovec *)(uio + 1);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp[i], &iov32, sizeof(struct l_iovec32));
if (error) {
free(uio, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
uio->uio_iov = iov;
uio->uio_iovcnt = iovcnt;
uio->uio_segflg = UIO_USERSPACE;
uio->uio_offset = -1;
uio->uio_resid = 0;
for (i = 0; i < iovcnt; i++) {
if (iov->iov_len > INT_MAX - uio->uio_resid) {
free(uio, M_IOV);
return (EINVAL);
}
uio->uio_resid += iov->iov_len;
iov++;
}
*uiop = uio;
return (0);
}
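/*
* Editor's note (worked example, not part of this change): the INT_MAX
* check above bounds the running total, not just each element. For
* example, two iovecs of 0x7fffffff bytes each individually fit in an
* int, but the second one fails
*     iov->iov_len > INT_MAX - uio->uio_resid
* because INT_MAX - 0x7fffffff == 0, so the request is rejected with
* EINVAL rather than letting the total transfer size exceed INT_MAX.
*/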
int
linux32_copyiniov(struct l_iovec32 *iovp32, l_ulong iovcnt, struct iovec **iovp,
int error)
{
struct l_iovec32 iov32;
struct iovec *iov;
uint32_t iovlen;
int i;
*iovp = NULL;
if (iovcnt > UIO_MAXIOV)
return (error);
iovlen = iovcnt * sizeof(struct iovec);
iov = malloc(iovlen, M_IOV, M_WAITOK);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp32[i], &iov32, sizeof(struct l_iovec32));
if (error) {
free(iov, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
*iovp = iov;
return(0);
}
int
linux_readv(struct thread *td, struct linux_readv_args *uap)
{
struct uio *auio;
int error;
error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_readv(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
linux_writev(struct thread *td, struct linux_writev_args *uap)
{
struct uio *auio;
int error;
error = linux32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_writev(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
struct l_ipc_kludge {
l_uintptr_t msgp;
l_long msgtyp;
} __packed;
int
linux_ipc(struct thread *td, struct linux_ipc_args *args)
{
switch (args->what & 0xFFFF) {
case LINUX_SEMOP: {
struct linux_semop_args a;
a.semid = args->arg1;
a.tsops = args->ptr;
a.nsops = args->arg2;
return (linux_semop(td, &a));
}
case LINUX_SEMGET: {
struct linux_semget_args a;
a.key = args->arg1;
a.nsems = args->arg2;
a.semflg = args->arg3;
return (linux_semget(td, &a));
}
case LINUX_SEMCTL: {
struct linux_semctl_args a;
int error;
a.semid = args->arg1;
a.semnum = args->arg2;
a.cmd = args->arg3;
error = copyin(args->ptr, &a.arg, sizeof(a.arg));
if (error)
return (error);
return (linux_semctl(td, &a));
}
case LINUX_MSGSND: {
struct linux_msgsnd_args a;
a.msqid = args->arg1;
a.msgp = args->ptr;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
return (linux_msgsnd(td, &a));
}
case LINUX_MSGRCV: {
struct linux_msgrcv_args a;
a.msqid = args->arg1;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
if ((args->what >> 16) == 0) {
struct l_ipc_kludge tmp;
int error;
if (args->ptr == 0)
return (EINVAL);
error = copyin(args->ptr, &tmp, sizeof(tmp));
if (error)
return (error);
a.msgp = PTRIN(tmp.msgp);
a.msgtyp = tmp.msgtyp;
} else {
a.msgp = args->ptr;
a.msgtyp = args->arg5;
}
return (linux_msgrcv(td, &a));
}
case LINUX_MSGGET: {
struct linux_msgget_args a;
a.key = args->arg1;
a.msgflg = args->arg2;
return (linux_msgget(td, &a));
}
case LINUX_MSGCTL: {
struct linux_msgctl_args a;
a.msqid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_msgctl(td, &a));
}
case LINUX_SHMAT: {
struct linux_shmat_args a;
a.shmid = args->arg1;
a.shmaddr = args->ptr;
a.shmflg = args->arg2;
a.raddr = PTRIN((l_uint)args->arg3);
return (linux_shmat(td, &a));
}
case LINUX_SHMDT: {
struct linux_shmdt_args a;
a.shmaddr = args->ptr;
return (linux_shmdt(td, &a));
}
case LINUX_SHMGET: {
struct linux_shmget_args a;
a.key = args->arg1;
a.size = args->arg2;
a.shmflg = args->arg3;
return (linux_shmget(td, &a));
}
case LINUX_SHMCTL: {
struct linux_shmctl_args a;
a.shmid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_shmctl(td, &a));
}
default:
break;
}
return (EINVAL);
}
int
linux_old_select(struct thread *td, struct linux_old_select_args *args)
{
struct l_old_select_argv linux_args;
struct linux_select_args newsel;
int error;
#ifdef DEBUG
if (ldebug(old_select))
printf(ARGS(old_select, "%p"), args->ptr);
#endif
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
newsel.nfds = linux_args.nfds;
newsel.readfds = PTRIN(linux_args.readfds);
newsel.writefds = PTRIN(linux_args.writefds);
newsel.exceptfds = PTRIN(linux_args.exceptfds);
newsel.timeout = PTRIN(linux_args.timeout);
return (linux_select(td, &newsel));
}
int
linux_set_cloned_tls(struct thread *td, void *desc)
{
struct user_segment_descriptor sd;
struct l_user_desc info;
struct pcb *pcb;
int error;
int a[2];
error = copyin(desc, &info, sizeof(struct l_user_desc));
if (error) {
printf(LMSG("copyin failed!"));
} else {
/* We might copy out the entry_number as GUGS32_SEL. */
info.entry_number = GUGS32_SEL;
error = copyout(&info, desc, sizeof(struct l_user_desc));
if (error)
printf(LMSG("copyout failed!"));
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(clone))
printf("Segment created in clone with "
"CLONE_SETTLS: lobase: %x, hibase: %x, "
"lolimit: %x, hilimit: %x, type: %i, "
"dpl: %i, p: %i, xx: %i, long: %i, "
"def32: %i, gran: %i\n", sd.sd_lobase,
sd.sd_hibase, sd.sd_lolimit, sd.sd_hilimit,
sd.sd_type, sd.sd_dpl, sd.sd_p, sd.sd_xx,
sd.sd_long, sd.sd_def32, sd.sd_gran);
#endif
pcb = td->td_pcb;
pcb->pcb_gsbase = (register_t)info.base_addr;
/* XXXKIB pcb->pcb_gs32sd = sd; */
td->td_frame->tf_gs = GSEL(GUGS32_SEL, SEL_UPL);
set_pcb_flags(pcb, PCB_GS32BIT | PCB_32BIT);
}
return (error);
}
int
linux_set_upcall_kse(struct thread *td, register_t stack)
{
td->td_frame->tf_rsp = stack;
return (0);
}
#define STACK_SIZE (2 * 1024 * 1024)
#define GUARD_SIZE (4 * PAGE_SIZE)
int
linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
{
#ifdef DEBUG
if (ldebug(mmap2))
printf(ARGS(mmap2, "0x%08x, %d, %d, 0x%08x, %d, %d"),
args->addr, args->len, args->prot,
args->flags, args->fd, args->pgoff);
#endif
return (linux_mmap_common(td, PTROUT(args->addr), args->len, args->prot,
args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
PAGE_SIZE));
}
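/*
* Editor's note (worked example, not part of this change): mmap2()
* takes the file offset in pages, so the double cast above widens the
* 32-bit pgoff before multiplying. With 4 KB pages, pgoff = 0x00100000
* corresponds to a byte offset of 0x00100000 * 4096 = 0x100000000
* (4 GiB), which a 32-bit multiplication would truncate to 0.
*/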
int
linux_mmap(struct thread *td, struct linux_mmap_args *args)
{
int error;
struct l_mmap_argv linux_args;
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(mmap))
printf(ARGS(mmap, "0x%08x, %d, %d, 0x%08x, %d, %d"),
linux_args.addr, linux_args.len, linux_args.prot,
linux_args.flags, linux_args.fd, linux_args.pgoff);
#endif
return (linux_mmap_common(td, linux_args.addr, linux_args.len,
linux_args.prot, linux_args.flags, linux_args.fd,
(uint32_t)linux_args.pgoff));
}
static int
linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
l_int flags, l_int fd, l_loff_t pos)
{
struct proc *p = td->td_proc;
struct mmap_args /* {
caddr_t addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
} */ bsd_args;
int error;
struct file *fp;
error = 0;
bsd_args.flags = 0;
fp = NULL;
/*
* Linux mmap(2):
* You must specify exactly one of MAP_SHARED and MAP_PRIVATE
*/
if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
return (EINVAL);
if (flags & LINUX_MAP_SHARED)
bsd_args.flags |= MAP_SHARED;
if (flags & LINUX_MAP_PRIVATE)
bsd_args.flags |= MAP_PRIVATE;
if (flags & LINUX_MAP_FIXED)
bsd_args.flags |= MAP_FIXED;
if (flags & LINUX_MAP_ANON) {
/* Enforce pos to be on page boundary, then ignore. */
if ((pos & PAGE_MASK) != 0)
return (EINVAL);
pos = 0;
bsd_args.flags |= MAP_ANON;
} else
bsd_args.flags |= MAP_NOSYNC;
if (flags & LINUX_MAP_GROWSDOWN)
bsd_args.flags |= MAP_STACK;
/*
* PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
* on Linux/i386. We do this to ensure maximum compatibility.
* Linux/ia64 does the same in i386 emulation mode.
*/
bsd_args.prot = prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
if (bsd_args.fd != -1) {
/*
* Linux follows Solaris mmap(2) description:
* The file descriptor fildes is opened with
* read permission, regardless of the
* protection options specified.
*/
if ((error = fget(td, bsd_args.fd, CAP_MMAP, &fp)) != 0)
return (error);
if (fp->f_type != DTYPE_VNODE) {
fdrop(fp, td);
return (EINVAL);
}
/* Linux mmap() just fails for O_WRONLY files */
if (!(fp->f_flag & FREAD)) {
fdrop(fp, td);
return (EACCES);
}
fdrop(fp, td);
}
if (flags & LINUX_MAP_GROWSDOWN) {
/*
* The Linux MAP_GROWSDOWN option does not limit auto
* growth of the region. Linux mmap with this option
* takes as addr the initial BOS, and as len, the initial
* region size. It can then grow down from addr without
* limit. However, Linux threads have an implicit internal
* limit to stack size of STACK_SIZE. It's just not
* enforced explicitly in Linux. But here we impose
* a limit of (STACK_SIZE - GUARD_SIZE) on the stack
* region, since we can do this with our mmap.
*
* Our mmap with MAP_STACK takes addr as the maximum
* downsize limit on BOS, and as len the max size of
* the region. It then maps the top SGROWSIZ bytes,
* and auto grows the region down, up to the limit
* in addr.
*
* If we don't use the MAP_STACK option, the effect
* of this code is to allocate a stack region of a
* fixed size of (STACK_SIZE - GUARD_SIZE).
*/
if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
/*
* Some Linux apps will attempt to mmap
* thread stacks near the top of their
* address space. If their TOS is greater
* than vm_maxsaddr, vm_map_growstack()
* will confuse the thread stack with the
* process stack and deliver a SEGV if they
* attempt to grow the thread stack past their
* current stacksize rlimit. To avoid this,
* adjust vm_maxsaddr upwards to reflect
* the current stacksize rlimit rather
* than the maximum possible stacksize.
* It would be better to adjust the
* mmap'ed region, but some apps do not check
* mmap's return value.
*/
PROC_LOCK(p);
p->p_vmspace->vm_maxsaddr = (char *)LINUX32_USRSTACK -
lim_cur(p, RLIMIT_STACK);
PROC_UNLOCK(p);
}
/*
* This gives us our maximum stack size and a new BOS.
* If we're using VM_STACK, then mmap will just map
* the top SGROWSIZ bytes, and let the stack grow down
* to the limit at BOS. If we're not using VM_STACK
* we map the full stack, since we don't have a way
* to autogrow it.
*/
if (len > STACK_SIZE - GUARD_SIZE) {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
} else {
bsd_args.addr = (caddr_t)PTRIN(addr) -
(STACK_SIZE - GUARD_SIZE - len);
bsd_args.len = STACK_SIZE - GUARD_SIZE;
}
} else {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
}
bsd_args.pos = pos;
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
__func__,
(void *)bsd_args.addr, (int)bsd_args.len, bsd_args.prot,
bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
#endif
- error = mmap(td, &bsd_args);
+ error = sys_mmap(td, &bsd_args);
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s() return: 0x%x (0x%08x)\n",
__func__, error, (u_int)td->td_retval[0]);
#endif
return (error);
}
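/*
* Editor's note (worked example, not part of this change): with
* STACK_SIZE = 2 MB and GUARD_SIZE = 4 pages (16 KB with 4 KB pages),
* a MAP_GROWSDOWN request of addr = A and len = 128 KB takes the
* "len <= STACK_SIZE - GUARD_SIZE" branch above and becomes
*     bsd_args.addr = A - (2 MB - 16 KB - 128 KB)
*     bsd_args.len  = 2 MB - 16 KB
* so the top 128 KB of the MAP_STACK region covers exactly the range
* the application asked for, while the stack may still auto-grow down
* toward the new, lower base address.
*/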
int
linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
{
struct mprotect_args bsd_args;
bsd_args.addr = uap->addr;
bsd_args.len = uap->len;
bsd_args.prot = uap->prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
- return (mprotect(td, &bsd_args));
+ return (sys_mprotect(td, &bsd_args));
}
int
linux_iopl(struct thread *td, struct linux_iopl_args *args)
{
int error;
if (args->level < 0 || args->level > 3)
return (EINVAL);
if ((error = priv_check(td, PRIV_IO)) != 0)
return (error);
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
td->td_frame->tf_rflags = (td->td_frame->tf_rflags & ~PSL_IOPL) |
(args->level * (PSL_IOPL / 3));
return (0);
}
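/*
* Editor's note (worked example, not part of this change): PSL_IOPL is
* the two-bit I/O privilege level field at bits 12-13 of the flags
* register (mask 0x3000), so PSL_IOPL / 3 == 0x1000 and the expression
* above stores args->level directly into that field; e.g. level 3
* yields 0x3000, allowing I/O instructions at user privilege.
*/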
int
linux_pipe(struct thread *td, struct linux_pipe_args *args)
{
int error;
int fildes[2];
#ifdef DEBUG
if (ldebug(pipe))
printf(ARGS(pipe, "*"));
#endif
error = kern_pipe(td, fildes);
if (error)
return (error);
/* XXX: Close descriptors on error. */
return (copyout(fildes, args->pipefds, sizeof fildes));
}
int
linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
{
l_osigaction_t osa;
l_sigaction_t act, oact;
int error;
#ifdef DEBUG
if (ldebug(sigaction))
printf(ARGS(sigaction, "%d, %p, %p"),
args->sig, (void *)args->nsa, (void *)args->osa);
#endif
if (args->nsa != NULL) {
error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
if (error)
return (error);
act.lsa_handler = osa.lsa_handler;
act.lsa_flags = osa.lsa_flags;
act.lsa_restorer = osa.lsa_restorer;
LINUX_SIGEMPTYSET(act.lsa_mask);
act.lsa_mask.__bits[0] = osa.lsa_mask;
}
error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
args->osa ? &oact : NULL);
if (args->osa != NULL && !error) {
osa.lsa_handler = oact.lsa_handler;
osa.lsa_flags = oact.lsa_flags;
osa.lsa_restorer = oact.lsa_restorer;
osa.lsa_mask = oact.lsa_mask.__bits[0];
error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
}
return (error);
}
/*
* Linux has two extra args, restart and oldmask. We don't use these,
* but it seems that "restart" is actually a context pointer that
* enables the signal to happen with a different register set.
*/
int
linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
{
sigset_t sigmask;
l_sigset_t mask;
#ifdef DEBUG
if (ldebug(sigsuspend))
printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
#endif
LINUX_SIGEMPTYSET(mask);
mask.__bits[0] = args->mask;
linux_to_bsd_sigset(&mask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
{
l_sigset_t lmask;
sigset_t sigmask;
int error;
#ifdef DEBUG
if (ldebug(rt_sigsuspend))
printf(ARGS(rt_sigsuspend, "%p, %d"),
(void *)uap->newset, uap->sigsetsize);
#endif
if (uap->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
if (error)
return (error);
linux_to_bsd_sigset(&lmask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_pause(struct thread *td, struct linux_pause_args *args)
{
struct proc *p = td->td_proc;
sigset_t sigmask;
#ifdef DEBUG
if (ldebug(pause))
printf(ARGS(pause, ""));
#endif
PROC_LOCK(p);
sigmask = td->td_sigmask;
PROC_UNLOCK(p);
return (kern_sigsuspend(td, sigmask));
}
int
linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
{
stack_t ss, oss;
l_stack_t lss;
int error;
#ifdef DEBUG
if (ldebug(sigaltstack))
printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
#endif
if (uap->uss != NULL) {
error = copyin(uap->uss, &lss, sizeof(l_stack_t));
if (error)
return (error);
ss.ss_sp = PTRIN(lss.ss_sp);
ss.ss_size = lss.ss_size;
ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
}
error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
(uap->uoss != NULL) ? &oss : NULL);
if (!error && uap->uoss != NULL) {
lss.ss_sp = PTROUT(oss.ss_sp);
lss.ss_size = oss.ss_size;
lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
}
return (error);
}
int
linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
{
struct ftruncate_args sa;
#ifdef DEBUG
if (ldebug(ftruncate64))
printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
(intmax_t)args->length);
#endif
sa.fd = args->fd;
sa.length = args->length;
- return ftruncate(td, &sa);
+ return sys_ftruncate(td, &sa);
}
int
linux_gettimeofday(struct thread *td, struct linux_gettimeofday_args *uap)
{
struct timeval atv;
l_timeval atv32;
struct timezone rtz;
int error = 0;
if (uap->tp) {
microtime(&atv);
atv32.tv_sec = atv.tv_sec;
atv32.tv_usec = atv.tv_usec;
error = copyout(&atv32, uap->tp, sizeof(atv32));
}
if (error == 0 && uap->tzp != NULL) {
rtz.tz_minuteswest = tz_minuteswest;
rtz.tz_dsttime = tz_dsttime;
error = copyout(&rtz, uap->tzp, sizeof(rtz));
}
return (error);
}
int
linux_settimeofday(struct thread *td, struct linux_settimeofday_args *uap)
{
l_timeval atv32;
struct timeval atv, *tvp;
struct timezone atz, *tzp;
int error;
if (uap->tp) {
error = copyin(uap->tp, &atv32, sizeof(atv32));
if (error)
return (error);
atv.tv_sec = atv32.tv_sec;
atv.tv_usec = atv32.tv_usec;
tvp = &atv;
} else
tvp = NULL;
if (uap->tzp) {
error = copyin(uap->tzp, &atz, sizeof(atz));
if (error)
return (error);
tzp = &atz;
} else
tzp = NULL;
return (kern_settimeofday(td, tvp, tzp));
}
int
linux_getrusage(struct thread *td, struct linux_getrusage_args *uap)
{
struct l_rusage s32;
struct rusage s;
int error;
error = kern_getrusage(td, uap->who, &s);
if (error != 0)
return (error);
if (uap->rusage != NULL) {
bsd_to_linux_rusage(&s, &s32);
error = copyout(&s32, uap->rusage, sizeof(s32));
}
return (error);
}
int
linux_sched_rr_get_interval(struct thread *td,
struct linux_sched_rr_get_interval_args *uap)
{
struct timespec ts;
struct l_timespec ts32;
int error;
error = kern_sched_rr_get_interval(td, uap->pid, &ts);
if (error != 0)
return (error);
ts32.tv_sec = ts.tv_sec;
ts32.tv_nsec = ts.tv_nsec;
return (copyout(&ts32, uap->interval, sizeof(ts32)));
}
int
linux_set_thread_area(struct thread *td,
struct linux_set_thread_area_args *args)
{
struct l_user_desc info;
struct user_segment_descriptor sd;
struct pcb *pcb;
int a[2];
int error;
error = copyin(args->desc, &info, sizeof(struct l_user_desc));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(set_thread_area))
printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, "
"%i, %i, %i"), info.entry_number, info.base_addr,
info.limit, info.seg_32bit, info.contents,
info.read_exec_only, info.limit_in_pages,
info.seg_not_present, info.useable);
#endif
/*
* Semantics of Linux version: every thread in the system has an array
* of three TLS descriptors: the 1st is GLIBC TLS, the 2nd is WINE, the 3rd unknown.
* This syscall loads one of the selected TLS descriptors with a value
* and also loads GDT descriptors 6, 7 and 8 with the content of
* the per-thread descriptors.
*
* Semantics of FreeBSD version: I think we can ignore that Linux has
* three per-thread descriptors and use just the first one.
* The tls_array[] is used only in [gs]et_thread_area() syscalls and
* for loading the GDT descriptors. We use just one GDT descriptor
* for TLS, so we will load just one.
*
* XXX: This doesn't work when a user space process tries to use more
* than one TLS segment. Comment in the Linux source says wine might
* do this.
*/
/*
* GLIBC reads the current %gs and calls set_thread_area() with it.
* We should let GUDATA_SEL and GUGS32_SEL proceed as well because
* we use these segments.
*/
switch (info.entry_number) {
case GUGS32_SEL:
case GUDATA_SEL:
case 6:
case -1:
info.entry_number = GUGS32_SEL;
break;
default:
return (EINVAL);
}
/*
* We have to copy out the GDT entry we use.
*
* XXX: What if a user space program does not check the return value
* and tries to use 6, 7 or 8?
*/
error = copyout(&info, args->desc, sizeof(struct l_user_desc));
if (error)
return (error);
if (LINUX_LDT_empty(&info)) {
a[0] = 0;
a[1] = 0;
} else {
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
}
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(set_thread_area))
printf("Segment created in set_thread_area: "
"lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, "
"type: %i, dpl: %i, p: %i, xx: %i, long: %i, "
"def32: %i, gran: %i\n",
sd.sd_lobase,
sd.sd_hibase,
sd.sd_lolimit,
sd.sd_hilimit,
sd.sd_type,
sd.sd_dpl,
sd.sd_p,
sd.sd_xx,
sd.sd_long,
sd.sd_def32,
sd.sd_gran);
#endif
pcb = td->td_pcb;
pcb->pcb_gsbase = (register_t)info.base_addr;
set_pcb_flags(pcb, PCB_32BIT | PCB_GS32BIT);
update_gdt_gsbase(td, info.base_addr);
return (0);
}
int
linux_wait4(struct thread *td, struct linux_wait4_args *args)
{
int error, options;
struct rusage ru, *rup;
struct l_rusage lru;
#ifdef DEBUG
if (ldebug(wait4))
printf(ARGS(wait4, "%d, %p, %d, %p"),
args->pid, (void *)args->status, args->options,
(void *)args->rusage);
#endif
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but we make sure */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
if (args->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = linux_common_wait(td, args->pid, args->status, options, rup);
if (error)
return (error);
if (args->rusage != NULL) {
bsd_to_linux_rusage(rup, &lru);
error = copyout(&lru, args->rusage, sizeof(lru));
}
return (error);
}
Index: head/sys/arm/arm/machdep.c
===================================================================
--- head/sys/arm/arm/machdep.c (revision 225616)
+++ head/sys/arm/arm/machdep.c (revision 225617)
@@ -1,709 +1,709 @@
/* $NetBSD: arm32_machdep.c,v 1.44 2004/03/24 15:34:47 atatat Exp $ */
/*-
* Copyright (c) 2004 Olivier Houchard
* Copyright (c) 1994-1998 Mark Brinicombe.
* Copyright (c) 1994 Brini.
* All rights reserved.
*
* This code is derived from software written for Brini by Mark Brinicombe
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Mark Brinicombe
* for the NetBSD Project.
* 4. The name of the company nor the name of the author may be used to
* endorse or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Machine dependent functions for kernel setup
*
* Created : 17/09/94
* Updated : 18/04/01 updated for new wscons
*/
#include "opt_compat.h"
#include "opt_ddb.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <machine/armreg.h>
#include <machine/cpu.h>
#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/pcb.h>
#include <machine/pmap.h>
#include <machine/reg.h>
#include <machine/trap.h>
#include <machine/undefined.h>
#include <machine/vmparam.h>
#include <machine/sysarch.h>
uint32_t cpu_reset_address = 0;
int cold = 1;
vm_offset_t vector_page;
long realmem = 0;
int (*_arm_memcpy)(void *, void *, int, int) = NULL;
int (*_arm_bzero)(void *, int, int) = NULL;
int _min_memcpy_size = 0;
int _min_bzero_size = 0;
extern int *end;
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
void
sendsig(catcher, ksi, mask)
sig_t catcher;
ksiginfo_t *ksi;
sigset_t *mask;
{
struct thread *td;
struct proc *p;
struct trapframe *tf;
struct sigframe *fp, frame;
struct sigacts *psp;
int onstack;
int sig;
int code;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
code = ksi->ksi_code;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
onstack = sigonstack(tf->tf_usr_sp);
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/* Allocate and validate space for the signal handler context. */
if ((td->td_flags & TDP_ALTSTACK) != 0 && !(onstack) &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct sigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
fp = (struct sigframe *)td->td_frame->tf_usr_sp;
/* make room on the stack */
fp--;
/* make the stack aligned */
fp = (struct sigframe *)STACKALIGN(fp);
/* Populate the siginfo frame. */
get_mcontext(td, &frame.sf_uc.uc_mcontext, 0);
frame.sf_si = ksi->ksi_info;
frame.sf_uc.uc_sigmask = *mask;
frame.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK )
? ((onstack) ? SS_ONSTACK : 0) : SS_DISABLE;
frame.sf_uc.uc_stack = td->td_sigstk;
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(td->td_proc);
/* Copy the sigframe out to the user's stack. */
if (copyout(&frame, fp, sizeof(*fp)) != 0) {
/* Process has trashed its stack. Kill it. */
CTR2(KTR_SIG, "sendsig: sigexit td=%p fp=%p", td, fp);
PROC_LOCK(p);
sigexit(td, SIGILL);
}
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/*
* Build context to run handler in. We invoke the handler
* directly, only returning via the trampoline. Note the
* trampoline version numbers are coordinated with machine-
* dependent code in libc.
*/
tf->tf_r0 = sig;
tf->tf_r1 = (register_t)&fp->sf_si;
tf->tf_r2 = (register_t)&fp->sf_uc;
/* the trampoline uses r5 as the uc address */
tf->tf_r5 = (register_t)&fp->sf_uc;
tf->tf_pc = (register_t)catcher;
tf->tf_usr_sp = (register_t)fp;
tf->tf_usr_lr = (register_t)(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td, tf->tf_usr_lr,
tf->tf_usr_sp);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
struct kva_md_info kmi;
/*
* arm32_vector_init:
*
* Initialize the vector page, and select whether or not to
* relocate the vectors.
*
* NOTE: We expect the vector page to be mapped at its expected
* destination.
*/
extern unsigned int page0[], page0_data[];
void
arm_vector_init(vm_offset_t va, int which)
{
unsigned int *vectors = (int *) va;
unsigned int *vectors_data = vectors + (page0_data - page0);
int vec;
/*
* Loop through the vectors we're taking over, and copy the
* vector's insn and data word.
*/
for (vec = 0; vec < ARM_NVEC; vec++) {
if ((which & (1 << vec)) == 0) {
/* Don't want to take over this vector. */
continue;
}
vectors[vec] = page0[vec];
vectors_data[vec] = page0_data[vec];
}
/* Now sync the vectors. */
cpu_icache_sync_range(va, (ARM_NVEC * 2) * sizeof(u_int));
vector_page = va;
if (va == ARM_VECTORS_HIGH) {
/*
* Assume the MD caller knows what it's doing here, and
* really does want the vector page relocated.
*
* Note: This has to be done here (and not just in
* cpu_setup()) because the vector page needs to be
* accessible *before* cpu_startup() is called.
* Think ddb(9) ...
*
* NOTE: If the CPU control register is not readable,
* this will totally fail! We'll just assume that
* any system that has high vector support has a
* readable CPU control register, for now. If we
* ever encounter one that does not, we'll have to
* rethink this.
*/
cpu_control(CPU_CONTROL_VECRELOC, CPU_CONTROL_VECRELOC);
}
}
static void
cpu_startup(void *dummy)
{
struct pcb *pcb = thread0.td_pcb;
#ifndef ARM_CACHE_LOCK_ENABLE
vm_page_t m;
#endif
cpu_setup("");
identify_arm_cpu();
printf("real memory = %ju (%ju MB)\n", (uintmax_t)ptoa(physmem),
(uintmax_t)ptoa(physmem) / 1048576);
realmem = physmem;
/*
* Display the RAM layout.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf("%#08jx - %#08jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
(uintmax_t)ptoa(cnt.v_free_count),
(uintmax_t)ptoa(cnt.v_free_count) / 1048576);
bufinit();
vm_pager_bufferinit();
pcb->un_32.pcb32_und_sp = (u_int)thread0.td_kstack +
USPACE_UNDEF_STACK_TOP;
pcb->un_32.pcb32_sp = (u_int)thread0.td_kstack +
USPACE_SVC_STACK_TOP;
vector_page_setprot(VM_PROT_READ);
pmap_set_pcb_pagedir(pmap_kernel(), pcb);
pmap_postinit();
#ifdef ARM_CACHE_LOCK_ENABLE
pmap_kenter_user(ARM_TP_ADDRESS, ARM_TP_ADDRESS);
arm_lock_cache_line(ARM_TP_ADDRESS);
#else
m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_ZERO);
pmap_kenter_user(ARM_TP_ADDRESS, VM_PAGE_TO_PHYS(m));
#endif
}
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
cpu_dcache_wb_range((uintptr_t)ptr, len);
cpu_l2cache_wb_range((uintptr_t)ptr, len);
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
return (ENXIO);
}
void
cpu_idle(int busy)
{
cpu_sleep(0);
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf = td->td_frame;
bcopy(&tf->tf_r0, regs->r, sizeof(regs->r));
regs->r_sp = tf->tf_usr_sp;
regs->r_lr = tf->tf_usr_lr;
regs->r_pc = tf->tf_pc;
regs->r_cpsr = tf->tf_spsr;
return (0);
}
int
fill_fpregs(struct thread *td, struct fpreg *regs)
{
bzero(regs, sizeof(*regs));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf = td->td_frame;
bcopy(regs->r, &tf->tf_r0, sizeof(regs->r));
tf->tf_usr_sp = regs->r_sp;
tf->tf_usr_lr = regs->r_lr;
tf->tf_pc = regs->r_pc;
tf->tf_spsr &= ~PSR_FLAGS;
tf->tf_spsr |= regs->r_cpsr & PSR_FLAGS;
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *regs)
{
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *regs)
{
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *regs)
{
return (0);
}
static int
ptrace_read_int(struct thread *td, vm_offset_t addr, u_int32_t *v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) v;
iov.iov_len = sizeof(u_int32_t);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(u_int32_t);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
static int
ptrace_write_int(struct thread *td, vm_offset_t addr, u_int32_t v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) &v;
iov.iov_len = sizeof(u_int32_t);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(u_int32_t);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
int
ptrace_single_step(struct thread *td)
{
struct proc *p;
int error;
KASSERT(td->td_md.md_ptrace_instr == 0,
("Didn't clear single step"));
p = td->td_proc;
PROC_UNLOCK(p);
error = ptrace_read_int(td, td->td_frame->tf_pc + 4,
&td->td_md.md_ptrace_instr);
if (error)
goto out;
error = ptrace_write_int(td, td->td_frame->tf_pc + 4,
PTRACE_BREAKPOINT);
if (error)
td->td_md.md_ptrace_instr = 0;
td->td_md.md_ptrace_addr = td->td_frame->tf_pc + 4;
out:
PROC_LOCK(p);
return (error);
}
int
ptrace_clear_single_step(struct thread *td)
{
struct proc *p;
if (td->td_md.md_ptrace_instr) {
p = td->td_proc;
PROC_UNLOCK(p);
ptrace_write_int(td, td->td_md.md_ptrace_addr,
td->td_md.md_ptrace_instr);
PROC_LOCK(p);
td->td_md.md_ptrace_instr = 0;
}
return (0);
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
td->td_frame->tf_pc = addr;
return (0);
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
}
void
spinlock_enter(void)
{
struct thread *td;
register_t cspr;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
cspr = disable_interrupts(I32_bit | F32_bit);
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_cspr = cspr;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t cspr;
td = curthread;
critical_exit();
cspr = td->td_md.md_saved_cspr;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
restore_interrupts(cspr);
}
/*
* Clear registers on exec
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf = td->td_frame;
memset(tf, 0, sizeof(*tf));
tf->tf_usr_sp = stack;
tf->tf_usr_lr = imgp->entry_addr;
tf->tf_svc_lr = 0x77777777;
tf->tf_pc = imgp->entry_addr;
tf->tf_spsr = PSR_USR32_MODE;
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int clear_ret)
{
struct trapframe *tf = td->td_frame;
__greg_t *gr = mcp->__gregs;
if (clear_ret & GET_MC_CLEAR_RET)
gr[_REG_R0] = 0;
else
gr[_REG_R0] = tf->tf_r0;
gr[_REG_R1] = tf->tf_r1;
gr[_REG_R2] = tf->tf_r2;
gr[_REG_R3] = tf->tf_r3;
gr[_REG_R4] = tf->tf_r4;
gr[_REG_R5] = tf->tf_r5;
gr[_REG_R6] = tf->tf_r6;
gr[_REG_R7] = tf->tf_r7;
gr[_REG_R8] = tf->tf_r8;
gr[_REG_R9] = tf->tf_r9;
gr[_REG_R10] = tf->tf_r10;
gr[_REG_R11] = tf->tf_r11;
gr[_REG_R12] = tf->tf_r12;
gr[_REG_SP] = tf->tf_usr_sp;
gr[_REG_LR] = tf->tf_usr_lr;
gr[_REG_PC] = tf->tf_pc;
gr[_REG_CPSR] = tf->tf_spsr;
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tf = td->td_frame;
const __greg_t *gr = mcp->__gregs;
tf->tf_r0 = gr[_REG_R0];
tf->tf_r1 = gr[_REG_R1];
tf->tf_r2 = gr[_REG_R2];
tf->tf_r3 = gr[_REG_R3];
tf->tf_r4 = gr[_REG_R4];
tf->tf_r5 = gr[_REG_R5];
tf->tf_r6 = gr[_REG_R6];
tf->tf_r7 = gr[_REG_R7];
tf->tf_r8 = gr[_REG_R8];
tf->tf_r9 = gr[_REG_R9];
tf->tf_r10 = gr[_REG_R10];
tf->tf_r11 = gr[_REG_R11];
tf->tf_r12 = gr[_REG_R12];
tf->tf_usr_sp = gr[_REG_SP];
tf->tf_usr_lr = gr[_REG_LR];
tf->tf_pc = gr[_REG_PC];
tf->tf_spsr = gr[_REG_CPSR];
return (0);
}
/*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
struct sigframe sf;
struct trapframe *tf;
int spsr;
if (uap == NULL)
return (EFAULT);
if (copyin(uap->sigcntxp, &sf, sizeof(sf)))
return (EFAULT);
/*
* Make sure the processor mode has not been tampered with and
* interrupts have not been disabled.
*/
spsr = sf.sf_uc.uc_mcontext.__gregs[_REG_CPSR];
if ((spsr & PSR_MODE) != PSR_USR32_MODE ||
(spsr & (I32_bit | F32_bit)) != 0)
return (EINVAL);
/* Restore register context. */
tf = td->td_frame;
set_mcontext(td, &sf.sf_uc.uc_mcontext);
/* Restore signal mask. */
kern_sigprocmask(td, SIG_SETMASK, &sf.sf_uc.uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->un_32.pcb32_r8 = tf->tf_r8;
pcb->un_32.pcb32_r9 = tf->tf_r9;
pcb->un_32.pcb32_r10 = tf->tf_r10;
pcb->un_32.pcb32_r11 = tf->tf_r11;
pcb->un_32.pcb32_r12 = tf->tf_r12;
pcb->un_32.pcb32_pc = tf->tf_pc;
pcb->un_32.pcb32_lr = tf->tf_usr_lr;
pcb->un_32.pcb32_sp = tf->tf_usr_sp;
}
/*
* Fake up a boot descriptor table
*/
vm_offset_t
fake_preload_metadata(void)
{
#ifdef DDB
vm_offset_t zstart = 0, zend = 0;
#endif
vm_offset_t lastaddr;
int i = 0;
static uint32_t fake_preload[35];
fake_preload[i++] = MODINFO_NAME;
fake_preload[i++] = strlen("elf kernel") + 1;
strcpy((char*)&fake_preload[i++], "elf kernel");
i += 2;
fake_preload[i++] = MODINFO_TYPE;
fake_preload[i++] = strlen("elf kernel") + 1;
strcpy((char*)&fake_preload[i++], "elf kernel");
i += 2;
fake_preload[i++] = MODINFO_ADDR;
fake_preload[i++] = sizeof(vm_offset_t);
fake_preload[i++] = KERNVIRTADDR;
fake_preload[i++] = MODINFO_SIZE;
fake_preload[i++] = sizeof(uint32_t);
fake_preload[i++] = (uint32_t)&end - KERNVIRTADDR;
#ifdef DDB
if (*(uint32_t *)KERNVIRTADDR == MAGIC_TRAMP_NUMBER) {
fake_preload[i++] = MODINFO_METADATA|MODINFOMD_SSYM;
fake_preload[i++] = sizeof(vm_offset_t);
fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 4);
fake_preload[i++] = MODINFO_METADATA|MODINFOMD_ESYM;
fake_preload[i++] = sizeof(vm_offset_t);
fake_preload[i++] = *(uint32_t *)(KERNVIRTADDR + 8);
lastaddr = *(uint32_t *)(KERNVIRTADDR + 8);
zend = lastaddr;
zstart = *(uint32_t *)(KERNVIRTADDR + 4);
ksym_start = zstart;
ksym_end = zend;
} else
#endif
lastaddr = (vm_offset_t)&end;
fake_preload[i++] = 0;
fake_preload[i] = 0;
preload_metadata = (void *)fake_preload;
return (lastaddr);
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c (revision 225616)
+++ head/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c (revision 225617)
@@ -1,16518 +1,16518 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*
* $FreeBSD$
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
/*
* DTrace - Dynamic Tracing for Solaris
*
* This is the implementation of the Solaris Dynamic Tracing framework
* (DTrace). The user-visible interface to DTrace is described at length in
* the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
* library, the in-kernel DTrace framework, and the DTrace providers are
* described in the block comments in the <sys/dtrace.h> header file. The
* internal architecture of DTrace is described in the block comments in the
* <sys/dtrace_impl.h> header file. The comments contained within the DTrace
* implementation very much assume mastery of all of these sources; if one has
* an unanswered question about the implementation, one should consult them
* first.
*
* The functions here are ordered roughly as follows:
*
* - Probe context functions
* - Probe hashing functions
* - Non-probe context utility functions
* - Matching functions
* - Provider-to-Framework API functions
* - Probe management functions
* - DIF object functions
* - Format functions
* - Predicate functions
* - ECB functions
* - Buffer functions
* - Enabling functions
* - DOF functions
* - Anonymous enabling functions
* - Consumer state functions
* - Helper functions
* - Hook functions
* - Driver cookbook functions
*
* Each group of functions begins with a block comment labelled the "DTrace
* [Group] Functions", allowing one to find each block by searching forward
* on capital-f functions.
*/
#include <sys/errno.h>
#if !defined(sun)
#include <sys/time.h>
#endif
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
#if defined(sun)
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
#if defined(sun)
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
#if defined(sun)
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
#if defined(sun)
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
#if defined(sun)
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
#if defined(sun)
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
/* FreeBSD includes: */
#if !defined(sun)
#include <sys/callout.h>
#include <sys/ctype.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sx.h>
#include <sys/dtrace_bsd.h>
#include <netinet/in.h>
#include "dtrace_cddl.h"
#include "dtrace_debug.c"
#endif
/*
* DTrace Tunable Variables
*
* The following variables may be tuned by adding a line to /etc/system that
* includes both the name of the DTrace module ("dtrace") and the name of the
* variable. For example:
*
* set dtrace:dtrace_destructive_disallow = 1
*
* In general, the only variables that one should be tuning this way are those
* that affect system-wide DTrace behavior, and for which the default behavior
* is undesirable. Most of these variables are tunable on a per-consumer
* basis using DTrace options, and need not be tuned on a system-wide basis.
* When tuning these variables, avoid pathological values; while some attempt
* is made to verify the integrity of these variables, they are not considered
* part of the supported interface to DTrace, and they are therefore not
* checked comprehensively. Further, these variables should not be tuned
* dynamically via "mdb -kw" or other means; they should only be tuned via
* /etc/system.
*/
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
size_t dtrace_global_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 32;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_cleanrate_default = 9900990; /* 101 hz */
dtrace_optval_t dtrace_cleanrate_min = 200000; /* 5000 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024;
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
hrtime_t dtrace_deadman_interval = NANOSEC;
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
/*
* DTrace External Variables
*
* As dtrace(7D) is a kernel module, any DTrace variables are obviously
* available to DTrace consumers via the backtick (`) syntax. One of these,
* dtrace_zero, is made deliberately so: it is provided as a source of
* well-known, zero-filled memory. While this variable is not documented,
* it is used by some translators as an implementation detail.
*/
const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
/*
* DTrace Internal Variables
*/
#if defined(sun)
static dev_info_t *dtrace_devi; /* device info */
#endif
#if defined(sun)
static vmem_t *dtrace_arena; /* probe ID arena */
static vmem_t *dtrace_minor; /* minor number arena */
static taskq_t *dtrace_taskq; /* task queue */
#else
static struct unrhdr *dtrace_arena; /* Probe ID number. */
#endif
static dtrace_probe_t **dtrace_probes; /* array of all probes */
static int dtrace_nprobes; /* number of probes */
static dtrace_provider_t *dtrace_provider; /* provider list */
static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
static int dtrace_opens; /* number of opens */
static int dtrace_helpers; /* number of helpers */
#if defined(sun)
static void *dtrace_softstate; /* softstate pointer */
#endif
static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
static int dtrace_toxranges; /* number of toxic ranges */
static int dtrace_toxranges_max; /* size of toxic range array */
static dtrace_anon_t dtrace_anon; /* anonymous enabling */
static kmem_cache_t *dtrace_state_cache; /* cache for dynamic state */
static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
static kthread_t *dtrace_panicked; /* panicking thread */
static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
#if !defined(sun)
static struct mtx dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int dtrace_in_probe; /* non-zero if executing a probe */
#if defined(__i386__) || defined(__amd64__)
uintptr_t dtrace_in_probe_addr; /* Address of invop when already in probe */
#endif
#endif
/*
* DTrace Locking
* DTrace is protected by three (relatively coarse-grained) locks:
*
* (1) dtrace_lock is required to manipulate essentially any DTrace state,
* including enabling state, probes, ECBs, consumer state, helper state,
* etc. Importantly, dtrace_lock is _not_ required when in probe context;
* probe context is lock-free -- synchronization is handled via the
* dtrace_sync() cross call mechanism.
*
* (2) dtrace_provider_lock is required when manipulating provider state, or
* when provider state must be held constant.
*
* (3) dtrace_meta_lock is required when manipulating meta provider state, or
* when meta provider state must be held constant.
*
* The lock ordering between these three locks is dtrace_meta_lock before
* dtrace_provider_lock before dtrace_lock. (In particular, there are
* several places where dtrace_provider_lock is held by the framework as it
* calls into the providers -- which then call back into the framework,
* grabbing dtrace_lock.)
*
* There are two other locks in the mix: mod_lock and cpu_lock. With respect
* to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
* role as a coarse-grained lock; it is acquired before both of these locks.
* With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
* be acquired _between_ dtrace_meta_lock and any other DTrace locks.
* mod_lock is similar with respect to dtrace_provider_lock in that it must be
* acquired _between_ dtrace_provider_lock and dtrace_lock.
*/
static kmutex_t dtrace_lock; /* probe state lock */
static kmutex_t dtrace_provider_lock; /* provider state lock */
static kmutex_t dtrace_meta_lock; /* meta-provider state lock */
#if !defined(sun)
/* XXX FreeBSD hacks. */
static kmutex_t mod_lock;
#define cr_suid cr_svuid
#define cr_sgid cr_svgid
#define ipaddr_t in_addr_t
#define mod_modname pathname
#define vuprintf vprintf
#define ttoproc(_a) ((_a)->td_proc)
#define crgetzoneid(_a) 0
#define NCPU MAXCPU
#define SNOCD 0
#define CPU_ON_INTR(_a) 0
#define PRIV_EFFECTIVE (1 << 0)
#define PRIV_DTRACE_KERNEL (1 << 1)
#define PRIV_DTRACE_PROC (1 << 2)
#define PRIV_DTRACE_USER (1 << 3)
#define PRIV_PROC_OWNER (1 << 4)
#define PRIV_PROC_ZONE (1 << 5)
#define PRIV_ALL ~0
SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information");
#endif
#if defined(sun)
#define curcpu CPU->cpu_id
#endif
/*
* DTrace Provider Variables
*
* These are the variables relating to DTrace as a provider (that is, the
* provider of the BEGIN, END, and ERROR probes).
*/
static dtrace_pattr_t dtrace_provider_attr = {
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
};
static void
dtrace_nullop(void)
{}
static dtrace_pops_t dtrace_provider_ops = {
(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
(void (*)(void *, modctl_t *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
NULL,
NULL,
NULL,
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop
};
static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
static dtrace_id_t dtrace_probeid_end; /* special END probe */
dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
/*
* DTrace Helper Tracing Variables
*/
uint32_t dtrace_helptrace_next = 0;
uint32_t dtrace_helptrace_nlocals;
char *dtrace_helptrace_buffer;
int dtrace_helptrace_bufsize = 512 * 1024;
#ifdef DEBUG
int dtrace_helptrace_enabled = 1;
#else
int dtrace_helptrace_enabled = 0;
#endif
/*
* DTrace Error Hashing
*
* On DEBUG kernels, DTrace will track the errors that it has seen in a hash
* table. This is very useful for checking coverage of tests that are
* expected to induce DIF or DOF processing errors, and may be useful for
* debugging problems in the DIF code generator or in DOF generation. The
* error hash may be examined with the ::dtrace_errhash MDB dcmd.
*/
#ifdef DEBUG
static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
static const char *dtrace_errlast;
static kthread_t *dtrace_errthread;
static kmutex_t dtrace_errlock;
#endif
/*
* DTrace Macros and Constants
*
* These are various macros that are useful in various spots in the
* implementation, along with a few random constants that have no meaning
* outside of the implementation. There is no real structure to this cpp
* mishmash -- but is there ever?
*/
#define DTRACE_HASHSTR(hash, probe) \
dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
#define DTRACE_HASHNEXT(hash, probe) \
(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
#define DTRACE_HASHPREV(hash, probe) \
(dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
#define DTRACE_HASHEQ(hash, lhs, rhs) \
(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
*((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
#define DTRACE_AGGHASHSIZE_SLEW 17
#define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
/*
* The key for a thread-local variable consists of the lower 61 bits of the
* t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
* We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
* equal to a variable identifier. This is necessary (but not sufficient) to
* assure that global associative arrays never collide with thread-local
* variables. To guarantee that they cannot collide, we must also define the
* order for keying dynamic variables. That order is:
*
* [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
*
* Because the variable-key and the tls-key are in orthogonal spaces, there is
* no way for a global variable key signature to match a thread-local key
* signature.
*/
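/*
 * Editorial illustration (not part of the original source): per the macros
 * below, the resulting thread key packs into a single 64-bit word as
 *
 *   63           61 60                                              0
 *  +---------------+------------------------------------------------+
 *  | intr (3 bits) |  (thread id + DIF_VARIABLE_MAX) & (2^61 - 1)   |
 *  +---------------+------------------------------------------------+
 *
 * The value component is always offset by DIF_VARIABLE_MAX, which is what
 * keeps a thread key from ever equaling a variable identifier used to key
 * a global associative array.
 */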
#if defined(sun)
#define DTRACE_TLS_THRKEY(where) { \
uint_t intr = 0; \
uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
for (; actv; actv >>= 1) \
intr++; \
ASSERT(intr < (1 << 3)); \
(where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
(((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#else
#define DTRACE_TLS_THRKEY(where) { \
solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
uint_t intr = 0; \
uint_t actv = _c->cpu_intr_actv; \
for (; actv; actv >>= 1) \
intr++; \
ASSERT(intr < (1 << 3)); \
(where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
(((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
}
#endif
#define DT_BSWAP_8(x) ((x) & 0xff)
#define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
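/*
 * Editorial worked example (not part of the original source): these macros
 * compose the byte swap from the bottom up, so DT_BSWAP_16(0x1234) yields
 * 0x3412 and DT_BSWAP_32(0x12345678) yields 0x78563412.
 */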
#define DT_MASK_LO 0x00000000FFFFFFFFULL
#define DTRACE_STORE(type, tomax, offset, what) \
*((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
#ifndef __i386
#define DTRACE_ALIGNCHECK(addr, size, flags) \
if (addr & (size - 1)) { \
*flags |= CPU_DTRACE_BADALIGN; \
cpu_core[curcpu].cpuc_dtrace_illval = addr; \
return (0); \
}
#else
#define DTRACE_ALIGNCHECK(addr, size, flags)
#endif
/*
* Test whether a range of memory starting at testaddr of size testsz falls
* within the range of memory described by addr, sz. We take care to avoid
* problems with overflow and underflow of the unsigned quantities, and
* disallow all negative sizes. Ranges of size 0 are allowed.
*/
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
((testaddr) - (baseaddr) < (basesz) && \
(testaddr) + (testsz) - (baseaddr) <= (basesz) && \
(testaddr) + (testsz) >= (testaddr))
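/*
 * Editorial worked example (not part of the original source): with
 * baseaddr = 0x1000 and basesz = 0x100, a testaddr of 0xff0 makes
 * (testaddr - baseaddr) underflow to a huge unsigned value and fail the
 * first comparison; a testaddr of 0x10f0 with testsz 0x20 passes the first
 * comparison but fails the second (0x110 <= 0x100 is false) because the
 * range would run past the end; and the third comparison rejects a testsz
 * so large that (testaddr + testsz) wraps around the address space.
 */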
/*
* Test whether alloc_sz bytes will fit in the scratch region. We isolate
* alloc_sz on the righthand side of the comparison in order to avoid overflow
* or underflow in the comparison with it. This is simpler than the INRANGE
* check above, because we know that the dtms_scratch_ptr is valid in the
* range. Allocations of size zero are allowed.
*/
#define DTRACE_INSCRATCH(mstate, alloc_sz) \
((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
(mstate)->dtms_scratch_ptr >= (alloc_sz))
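/*
 * Editorial note (not part of the original source): the left-hand side is
 * simply the number of scratch bytes remaining, which is known to be
 * non-negative because dtms_scratch_ptr always lies within the scratch
 * region.  Writing the test as "remaining >= alloc_sz" rather than
 * "scratch_ptr + alloc_sz <= base + size" is what avoids overflow: even a
 * pathologically large alloc_sz cannot wrap the comparison.
 */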
#define DTRACE_LOADFUNC(bits) \
/*CSTYLED*/ \
uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
size_t size = bits / NBBY; \
/*CSTYLED*/ \
uint##bits##_t rval; \
int i; \
volatile uint16_t *flags = (volatile uint16_t *) \
&cpu_core[curcpu].cpuc_dtrace_flags; \
\
DTRACE_ALIGNCHECK(addr, size, flags); \
\
for (i = 0; i < dtrace_toxranges; i++) { \
if (addr >= dtrace_toxrange[i].dtt_limit) \
continue; \
\
if (addr + size <= dtrace_toxrange[i].dtt_base) \
continue; \
\
/* \
* This address falls within a toxic region; return 0. \
*/ \
*flags |= CPU_DTRACE_BADADDR; \
cpu_core[curcpu].cpuc_dtrace_illval = addr; \
return (0); \
} \
\
*flags |= CPU_DTRACE_NOFAULT; \
/*CSTYLED*/ \
rval = *((volatile uint##bits##_t *)addr); \
*flags &= ~CPU_DTRACE_NOFAULT; \
\
return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0); \
}
#ifdef _LP64
#define dtrace_loadptr dtrace_load64
#else
#define dtrace_loadptr dtrace_load32
#endif
#define DTRACE_DYNHASH_FREE 0
#define DTRACE_DYNHASH_SINK 1
#define DTRACE_DYNHASH_VALID 2
#define DTRACE_MATCH_NEXT 0
#define DTRACE_MATCH_DONE 1
#define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
#define DTRACE_STATE_ALIGN 64
#define DTRACE_FLAGS2FLT(flags) \
(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
DTRACEFLT_UNKNOWN)
#define DTRACEACT_ISSTRING(act) \
((act)->dta_kind == DTRACEACT_DIFEXPR && \
(act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
/* Function prototype definitions: */
static size_t dtrace_strlen(const char *, size_t);
static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
static void dtrace_enabling_provide(dtrace_provider_t *);
static int dtrace_enabling_match(dtrace_enabling_t *, int *);
static void dtrace_enabling_matchall(void);
static dtrace_state_t *dtrace_anon_grab(void);
static uint64_t dtrace_helper(int, dtrace_mstate_t *,
dtrace_state_t *, uint64_t, uint64_t);
static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
static void dtrace_buffer_drop(dtrace_buffer_t *);
static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
dtrace_state_t *, dtrace_mstate_t *);
static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
dtrace_optval_t);
static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
uint16_t dtrace_load16(uintptr_t);
uint32_t dtrace_load32(uintptr_t);
uint64_t dtrace_load64(uintptr_t);
uint8_t dtrace_load8(uintptr_t);
void dtrace_dynvar_clean(dtrace_dstate_t *);
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
/*
* DTrace Probe Context Functions
*
* These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
* interrupts may be disabled, we may be in arbitrary dispatched state, etc.
* As a result, functions called from probe context may only call other DTrace
* support functions -- they may not interact at all with the system at large.
* (Note that the ASSERT macro is made probe-context safe by redefining it in
* terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
* loads are to be performed from probe context, they _must_ be in terms of
* the safe dtrace_load*() variants.
*
* Some functions in this block are not actually called from probe context;
* for these functions, there will be a comment above the function reading
* "Note: not called from probe context."
*/
void
dtrace_panic(const char *format, ...)
{
va_list alist;
va_start(alist, format);
dtrace_vpanic(format, alist);
va_end(alist);
}
int
dtrace_assfail(const char *a, const char *f, int l)
{
dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
/*
* We just need something here that even the most clever compiler
* cannot optimize away.
*/
return (a[(uintptr_t)f]);
}
/*
* Atomically increment a specified error counter from probe context.
*/
static void
dtrace_error(uint32_t *counter)
{
/*
* Most counters stored to in probe context are per-CPU counters.
* However, there are some error conditions that are sufficiently
* arcane that they don't merit per-CPU storage. If these counters
* are incremented concurrently on different CPUs, scalability will be
* adversely affected -- but we don't expect them to be white-hot in a
* correctly constructed enabling...
*/
uint32_t oval, nval;
do {
oval = *counter;
if ((nval = oval + 1) == 0) {
/*
* If the counter would wrap, set it to 1 -- assuring
* that the counter is never zero when we have seen
* errors. (The counter must be 32-bits because we
* aren't guaranteed a 64-bit compare&swap operation.)
* To save this code both the infamy of being fingered
* by a priggish news story and the indignity of being
* the target of a neo-puritan witch trial, we're
* carefully avoiding any colorful description of the
* likelihood of this condition -- but suffice it to
* say that it is only slightly more likely than the
* overflow of predicate cache IDs, as discussed in
* dtrace_predicate_create().
*/
nval = 1;
}
} while (dtrace_cas32(counter, oval, nval) != oval);
}
/*
* Use the DTRACE_LOADFUNC macro to define functions for each of loading a
* uint8_t, a uint16_t, a uint32_t and a uint64_t.
*/
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
static int
dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
{
if (dest < mstate->dtms_scratch_base)
return (0);
if (dest + size < dest)
return (0);
if (dest + size > mstate->dtms_scratch_ptr)
return (0);
return (1);
}
static int
dtrace_canstore_statvar(uint64_t addr, size_t sz,
dtrace_statvar_t **svars, int nsvars)
{
int i;
for (i = 0; i < nsvars; i++) {
dtrace_statvar_t *svar = svars[i];
if (svar == NULL || svar->dtsv_size == 0)
continue;
if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
return (1);
}
return (0);
}
/*
* Check to see if the address is within a memory region to which a store may
* be issued. This includes the DTrace scratch areas, and any DTrace variable
* region. The caller of dtrace_canstore() is responsible for performing any
* alignment checks that are needed before stores are actually executed.
*/
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
/*
* First, check to see if the address is in scratch space...
*/
if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
mstate->dtms_scratch_size))
return (1);
/*
* Now check to see if it's a dynamic variable. This check will pick
* up both thread-local variables and any global dynamically-allocated
* variables.
*/
if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
vstate->dtvs_dynvars.dtds_size)) {
dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
uintptr_t base = (uintptr_t)dstate->dtds_base +
(dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
uintptr_t chunkoffs;
/*
* Before we assume that we can store here, we need to make
* sure that it isn't in our metadata -- storing to our
* dynamic variable metadata would corrupt our state. For
* the range to not include any dynamic variable metadata,
* it must:
*
* (1) Start above the hash table that is at the base of
* the dynamic variable space
*
* (2) Have a starting chunk offset that is beyond the
* dtrace_dynvar_t that is at the base of every chunk
*
* (3) Not span a chunk boundary
*
*/
if (addr < base)
return (0);
chunkoffs = (addr - base) % dstate->dtds_chunksize;
if (chunkoffs < sizeof (dtrace_dynvar_t))
return (0);
if (chunkoffs + sz > dstate->dtds_chunksize)
return (0);
return (1);
}
/*
* Finally, check the static local and global variables. These checks
* take the longest, so we perform them last.
*/
if (dtrace_canstore_statvar(addr, sz,
vstate->dtvs_locals, vstate->dtvs_nlocals))
return (1);
if (dtrace_canstore_statvar(addr, sz,
vstate->dtvs_globals, vstate->dtvs_nglobals))
return (1);
return (0);
}
/*
* Convenience routine to check to see if the address is within a memory
* region in which a load may be issued given the user's privilege level;
* if not, it sets the appropriate error flags and loads 'addr' into the
* illegal value slot.
*
* DTrace subroutines (DIF_SUBR_*) should use this helper to implement
* appropriate memory access protection.
*/
static int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (1);
/*
* You can obviously read that which you can store.
*/
if (dtrace_canstore(addr, sz, mstate, vstate))
return (1);
/*
* We're allowed to read from our own string table.
*/
if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
mstate->dtms_difo->dtdo_strlen))
return (1);
DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
*illval = addr;
return (0);
}
/*
* Convenience routine to check to see if a given string is within a memory
* region in which a load may be issued given the user's privilege level;
* this exists so that we don't need to issue unnecessary dtrace_strlen()
* calls in the event that the user has all privileges.
*/
static int
dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
size_t strsz;
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (1);
strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
if (dtrace_canload(addr, strsz, mstate, vstate))
return (1);
return (0);
}
/*
* Convenience routine to check to see if a given variable is within a memory
* region in which a load may be issued given the user's privilege level.
*/
static int
dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate)
{
size_t sz;
ASSERT(type->dtdt_flags & DIF_TF_BYREF);
/*
* If we hold the privilege to read from kernel memory, then
* everything is readable.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (1);
if (type->dtdt_kind == DIF_TYPE_STRING)
sz = dtrace_strlen(src,
vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
else
sz = type->dtdt_size;
return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
}
/*
* Compare two strings using safe loads.
*/
static int
dtrace_strncmp(char *s1, char *s2, size_t limit)
{
uint8_t c1, c2;
volatile uint16_t *flags;
if (s1 == s2 || limit == 0)
return (0);
flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
do {
if (s1 == NULL) {
c1 = '\0';
} else {
c1 = dtrace_load8((uintptr_t)s1++);
}
if (s2 == NULL) {
c2 = '\0';
} else {
c2 = dtrace_load8((uintptr_t)s2++);
}
if (c1 != c2)
return (c1 - c2);
} while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
return (0);
}
/*
* Compute strlen(s) for a string using safe memory accesses. The additional
* len parameter is used to specify a maximum length to ensure completion.
*/
static size_t
dtrace_strlen(const char *s, size_t lim)
{
uint_t len;
for (len = 0; len != lim; len++) {
if (dtrace_load8((uintptr_t)s++) == '\0')
break;
}
return (len);
}
/*
* Check if an address falls within a toxic region.
*/
static int
dtrace_istoxic(uintptr_t kaddr, size_t size)
{
uintptr_t taddr, tsize;
int i;
for (i = 0; i < dtrace_toxranges; i++) {
taddr = dtrace_toxrange[i].dtt_base;
tsize = dtrace_toxrange[i].dtt_limit - taddr;
if (kaddr - taddr < tsize) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
return (1);
}
if (taddr - kaddr < size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = taddr;
return (1);
}
}
return (0);
}
/*
* Copy src to dst using safe memory accesses. The src is assumed to be unsafe
* memory specified by the DIF program. The dst is assumed to be safe memory
* that we can store to directly because it is managed by DTrace. As with
* standard bcopy, overlapping copies are handled properly.
*/
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
if (len != 0) {
uint8_t *s1 = dst;
const uint8_t *s2 = src;
if (s1 <= s2) {
do {
*s1++ = dtrace_load8((uintptr_t)s2++);
} while (--len != 0);
} else {
s2 += len;
s1 += len;
do {
*--s1 = dtrace_load8((uintptr_t)--s2);
} while (--len != 0);
}
}
}
/*
* Copy src to dst using safe memory accesses, up to either the specified
* length, or the point that a nul byte is encountered. The src is assumed to
* be unsafe memory specified by the DIF program. The dst is assumed to be
* safe memory that we can store to directly because it is managed by DTrace.
* Unlike dtrace_bcopy(), overlapping regions are not handled.
*/
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
if (len != 0) {
uint8_t *s1 = dst, c;
const uint8_t *s2 = src;
do {
*s1++ = c = dtrace_load8((uintptr_t)s2++);
} while (--len != 0 && c != '\0');
}
}
/*
* Copy src to dst, deriving the size and type from the specified (BYREF)
* variable type. The src is assumed to be unsafe memory specified by the DIF
* program. The dst is assumed to be DTrace variable memory that is of the
* specified type; we assume that we can store to directly.
*/
static void
dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
{
ASSERT(type->dtdt_flags & DIF_TF_BYREF);
if (type->dtdt_kind == DIF_TYPE_STRING) {
dtrace_strcpy(src, dst, type->dtdt_size);
} else {
dtrace_bcopy(src, dst, type->dtdt_size);
}
}
/*
* Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
* unsafe memory specified by the DIF program. The s2 data is assumed to be
* safe memory that we can access directly because it is managed by DTrace.
*/
static int
dtrace_bcmp(const void *s1, const void *s2, size_t len)
{
volatile uint16_t *flags;
flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
if (s1 == s2)
return (0);
if (s1 == NULL || s2 == NULL)
return (1);
if (s1 != s2 && len != 0) {
const uint8_t *ps1 = s1;
const uint8_t *ps2 = s2;
do {
if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
return (1);
} while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
}
return (0);
}
/*
* Zero the specified region using a simple byte-by-byte loop. Note that this
* is for safe DTrace-managed memory only.
*/
static void
dtrace_bzero(void *dst, size_t len)
{
uchar_t *cp;
for (cp = dst; len != 0; len--)
*cp++ = 0;
}
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
uint64_t result[2];
result[0] = addend1[0] + addend2[0];
result[1] = addend1[1] + addend2[1] +
(result[0] < addend1[0] || result[0] < addend2[0] ? 1 : 0);
sum[0] = result[0];
sum[1] = result[1];
}
/*
* Shift the 128-bit value in a by b. If b is positive, shift left.
* If b is negative, shift right.
*/
static void
dtrace_shift_128(uint64_t *a, int b)
{
uint64_t mask;
if (b == 0)
return;
if (b < 0) {
b = -b;
if (b >= 64) {
a[0] = a[1] >> (b - 64);
a[1] = 0;
} else {
a[0] >>= b;
mask = 1LL << (64 - b);
mask -= 1;
a[0] |= ((a[1] & mask) << (64 - b));
a[1] >>= b;
}
} else {
if (b >= 64) {
a[1] = a[0] << (b - 64);
a[0] = 0;
} else {
a[1] <<= b;
mask = a[0] >> (64 - b);
a[1] |= mask;
a[0] <<= b;
}
}
}
/*
* The basic idea is to break the 2 64-bit values into 4 32-bit values,
* use native multiplication on those, and then re-combine into the
* resulting 128-bit value.
*
* (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
* hi1 * hi2 << 64 +
* hi1 * lo2 << 32 +
* hi2 * lo1 << 32 +
* lo1 * lo2
*/
static void
dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
{
uint64_t hi1, hi2, lo1, lo2;
uint64_t tmp[2];
hi1 = factor1 >> 32;
hi2 = factor2 >> 32;
lo1 = factor1 & DT_MASK_LO;
lo2 = factor2 & DT_MASK_LO;
product[0] = lo1 * lo2;
product[1] = hi1 * hi2;
tmp[0] = hi1 * lo2;
tmp[1] = 0;
dtrace_shift_128(tmp, 32);
dtrace_add_128(product, tmp, product);
tmp[0] = hi2 * lo1;
tmp[1] = 0;
dtrace_shift_128(tmp, 32);
dtrace_add_128(product, tmp, product);
}
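/*
 * Editorial worked example (not part of the original source): for
 * factor1 == factor2 == 0x100000001 (2^32 + 1), the partial products are
 * lo1*lo2 == 1, hi1*hi2 == 1, and two cross terms of 1 each; after the
 * 32-bit shifts they combine to product[1] == 1 and
 * product[0] == 0x200000001, i.e. 2^64 + 2^33 + 1, as expected for
 * (2^32 + 1)^2.
 */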
/*
* This privilege check should be used by actions and subroutines to
* verify that the user credentials of the process that enabled the
 * invoking ECB match the target credentials.
*/
static int
dtrace_priv_proc_common_user(dtrace_state_t *state)
{
cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
/*
* We should always have a non-NULL state cred here, since if cred
* is null (anonymous tracing), we fast-path bypass this routine.
*/
ASSERT(s_cr != NULL);
if ((cr = CRED()) != NULL &&
s_cr->cr_uid == cr->cr_uid &&
s_cr->cr_uid == cr->cr_ruid &&
s_cr->cr_uid == cr->cr_suid &&
s_cr->cr_gid == cr->cr_gid &&
s_cr->cr_gid == cr->cr_rgid &&
s_cr->cr_gid == cr->cr_sgid)
return (1);
return (0);
}
/*
* This privilege check should be used by actions and subroutines to
* verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
*/
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
#if defined(sun)
cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
/*
* We should always have a non-NULL state cred here, since if cred
* is null (anonymous tracing), we fast-path bypass this routine.
*/
ASSERT(s_cr != NULL);
if ((cr = CRED()) != NULL &&
s_cr->cr_zone == cr->cr_zone)
return (1);
return (0);
#else
return (1);
#endif
}
/*
* This privilege check should be used by actions and subroutines to
* verify that the process has not setuid or changed credentials.
*/
static int
dtrace_priv_proc_common_nocd(void)
{
proc_t *proc;
if ((proc = ttoproc(curthread)) != NULL &&
!(proc->p_flag & SNOCD))
return (1);
return (0);
}
static int
dtrace_priv_proc_destructive(dtrace_state_t *state)
{
int action = state->dts_cred.dcr_action;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
dtrace_priv_proc_common_zone(state) == 0)
goto bad;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
dtrace_priv_proc_common_user(state) == 0)
goto bad;
if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
dtrace_priv_proc_common_nocd() == 0)
goto bad;
return (1);
bad:
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_proc_control(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
return (1);
if (dtrace_priv_proc_common_zone(state) &&
dtrace_priv_proc_common_user(state) &&
dtrace_priv_proc_common_nocd())
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_proc(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
return (0);
}
static int
dtrace_priv_kernel(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
return (0);
}
static int
dtrace_priv_kernel_destructive(dtrace_state_t *state)
{
if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
return (1);
cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
return (0);
}
/*
* Note: not called from probe context. This function is called
* asynchronously (and at a regular interval) from outside of probe context to
* clean the dirty dynamic variable lists on all CPUs. Dynamic variable
* cleaning is explained in detail in <sys/dtrace_impl.h>.
*/
void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
dtrace_dynvar_t *dirty;
dtrace_dstate_percpu_t *dcpu;
int i, work = 0;
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
ASSERT(dcpu->dtdsc_rinsing == NULL);
/*
* If the dirty list is NULL, there is no dirty work to do.
*/
if (dcpu->dtdsc_dirty == NULL)
continue;
/*
* If the clean list is non-NULL, then we're not going to do
* any work for this CPU -- it means that there has not been
* a dtrace_dynvar() allocation on this CPU (or from this CPU)
* since the last time we cleaned house.
*/
if (dcpu->dtdsc_clean != NULL)
continue;
work = 1;
/*
* Atomically move the dirty list aside.
*/
do {
dirty = dcpu->dtdsc_dirty;
/*
* Before we zap the dirty list, set the rinsing list.
* (This allows for a potential assertion in
* dtrace_dynvar(): if a free dynamic variable appears
* on a hash chain, either the dirty list or the
* rinsing list for some CPU must be non-NULL.)
*/
dcpu->dtdsc_rinsing = dirty;
dtrace_membar_producer();
} while (dtrace_casptr(&dcpu->dtdsc_dirty,
dirty, NULL) != dirty);
}
if (!work) {
/*
* We have no work to do; we can simply return.
*/
return;
}
dtrace_sync();
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
if (dcpu->dtdsc_rinsing == NULL)
continue;
/*
* We are now guaranteed that no hash chain contains a pointer
* into this dirty list; we can make it clean.
*/
ASSERT(dcpu->dtdsc_clean == NULL);
dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
dcpu->dtdsc_rinsing = NULL;
}
/*
* Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
* sure that all CPUs have seen all of the dtdsc_clean pointers.
* This prevents a race whereby a CPU incorrectly decides that
* the state should be something other than DTRACE_DSTATE_CLEAN
* after dtrace_dynvar_clean() has completed.
*/
dtrace_sync();
dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
/*
 * Depending on the value of the op parameter, this function looks up,
* allocates or deallocates an arbitrarily-keyed dynamic variable. If an
* allocation is requested, this function will return a pointer to a
* dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
* variable can be allocated. If NULL is returned, the appropriate counter
* will be incremented.
*/
dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
uint64_t hashval = DTRACE_DYNHASH_VALID;
dtrace_dynhash_t *hash = dstate->dtds_hash;
dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
processorid_t me = curcpu, cpu = me;
dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
size_t bucket, ksize;
size_t chunksize = dstate->dtds_chunksize;
uintptr_t kdata, lock, nstate;
uint_t i;
ASSERT(nkeys != 0);
/*
* Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
* algorithm. For the by-value portions, we perform the algorithm in
* 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
* bit, and seems to have only a minute effect on distribution. For
* the by-reference data, we perform "One-at-a-time" iterating (safely)
* over each referenced byte. It's painful to do this, but it's much
* better than pathological hash distribution. The efficacy of the
* hashing algorithm (and a comparison with other algorithms) may be
* found by running the ::dtrace_dynstat MDB dcmd.
*/
for (i = 0; i < nkeys; i++) {
if (key[i].dttk_size == 0) {
uint64_t val = key[i].dttk_value;
hashval += (val >> 48) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += (val >> 32) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += (val >> 16) & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
hashval += val & 0xffff;
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
} else {
/*
* This is incredibly painful, but it beats the hell
* out of the alternative.
*/
uint64_t j, size = key[i].dttk_size;
uintptr_t base = (uintptr_t)key[i].dttk_value;
if (!dtrace_canload(base, size, mstate, vstate))
break;
for (j = 0; j < size; j++) {
hashval += dtrace_load8(base + j);
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
}
}
}
if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
return (NULL);
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);
/*
* There is a remote chance (ideally, 1 in 2^31) that our hashval
* comes out to be one of our two sentinel hash values. If this
* actually happens, we set the hashval to be a value known to be a
* non-sentinel value.
*/
if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
hashval = DTRACE_DYNHASH_VALID;
/*
* Yes, it's painful to do a divide here. If the cycle count becomes
* important here, tricks can be pulled to reduce it. (However, it's
* critical that hash collisions be kept to an absolute minimum;
* they're much more painful than a divide.) It's better to have a
* solution that generates few collisions and still keeps things
* relatively simple.
*/
bucket = hashval % dstate->dtds_hashsize;
if (op == DTRACE_DYNVAR_DEALLOC) {
volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
for (;;) {
while ((lock = *lockp) & 1)
continue;
if (dtrace_casptr((volatile void *)lockp,
(volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
break;
}
dtrace_membar_producer();
}
top:
prev = NULL;
lock = hash[bucket].dtdh_lock;
dtrace_membar_consumer();
start = hash[bucket].dtdh_chain;
ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
op != DTRACE_DYNVAR_DEALLOC));
for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
dtrace_key_t *dkey = &dtuple->dtt_key[0];
if (dvar->dtdv_hashval != hashval) {
if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
/*
* We've reached the sink, and therefore the
* end of the hash chain; we can kick out of
* the loop knowing that we have seen a valid
* snapshot of state.
*/
ASSERT(dvar->dtdv_next == NULL);
ASSERT(dvar == &dtrace_dynhash_sink);
break;
}
if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
/*
* We've gone off the rails: somewhere along
* the line, one of the members of this hash
* chain was deleted. Note that we could also
* detect this by simply letting this loop run
* to completion, as we would eventually hit
* the end of the dirty list. However, we
* want to avoid running the length of the
* dirty list unnecessarily (it might be quite
* long), so we catch this as early as
* possible by detecting the hash marker. In
* this case, we simply set dvar to NULL and
* break; the conditional after the loop will
* send us back to top.
*/
dvar = NULL;
break;
}
goto next;
}
if (dtuple->dtt_nkeys != nkeys)
goto next;
for (i = 0; i < nkeys; i++, dkey++) {
if (dkey->dttk_size != key[i].dttk_size)
goto next; /* size or type mismatch */
if (dkey->dttk_size != 0) {
if (dtrace_bcmp(
(void *)(uintptr_t)key[i].dttk_value,
(void *)(uintptr_t)dkey->dttk_value,
dkey->dttk_size))
goto next;
} else {
if (dkey->dttk_value != key[i].dttk_value)
goto next;
}
}
if (op != DTRACE_DYNVAR_DEALLOC)
return (dvar);
ASSERT(dvar->dtdv_next == NULL ||
dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
if (prev != NULL) {
ASSERT(hash[bucket].dtdh_chain != dvar);
ASSERT(start != dvar);
ASSERT(prev->dtdv_next == dvar);
prev->dtdv_next = dvar->dtdv_next;
} else {
if (dtrace_casptr(&hash[bucket].dtdh_chain,
start, dvar->dtdv_next) != start) {
/*
* We have failed to atomically swing the
* hash table head pointer, presumably because
* of a conflicting allocation on another CPU.
* We need to reread the hash chain and try
* again.
*/
goto top;
}
}
dtrace_membar_producer();
/*
* Now set the hash value to indicate that it's free.
*/
ASSERT(hash[bucket].dtdh_chain != dvar);
dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
dtrace_membar_producer();
/*
* Set the next pointer to point at the dirty list, and
* atomically swing the dirty pointer to the newly freed dvar.
*/
do {
next = dcpu->dtdsc_dirty;
dvar->dtdv_next = next;
} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
/*
* Finally, unlock this hash bucket.
*/
ASSERT(hash[bucket].dtdh_lock == lock);
ASSERT(lock & 1);
hash[bucket].dtdh_lock++;
return (NULL);
next:
prev = dvar;
continue;
}
if (dvar == NULL) {
/*
* If dvar is NULL, it is because we went off the rails:
* one of the elements that we traversed in the hash chain
* was deleted while we were traversing it. In this case,
* we assert that we aren't doing a dealloc (deallocs lock
* the hash bucket to prevent themselves from racing with
* one another), and retry the hash chain traversal.
*/
ASSERT(op != DTRACE_DYNVAR_DEALLOC);
goto top;
}
if (op != DTRACE_DYNVAR_ALLOC) {
/*
* If we are not to allocate a new variable, we want to
* return NULL now. Before we return, check that the value
* of the lock word hasn't changed. If it has, we may have
* seen an inconsistent snapshot.
*/
if (op == DTRACE_DYNVAR_NOALLOC) {
if (hash[bucket].dtdh_lock != lock)
goto top;
} else {
ASSERT(op == DTRACE_DYNVAR_DEALLOC);
ASSERT(hash[bucket].dtdh_lock == lock);
ASSERT(lock & 1);
hash[bucket].dtdh_lock++;
}
return (NULL);
}
/*
* We need to allocate a new dynamic variable. The size we need is the
* size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
* size of any auxiliary key data (rounded up to 8-byte alignment) plus
* the size of any referred-to data (dsize). We then round the final
* size up to the chunksize for allocation.
*/
for (ksize = 0, i = 0; i < nkeys; i++)
ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
/*
* This should be pretty much impossible, but could happen if, say,
* strange DIF specified the tuple. Ideally, this should be an
* assertion and not an error condition -- but that requires that the
* chunksize calculation in dtrace_difo_chunksize() be absolutely
* bullet-proof. (That is, it must not be able to be fooled by
* malicious DIF.) Given the lack of backwards branches in DIF,
* solving this would presumably not amount to solving the Halting
* Problem -- but it still seems awfully hard.
*/
if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
ksize + dsize > chunksize) {
dcpu->dtdsc_drops++;
return (NULL);
}
nstate = DTRACE_DSTATE_EMPTY;
do {
retry:
free = dcpu->dtdsc_free;
if (free == NULL) {
dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
void *rval;
if (clean == NULL) {
/*
* We're out of dynamic variable space on
* this CPU. Unless we have tried all CPUs,
* we'll try to allocate from a different
* CPU.
*/
switch (dstate->dtds_state) {
case DTRACE_DSTATE_CLEAN: {
void *sp = &dstate->dtds_state;
if (++cpu >= NCPU)
cpu = 0;
if (dcpu->dtdsc_dirty != NULL &&
nstate == DTRACE_DSTATE_EMPTY)
nstate = DTRACE_DSTATE_DIRTY;
if (dcpu->dtdsc_rinsing != NULL)
nstate = DTRACE_DSTATE_RINSING;
dcpu = &dstate->dtds_percpu[cpu];
if (cpu != me)
goto retry;
(void) dtrace_cas32(sp,
DTRACE_DSTATE_CLEAN, nstate);
/*
* To increment the correct bean
* counter, take another lap.
*/
goto retry;
}
case DTRACE_DSTATE_DIRTY:
dcpu->dtdsc_dirty_drops++;
break;
case DTRACE_DSTATE_RINSING:
dcpu->dtdsc_rinsing_drops++;
break;
case DTRACE_DSTATE_EMPTY:
dcpu->dtdsc_drops++;
break;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
return (NULL);
}
/*
* The clean list appears to be non-empty. We want to
* move the clean list to the free list; we start by
* moving the clean pointer aside.
*/
if (dtrace_casptr(&dcpu->dtdsc_clean,
clean, NULL) != clean) {
/*
* We are in one of two situations:
*
* (a) The clean list was switched to the
* free list by another CPU.
*
* (b) The clean list was added to by the
* cleansing cyclic.
*
* In either of these situations, we can
* just reattempt the free list allocation.
*/
goto retry;
}
ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
/*
* Now we'll move the clean list to the free list.
* It's impossible for this to fail: the only way
* the free list can be updated is through this
* code path, and only one CPU can own the clean list.
* Thus, it would only be possible for this to fail if
* this code were racing with dtrace_dynvar_clean().
* (That is, if dtrace_dynvar_clean() updated the clean
* list, and we ended up racing to update the free
* list.) This race is prevented by the dtrace_sync()
* in dtrace_dynvar_clean() -- which flushes the
* owners of the clean lists out before resetting
* the clean lists.
*/
rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
ASSERT(rval == NULL);
goto retry;
}
dvar = free;
new_free = dvar->dtdv_next;
} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
/*
* We have now allocated a new chunk. We copy the tuple keys into the
* tuple array and copy any referenced key data into the data space
* following the tuple array. As we do this, we relocate dttk_value
* in the final tuple to point to the key data address in the chunk.
*/
kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
dvar->dtdv_data = (void *)(kdata + ksize);
dvar->dtdv_tuple.dtt_nkeys = nkeys;
for (i = 0; i < nkeys; i++) {
dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
size_t kesize = key[i].dttk_size;
if (kesize != 0) {
dtrace_bcopy(
(const void *)(uintptr_t)key[i].dttk_value,
(void *)kdata, kesize);
dkey->dttk_value = kdata;
kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
} else {
dkey->dttk_value = key[i].dttk_value;
}
dkey->dttk_size = kesize;
}
ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
dvar->dtdv_hashval = hashval;
dvar->dtdv_next = start;
if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
return (dvar);
/*
* The cas has failed. Either another CPU is adding an element to
* this hash chain, or another CPU is deleting an element from this
* hash chain. The simplest way to deal with both of these cases
* (though not necessarily the most efficient) is to free our
* allocated block and tail-call ourselves. Note that the free is
* to the dirty list and _not_ to the free list. This is to prevent
* races with allocators, above.
*/
dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
dtrace_membar_producer();
do {
free = dcpu->dtdsc_dirty;
dvar->dtdv_next = free;
} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
}
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
if ((int64_t)nval < (int64_t)*oval)
*oval = nval;
}
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
if ((int64_t)nval > (int64_t)*oval)
*oval = nval;
}
static void
dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
{
int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
int64_t val = (int64_t)nval;
if (val < 0) {
for (i = 0; i < zero; i++) {
if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
quanta[i] += incr;
return;
}
}
} else {
for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
quanta[i - 1] += incr;
return;
}
}
quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
return;
}
ASSERT(0);
}
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
uint64_t arg = *lquanta++;
int32_t base = DTRACE_LQUANTIZE_BASE(arg);
uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
int32_t val = (int32_t)nval, level;
ASSERT(step != 0);
ASSERT(levels != 0);
if (val < base) {
/*
* This is an underflow.
*/
lquanta[0] += incr;
return;
}
level = (val - base) / step;
if (level < levels) {
lquanta[level + 1] += incr;
return;
}
/*
* This is an overflow.
*/
lquanta[levels + 1] += incr;
}
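/*
 * Editorial worked example (not part of the original source): for an
 * lquantize() with base 0, step 10, and 5 levels, the buffer (after the
 * encoded argument word) holds seven counters: lquanta[0] for values below
 * 0 (underflow), lquanta[1..5] for the ranges [0,10), [10,20), ... [40,50),
 * and lquanta[6] for values of 50 and above (overflow).  A value of 23
 * computes level == 2 and therefore increments lquanta[3].
 */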
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
data[0]++;
data[1] += nval;
}
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
int64_t snval = (int64_t)nval;
uint64_t tmp[2];
data[0]++;
data[1] += nval;
/*
* What we want to say here is:
*
* data[2] += nval * nval;
*
* But given that nval is 64-bit, we could easily overflow, so
* we do this as 128-bit arithmetic.
*/
if (snval < 0)
snval = -snval;
dtrace_multiply_128((uint64_t)snval, (uint64_t)snval, tmp);
dtrace_add_128(data + 2, tmp, data + 2);
}
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
*oval = *oval + 1;
}
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
*oval += nval;
}
/*
* Aggregate given the tuple in the principal data buffer, and the aggregating
* action denoted by the specified dtrace_aggregation_t. The aggregation
* buffer is specified as the buf parameter. This routine does not return
* failure; if there is no space in the aggregation buffer, the data will be
* dropped, and a corresponding counter incremented.
*/
static void
dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
{
dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
uint32_t i, ndx, size, fsize;
uint32_t align = sizeof (uint64_t) - 1;
dtrace_aggbuffer_t *agb;
dtrace_aggkey_t *key;
uint32_t hashval = 0, limit, isstr;
caddr_t tomax, data, kdata;
dtrace_actkind_t action;
dtrace_action_t *act;
uintptr_t offs;
if (buf == NULL)
return;
if (!agg->dtag_hasarg) {
/*
* Currently, only quantize() and lquantize() take additional
* arguments, and they have the same semantics: an increment
* value that defaults to 1 when not present. If additional
* aggregating actions take arguments, the setting of the
* default argument value will presumably have to become more
* sophisticated...
*/
arg = 1;
}
action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
size = rec->dtrd_offset - agg->dtag_base;
fsize = size + rec->dtrd_size;
ASSERT(dbuf->dtb_tomax != NULL);
data = dbuf->dtb_tomax + offset + agg->dtag_base;
if ((tomax = buf->dtb_tomax) == NULL) {
dtrace_buffer_drop(buf);
return;
}
/*
* The metastructure is always at the bottom of the buffer.
*/
agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
sizeof (dtrace_aggbuffer_t));
if (buf->dtb_offset == 0) {
/*
* We just kludge up approximately 1/8th of the size to be
* buckets. If this guess ends up being routinely
* off-the-mark, we may need to dynamically readjust this
* based on past performance.
*/
uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
(uintptr_t)tomax || hashsize == 0) {
/*
* We've been given a ludicrously small buffer;
* increment our drop count and leave.
*/
dtrace_buffer_drop(buf);
return;
}
/*
 * And now, a pathetic attempt to try to get an odd (or
* perchance, a prime) hash size for better hash distribution.
*/
if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
hashsize -= DTRACE_AGGHASHSIZE_SLEW;
agb->dtagb_hashsize = hashsize;
agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
for (i = 0; i < agb->dtagb_hashsize; i++)
agb->dtagb_hash[i] = NULL;
}
ASSERT(agg->dtag_first != NULL);
ASSERT(agg->dtag_first->dta_intuple);
/*
* Calculate the hash value based on the key. Note that we _don't_
* include the aggid in the hashing (but we will store it as part of
* the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
* algorithm: a simple, quick algorithm that has no known funnels, and
* gets good distribution in practice. The efficacy of the hashing
* algorithm (and a comparison with other algorithms) may be found by
* running the ::dtrace_aggstat MDB dcmd.
*/
for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
isstr = DTRACEACT_ISSTRING(act);
for (; i < limit; i++) {
hashval += data[i];
hashval += (hashval << 10);
hashval ^= (hashval >> 6);
if (isstr && data[i] == '\0')
break;
}
}
hashval += (hashval << 3);
hashval ^= (hashval >> 11);
hashval += (hashval << 15);
/*
* Yes, the divide here is expensive -- but it's generally the least
* of the performance issues given the amount of data that we iterate
* over to compute hash values, compare data, etc.
*/
ndx = hashval % agb->dtagb_hashsize;
for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
ASSERT((caddr_t)key >= tomax);
ASSERT((caddr_t)key < tomax + buf->dtb_size);
if (hashval != key->dtak_hashval || key->dtak_size != size)
continue;
kdata = key->dtak_data;
ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
for (act = agg->dtag_first; act->dta_intuple;
act = act->dta_next) {
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
isstr = DTRACEACT_ISSTRING(act);
for (; i < limit; i++) {
if (kdata[i] != data[i])
goto next;
if (isstr && data[i] == '\0')
break;
}
}
if (action != key->dtak_action) {
/*
* We are aggregating on the same value in the same
* aggregation with two different aggregating actions.
* (This should have been picked up in the compiler,
* so we may be dealing with errant or devious DIF.)
* This is an error condition; we indicate as much,
* and return.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
return;
}
/*
* This is a hit: we need to apply the aggregator to
* the value at this key.
*/
agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
return;
next:
continue;
}
/*
* We didn't find it. We need to allocate some zero-filled space,
* link it into the hash table appropriately, and apply the aggregator
* to the (zero-filled) value.
*/
offs = buf->dtb_offset;
while (offs & (align - 1))
offs += sizeof (uint32_t);
/*
* If we don't have enough room to both allocate a new key _and_
* its associated data, increment the drop count and return.
*/
if ((uintptr_t)tomax + offs + fsize >
agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
dtrace_buffer_drop(buf);
return;
}
/*CONSTCOND*/
ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
agb->dtagb_free -= sizeof (dtrace_aggkey_t);
key->dtak_data = kdata = tomax + offs;
buf->dtb_offset = offs + fsize;
/*
* Now copy the data across.
*/
*((dtrace_aggid_t *)kdata) = agg->dtag_id;
for (i = sizeof (dtrace_aggid_t); i < size; i++)
kdata[i] = data[i];
/*
* Because strings are not zeroed out by default, we need to iterate
* looking for actions that store strings, and we need to explicitly
* pad these strings out with zeroes.
*/
for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
int nul;
if (!DTRACEACT_ISSTRING(act))
continue;
i = act->dta_rec.dtrd_offset - agg->dtag_base;
limit = i + act->dta_rec.dtrd_size;
ASSERT(limit <= size);
for (nul = 0; i < limit; i++) {
if (nul) {
kdata[i] = '\0';
continue;
}
if (data[i] != '\0')
continue;
nul = 1;
}
}
for (i = size; i < fsize; i++)
kdata[i] = 0;
key->dtak_hashval = hashval;
key->dtak_size = size;
key->dtak_action = action;
key->dtak_next = agb->dtagb_hash[ndx];
agb->dtagb_hash[ndx] = key;
/*
* Finally, apply the aggregator.
*/
*((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
}
/*
* Given consumer state, this routine finds a speculation in the INACTIVE
* state and transitions it into the ACTIVE state. If there is no speculation
* in the INACTIVE state, 0 is returned. In this case, no error counter is
* incremented -- it is up to the caller to take appropriate action.
*/
static int
dtrace_speculation(dtrace_state_t *state)
{
int i = 0;
dtrace_speculation_state_t current;
uint32_t *stat = &state->dts_speculations_unavail, count;
while (i < state->dts_nspeculations) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
current = spec->dtsp_state;
if (current != DTRACESPEC_INACTIVE) {
if (current == DTRACESPEC_COMMITTINGMANY ||
current == DTRACESPEC_COMMITTING ||
current == DTRACESPEC_DISCARDING)
stat = &state->dts_speculations_busy;
i++;
continue;
}
if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, DTRACESPEC_ACTIVE) == current)
return (i + 1);
}
/*
* We couldn't find a speculation. If we found as much as a single
* busy speculation buffer, we'll attribute this failure as "busy"
* instead of "unavail".
*/
do {
count = *stat;
} while (dtrace_cas32(stat, count, count + 1) != count);
return (0);
}
/*
* This routine commits an active speculation. If the specified speculation
* is not in a valid state to perform a commit(), this routine will silently do
* nothing. The state of the specified speculation is transitioned according
* to the state transition diagram outlined in <sys/dtrace_impl.h>
*/
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
dtrace_specid_t which)
{
dtrace_speculation_t *spec;
dtrace_buffer_t *src, *dest;
uintptr_t daddr, saddr, dlimit;
dtrace_speculation_state_t current, new = 0;
intptr_t offs;
if (which == 0)
return;
if (which > state->dts_nspeculations) {
cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
return;
}
spec = &state->dts_speculations[which - 1];
src = &spec->dtsp_buffer[cpu];
dest = &state->dts_buffer[cpu];
do {
current = spec->dtsp_state;
if (current == DTRACESPEC_COMMITTINGMANY)
break;
switch (current) {
case DTRACESPEC_INACTIVE:
case DTRACESPEC_DISCARDING:
return;
case DTRACESPEC_COMMITTING:
/*
* This is only possible if we are (a) commit()'ing
* without having done a prior speculate() on this CPU
* and (b) racing with another commit() on a different
* CPU. There's nothing to do -- we just assert that
* our offset is 0.
*/
ASSERT(src->dtb_offset == 0);
return;
case DTRACESPEC_ACTIVE:
new = DTRACESPEC_COMMITTING;
break;
case DTRACESPEC_ACTIVEONE:
/*
* This speculation is active on one CPU. If our
* buffer offset is non-zero, we know that the one CPU
* must be us. Otherwise, we are committing on a
* different CPU from the speculate(), and we must
* rely on being asynchronously cleaned.
*/
if (src->dtb_offset != 0) {
new = DTRACESPEC_COMMITTING;
break;
}
/*FALLTHROUGH*/
case DTRACESPEC_ACTIVEMANY:
new = DTRACESPEC_COMMITTINGMANY;
break;
default:
ASSERT(0);
}
} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, new) != current);
/*
* We have set the state to indicate that we are committing this
* speculation. Now reserve the necessary space in the destination
* buffer.
*/
if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
sizeof (uint64_t), state, NULL)) < 0) {
dtrace_buffer_drop(dest);
goto out;
}
/*
* We have the space; copy the buffer across. (Note that this is a
 * highly suboptimal bcopy(); in the unlikely event that this becomes
* a serious performance issue, a high-performance DTrace-specific
* bcopy() should obviously be invented.)
*/
daddr = (uintptr_t)dest->dtb_tomax + offs;
dlimit = daddr + src->dtb_offset;
saddr = (uintptr_t)src->dtb_tomax;
/*
* First, the aligned portion.
*/
while (dlimit - daddr >= sizeof (uint64_t)) {
*((uint64_t *)daddr) = *((uint64_t *)saddr);
daddr += sizeof (uint64_t);
saddr += sizeof (uint64_t);
}
/*
* Now any left-over bit...
*/
while (dlimit - daddr)
*((uint8_t *)daddr++) = *((uint8_t *)saddr++);
/*
* Finally, commit the reserved space in the destination buffer.
*/
dest->dtb_offset = offs + src->dtb_offset;
out:
/*
* If we're lucky enough to be the only active CPU on this speculation
* buffer, we can just set the state back to DTRACESPEC_INACTIVE.
*/
if (current == DTRACESPEC_ACTIVE ||
(current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
ASSERT(rval == DTRACESPEC_COMMITTING);
}
src->dtb_offset = 0;
src->dtb_xamot_drops += src->dtb_drops;
src->dtb_drops = 0;
}
/*
* This routine discards an active speculation. If the specified speculation
* is not in a valid state to perform a discard(), this routine will silently
* do nothing. The state of the specified speculation is transitioned
* according to the state transition diagram outlined in <sys/dtrace_impl.h>
*/
static void
dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
dtrace_specid_t which)
{
dtrace_speculation_t *spec;
dtrace_speculation_state_t current, new = 0;
dtrace_buffer_t *buf;
if (which == 0)
return;
if (which > state->dts_nspeculations) {
cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
return;
}
spec = &state->dts_speculations[which - 1];
buf = &spec->dtsp_buffer[cpu];
do {
current = spec->dtsp_state;
switch (current) {
case DTRACESPEC_INACTIVE:
case DTRACESPEC_COMMITTINGMANY:
case DTRACESPEC_COMMITTING:
case DTRACESPEC_DISCARDING:
return;
case DTRACESPEC_ACTIVE:
case DTRACESPEC_ACTIVEMANY:
new = DTRACESPEC_DISCARDING;
break;
case DTRACESPEC_ACTIVEONE:
if (buf->dtb_offset != 0) {
new = DTRACESPEC_INACTIVE;
} else {
new = DTRACESPEC_DISCARDING;
}
break;
default:
ASSERT(0);
}
} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, new) != current);
buf->dtb_offset = 0;
buf->dtb_drops = 0;
}
/*
* Note: not called from probe context. This function is called
* asynchronously from cross call context to clean any speculations that are
* in the COMMITTINGMANY or DISCARDING states. These speculations may not be
* transitioned back to the INACTIVE state until all CPUs have cleaned the
* speculation.
*/
static void
dtrace_speculation_clean_here(dtrace_state_t *state)
{
dtrace_icookie_t cookie;
processorid_t cpu = curcpu;
dtrace_buffer_t *dest = &state->dts_buffer[cpu];
dtrace_specid_t i;
cookie = dtrace_interrupt_disable();
if (dest->dtb_tomax == NULL) {
dtrace_interrupt_enable(cookie);
return;
}
for (i = 0; i < state->dts_nspeculations; i++) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
if (src->dtb_tomax == NULL)
continue;
if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
src->dtb_offset = 0;
continue;
}
if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
continue;
if (src->dtb_offset == 0)
continue;
dtrace_speculation_commit(state, cpu, i + 1);
}
dtrace_interrupt_enable(cookie);
}
/*
* Note: not called from probe context. This function is called
* asynchronously (and at a regular interval) to clean any speculations that
* are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
* is work to be done, it cross calls all CPUs to perform that work;
 * COMMITTINGMANY and DISCARDING speculations may not be transitioned back to the
* INACTIVE state until they have been cleaned by all CPUs.
*/
static void
dtrace_speculation_clean(dtrace_state_t *state)
{
int work = 0, rv;
dtrace_specid_t i;
for (i = 0; i < state->dts_nspeculations; i++) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
ASSERT(!spec->dtsp_cleaning);
if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
continue;
work++;
spec->dtsp_cleaning = 1;
}
if (!work)
return;
dtrace_xcall(DTRACE_CPUALL,
(dtrace_xcall_t)dtrace_speculation_clean_here, state);
/*
* We now know that all CPUs have committed or discarded their
* speculation buffers, as appropriate. We can now set the state
* to inactive.
*/
for (i = 0; i < state->dts_nspeculations; i++) {
dtrace_speculation_t *spec = &state->dts_speculations[i];
dtrace_speculation_state_t current, new;
if (!spec->dtsp_cleaning)
continue;
current = spec->dtsp_state;
ASSERT(current == DTRACESPEC_DISCARDING ||
current == DTRACESPEC_COMMITTINGMANY);
new = DTRACESPEC_INACTIVE;
rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
ASSERT(rv == current);
spec->dtsp_cleaning = 0;
}
}
/*
* Called as part of a speculate() to get the speculative buffer associated
* with a given speculation. Returns NULL if the specified speculation is not
* in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
* the active CPU is not the specified CPU -- the speculation will be
* atomically transitioned into the ACTIVEMANY state.
*/
static dtrace_buffer_t *
dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
dtrace_specid_t which)
{
dtrace_speculation_t *spec;
dtrace_speculation_state_t current, new = 0;
dtrace_buffer_t *buf;
if (which == 0)
return (NULL);
if (which > state->dts_nspeculations) {
cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
return (NULL);
}
spec = &state->dts_speculations[which - 1];
buf = &spec->dtsp_buffer[cpuid];
do {
current = spec->dtsp_state;
switch (current) {
case DTRACESPEC_INACTIVE:
case DTRACESPEC_COMMITTINGMANY:
case DTRACESPEC_DISCARDING:
return (NULL);
case DTRACESPEC_COMMITTING:
ASSERT(buf->dtb_offset == 0);
return (NULL);
case DTRACESPEC_ACTIVEONE:
/*
* This speculation is currently active on one CPU.
* Check the offset in the buffer; if it's non-zero,
* that CPU must be us (and we leave the state alone).
* If it's zero, assume that we're starting on a new
* CPU -- and change the state to indicate that the
* speculation is active on more than one CPU.
*/
if (buf->dtb_offset != 0)
return (buf);
new = DTRACESPEC_ACTIVEMANY;
break;
case DTRACESPEC_ACTIVEMANY:
return (buf);
case DTRACESPEC_ACTIVE:
new = DTRACESPEC_ACTIVEONE;
break;
default:
ASSERT(0);
}
} while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
current, new) != current);
ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
return (buf);
}
/*
* Return a string. In the event that the user lacks the privilege to access
* arbitrary kernel memory, we copy the string out to scratch memory so that we
* don't fail access checking.
*
* dtrace_dif_variable() uses this routine as a helper for various
* builtin values such as 'execname' and 'probefunc.'
*/
uintptr_t
dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
dtrace_mstate_t *mstate)
{
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t ret;
size_t strsz;
/*
* The easy case: this probe is allowed to read all of memory, so
* we can just return this as a vanilla pointer.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
return (addr);
/*
* This is the tougher case: we copy the string in question from
* kernel memory into scratch memory and return it that way: this
* ensures that we won't trip up when access checking tests the
* BYREF return value.
*/
strsz = dtrace_strlen((char *)addr, size) + 1;
if (mstate->dtms_scratch_ptr + strsz >
mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
return (0);
}
dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
strsz);
ret = mstate->dtms_scratch_ptr;
mstate->dtms_scratch_ptr += strsz;
return (ret);
}
/*
* Return a string from a memory address which is known to have one or
* more concatenated, individually zero-terminated, sub-strings.
* In the event that the user lacks the privilege to access
* arbitrary kernel memory, we copy the string out to scratch memory so that we
* don't fail access checking.
*
* dtrace_dif_variable() uses this routine as a helper for various
* builtin values such as 'execargs'.
*/
static uintptr_t
dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
dtrace_mstate_t *mstate)
{
char *p;
size_t i;
uintptr_t ret;
if (mstate->dtms_scratch_ptr + strsz >
mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
return (0);
}
dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
strsz);
/* Replace sub-string termination characters with a space. */
for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
p++, i++)
if (*p == '\0')
*p = ' ';
ret = mstate->dtms_scratch_ptr;
mstate->dtms_scratch_ptr += strsz;
return (ret);
}
/*
* This function implements the DIF emulator's variable lookups. The emulator
* passes a reserved variable identifier and optional built-in array index.
*/
static uint64_t
dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
uint64_t ndx)
{
/*
* If we're accessing one of the uncached arguments, we'll turn this
* into a reference in the args array.
*/
if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
ndx = v - DIF_VAR_ARG0;
v = DIF_VAR_ARGS;
}
switch (v) {
case DIF_VAR_ARGS:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
if (ndx >= sizeof (mstate->dtms_arg) /
sizeof (mstate->dtms_arg[0])) {
int aframes = mstate->dtms_probe->dtpr_aframes + 2;
dtrace_provider_t *pv;
uint64_t val;
pv = mstate->dtms_probe->dtpr_provider;
if (pv->dtpv_pops.dtps_getargval != NULL)
val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
mstate->dtms_probe->dtpr_id,
mstate->dtms_probe->dtpr_arg, ndx, aframes);
else
val = dtrace_getarg(ndx, aframes);
/*
* This is regrettably required to keep the compiler
* from tail-optimizing the call to dtrace_getarg().
* The condition always evaluates to true, but the
* compiler has no way of figuring that out a priori.
* (None of this would be necessary if the compiler
* could be relied upon to _always_ tail-optimize
* the call to dtrace_getarg() -- but it can't.)
*/
if (mstate->dtms_probe != NULL)
return (val);
ASSERT(0);
}
return (mstate->dtms_arg[ndx]);
#if defined(sun)
case DIF_VAR_UREGS: {
klwp_t *lwp;
if (!dtrace_priv_proc(state))
return (0);
if ((lwp = curthread->t_lwp) == NULL) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = NULL;
return (0);
}
return (dtrace_getreg(lwp->lwp_regs, ndx));
}
#else
case DIF_VAR_UREGS: {
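/*
 * On FreeBSD, the user-level register state for the current thread
 * is found in its trap frame.
 */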
struct trapframe *tframe;
if (!dtrace_priv_proc(state))
return (0);
if ((tframe = curthread->td_frame) == NULL) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
cpu_core[curcpu].cpuc_dtrace_illval = 0;
return (0);
}
return (dtrace_getreg(tframe, ndx));
}
#endif
case DIF_VAR_CURTHREAD:
if (!dtrace_priv_kernel(state))
return (0);
return ((uint64_t)(uintptr_t)curthread);
case DIF_VAR_TIMESTAMP:
if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
mstate->dtms_timestamp = dtrace_gethrtime();
mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
}
return (mstate->dtms_timestamp);
case DIF_VAR_VTIMESTAMP:
ASSERT(dtrace_vtime_references != 0);
return (curthread->t_dtrace_vtime);
case DIF_VAR_WALLTIMESTAMP:
if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
mstate->dtms_walltimestamp = dtrace_gethrestime();
mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
}
return (mstate->dtms_walltimestamp);
#if defined(sun)
case DIF_VAR_IPL:
if (!dtrace_priv_kernel(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
mstate->dtms_ipl = dtrace_getipl();
mstate->dtms_present |= DTRACE_MSTATE_IPL;
}
return (mstate->dtms_ipl);
#endif
case DIF_VAR_EPID:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
return (mstate->dtms_epid);
case DIF_VAR_ID:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (mstate->dtms_probe->dtpr_id);
case DIF_VAR_STACKDEPTH:
if (!dtrace_priv_kernel(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
int aframes = mstate->dtms_probe->dtpr_aframes + 2;
mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
}
return (mstate->dtms_stackdepth);
case DIF_VAR_USTACKDEPTH:
if (!dtrace_priv_proc(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) &&
CPU_ON_INTR(CPU)) {
mstate->dtms_ustackdepth = 0;
} else {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
mstate->dtms_ustackdepth =
dtrace_getustackdepth();
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
}
return (mstate->dtms_ustackdepth);
case DIF_VAR_CALLER:
if (!dtrace_priv_kernel(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
int aframes = mstate->dtms_probe->dtpr_aframes + 2;
if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
/*
* If this is an unanchored probe, we are
* required to go through the slow path:
* dtrace_caller() only guarantees correct
* results for anchored probes.
*/
pc_t caller[2] = {0, 0};
dtrace_getpcstack(caller, 2, aframes,
(uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
mstate->dtms_caller = caller[1];
} else if ((mstate->dtms_caller =
dtrace_caller(aframes)) == -1) {
/*
* We have failed to do this the quick way;
* we must resort to the slower approach of
* calling dtrace_getpcstack().
*/
pc_t caller = 0;
dtrace_getpcstack(&caller, 1, aframes, NULL);
mstate->dtms_caller = caller;
}
mstate->dtms_present |= DTRACE_MSTATE_CALLER;
}
return (mstate->dtms_caller);
case DIF_VAR_UCALLER:
if (!dtrace_priv_proc(state))
return (0);
if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
uint64_t ustack[3];
/*
* dtrace_getupcstack() fills in the first uint64_t
* with the current PID. The second uint64_t will
* be the program counter at user-level. The third
* uint64_t will contain the caller, which is what
* we're after.
*/
ustack[2] = 0;
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_getupcstack(ustack, 3);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
mstate->dtms_ucaller = ustack[2];
mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
}
return (mstate->dtms_ucaller);
case DIF_VAR_PROBEPROV:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
state, mstate));
case DIF_VAR_PROBEMOD:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_mod,
state, mstate));
case DIF_VAR_PROBEFUNC:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_func,
state, mstate));
case DIF_VAR_PROBENAME:
ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
return (dtrace_dif_varstr(
(uintptr_t)mstate->dtms_probe->dtpr_name,
state, mstate));
case DIF_VAR_PID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* Note that we are assuming that an unanchored probe is
* always due to a high-level interrupt. (And we're assuming
* that there is only a single high level interrupt.)
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (pid0.pid_id);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* Further, it is always safe to dereference the p_pidp member
* of one's own proc structure. (These are truisms because
* threads and processes don't clean up their own state --
* they leave that task to whomever reaps them.)
*/
return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
#else
return ((uint64_t)curproc->p_pid);
#endif
case DIF_VAR_PPID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (pid0.pid_id);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*/
return ((uint64_t)curthread->t_procp->p_ppid);
#else
return ((uint64_t)curproc->p_pptr->p_pid);
#endif
case DIF_VAR_TID:
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (0);
#endif
return ((uint64_t)curthread->t_tid);
case DIF_VAR_EXECARGS: {
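/*
 * p_args holds the argument strings as a single block of
 * NUL-separated sub-strings; dtrace_dif_varstrz() copies the block
 * into scratch and converts the separators to spaces.
 */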
struct pargs *p_args = curthread->td_proc->p_args;
if (p_args == NULL)
return (0);
return (dtrace_dif_varstrz(
(uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
}
case DIF_VAR_EXECNAME:
#if defined(sun)
if (!dtrace_priv_proc(state))
return (0);
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*/
return (dtrace_dif_varstr(
(uintptr_t)curthread->t_procp->p_user.u_comm,
state, mstate));
#else
return (dtrace_dif_varstr(
(uintptr_t) curthread->td_proc->p_comm, state, mstate));
#endif
case DIF_VAR_ZONENAME:
#if defined(sun)
if (!dtrace_priv_proc(state))
return (0);
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*/
return (dtrace_dif_varstr(
(uintptr_t)curthread->t_procp->p_zone->zone_name,
state, mstate));
#else
return (0);
#endif
case DIF_VAR_UID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)p0.p_cred->cr_uid);
#endif
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*
* Additionally, it is safe to dereference one's own process
* credential, since this is never NULL after process birth.
*/
return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
case DIF_VAR_GID:
if (!dtrace_priv_proc(state))
return (0);
#if defined(sun)
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)p0.p_cred->cr_gid);
#endif
/*
* It is always safe to dereference one's own t_procp pointer:
* it always points to a valid, allocated proc structure.
* (This is true because threads don't clean up their own
* state -- they leave that task to whomever reaps them.)
*
* Additionally, it is safe to dereference one's own process
* credential, since this is never NULL after process birth.
*/
return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
case DIF_VAR_ERRNO: {
#if defined(sun)
klwp_t *lwp;
if (!dtrace_priv_proc(state))
return (0);
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return (0);
/*
* It is always safe to dereference one's own t_lwp pointer in
* the event that this pointer is non-NULL. (This is true
* because threads and lwps don't clean up their own state --
* they leave that task to whomever reaps them.)
*/
if ((lwp = curthread->t_lwp) == NULL)
return (0);
return ((uint64_t)lwp->lwp_errno);
#else
return (curthread->td_errno);
#endif
}
default:
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
return (0);
}
}
/*
* Emulate the execution of DIF subroutines invoked by the call opcode.
* Notice that we don't bother validating the proper number of arguments or
* their types in the tuple stack. This isn't needed because all argument
* interpretation is safe because of our load safety -- the worst that can
* happen is that a bogus program can obtain bogus results.
*/
static void
dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
dtrace_key_t *tupregs, int nargs,
dtrace_mstate_t *mstate, dtrace_state_t *state)
{
volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
dtrace_vstate_t *vstate = &state->dts_vstate;
#if defined(sun)
union {
mutex_impl_t mi;
uint64_t mx;
} m;
union {
krwlock_t ri;
uintptr_t rw;
} r;
#else
struct thread *lowner;
union {
struct lock_object *li;
uintptr_t lx;
} l;
#endif
switch (subr) {
case DIF_SUBR_RAND:
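/*
 * Derive a cheap pseudo-random value from the high-resolution
 * timestamp using a linear-congruential style step; this is not
 * intended to be cryptographically strong.
 */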
regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
break;
#if defined(sun)
case DIF_SUBR_MUTEX_OWNED:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
if (MUTEX_TYPE_ADAPTIVE(&m.mi))
regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
else
regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
break;
case DIF_SUBR_MUTEX_OWNER:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
else
regs[rd] = 0;
break;
case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
break;
case DIF_SUBR_MUTEX_TYPE_SPIN:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
m.mx = dtrace_load64(tupregs[0].dttk_value);
regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
break;
case DIF_SUBR_RW_READ_HELD: {
uintptr_t tmp;
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
r.rw = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = _RW_READ_HELD(&r.ri, tmp);
break;
}
case DIF_SUBR_RW_WRITE_HELD:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
r.rw = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = _RW_WRITE_HELD(&r.ri);
break;
case DIF_SUBR_RW_ISWRITER:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
r.rw = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = _RW_ISWRITER(&r.ri);
break;
#else
case DIF_SUBR_MUTEX_OWNED:
if (!dtrace_canload(tupregs[0].dttk_value,
sizeof (struct lock_object), mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
break;
case DIF_SUBR_MUTEX_OWNER:
if (!dtrace_canload(tupregs[0].dttk_value,
sizeof (struct lock_object), mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
regs[rd] = (uintptr_t)lowner;
break;
case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
/* XXX - should be only LC_SLEEPABLE? */
regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
(LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
break;
case DIF_SUBR_MUTEX_TYPE_SPIN:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
break;
case DIF_SUBR_RW_READ_HELD:
case DIF_SUBR_SX_SHARED_HELD:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
lowner == NULL;
break;
case DIF_SUBR_RW_WRITE_HELD:
case DIF_SUBR_SX_EXCLUSIVE_HELD:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr(tupregs[0].dttk_value);
LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
regs[rd] = (lowner == curthread);
break;
case DIF_SUBR_RW_ISWRITER:
case DIF_SUBR_SX_ISEXCLUSIVE:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
mstate, vstate)) {
regs[rd] = 0;
break;
}
l.lx = dtrace_loadptr(tupregs[0].dttk_value);
regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
lowner != NULL;
break;
#endif /* ! defined(sun) */
case DIF_SUBR_BCOPY: {
/*
* We need to be sure that the destination is in the scratch
* region -- no other region is allowed.
*/
uintptr_t src = tupregs[0].dttk_value;
uintptr_t dest = tupregs[1].dttk_value;
size_t size = tupregs[2].dttk_value;
if (!dtrace_inscratch(dest, size, mstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (!dtrace_canload(src, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
dtrace_bcopy((void *)src, (void *)dest, size);
break;
}
case DIF_SUBR_ALLOCA:
case DIF_SUBR_COPYIN: {
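/*
 * Round the scratch pointer up to an 8-byte boundary so that the
 * returned allocation is suitably aligned; scratch_size accounts
 * for both the alignment padding and the requested size.
 */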
uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
uint64_t size =
tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
/*
* This action doesn't require any credential checks since
* probes will not activate in user contexts to which the
* enabling user does not have permissions.
*/
/*
* Rounding up the user allocation size could have overflowed
* a large, bogus allocation (like -1ULL) to 0.
*/
if (scratch_size < size ||
!DTRACE_INSCRATCH(mstate, scratch_size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if (subr == DIF_SUBR_COPYIN) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
mstate->dtms_scratch_ptr += scratch_size;
regs[rd] = dest;
break;
}
case DIF_SUBR_COPYINTO: {
uint64_t size = tupregs[1].dttk_value;
uintptr_t dest = tupregs[2].dttk_value;
/*
* This action doesn't require any credential checks since
* probes will not activate in user contexts to which the
* enabling user does not have permissions.
*/
if (!dtrace_inscratch(dest, size, mstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
}
case DIF_SUBR_COPYINSTR: {
uintptr_t dest = mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
if (nargs > 1 && tupregs[1].dttk_value < size)
size = tupregs[1].dttk_value + 1;
/*
* This action doesn't require any credential checks since
* probes will not activate in user contexts to which the
* enabling user does not have permissions.
*/
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
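/* Guarantee NUL termination even if the string filled the buffer. */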
((char *)dest)[size - 1] = '\0';
mstate->dtms_scratch_ptr += size;
regs[rd] = dest;
break;
}
#if defined(sun)
case DIF_SUBR_MSGSIZE:
case DIF_SUBR_MSGDSIZE: {
uintptr_t baddr = tupregs[0].dttk_value, daddr;
uintptr_t wptr, rptr;
size_t count = 0;
int cont = 0;
while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
vstate)) {
regs[rd] = 0;
break;
}
wptr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_wptr));
rptr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_rptr));
if (wptr < rptr) {
*flags |= CPU_DTRACE_BADADDR;
*illval = tupregs[0].dttk_value;
break;
}
daddr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_datap));
baddr = dtrace_loadptr(baddr +
offsetof(mblk_t, b_cont));
/*
* We want to prevent against denial-of-service here,
* so we're only going to search the list for
* dtrace_msgdsize_max mblks.
*/
if (cont++ > dtrace_msgdsize_max) {
*flags |= CPU_DTRACE_ILLOP;
break;
}
if (subr == DIF_SUBR_MSGDSIZE) {
if (dtrace_load8(daddr +
offsetof(dblk_t, db_type)) != M_DATA)
continue;
}
count += wptr - rptr;
}
if (!(*flags & CPU_DTRACE_FAULT))
regs[rd] = count;
break;
}
#endif
case DIF_SUBR_PROGENYOF: {
pid_t pid = tupregs[0].dttk_value;
proc_t *p;
int rval = 0;
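/*
 * Walk up the chain of parent processes from the current process,
 * looking for the specified pid among our ancestors.
 */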
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
#if defined(sun)
if (p->p_pidp->pid_id == pid) {
#else
if (p->p_pid == pid) {
#endif
rval = 1;
break;
}
}
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
regs[rd] = rval;
break;
}
case DIF_SUBR_SPECULATION:
regs[rd] = dtrace_speculation(state);
break;
case DIF_SUBR_COPYOUT: {
uintptr_t kaddr = tupregs[0].dttk_value;
uintptr_t uaddr = tupregs[1].dttk_value;
uint64_t size = tupregs[2].dttk_value;
if (!dtrace_destructive_disallow &&
dtrace_priv_proc_control(state) &&
!dtrace_istoxic(kaddr, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyout(kaddr, uaddr, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
break;
}
case DIF_SUBR_COPYOUTSTR: {
uintptr_t kaddr = tupregs[0].dttk_value;
uintptr_t uaddr = tupregs[1].dttk_value;
uint64_t size = tupregs[2].dttk_value;
if (!dtrace_destructive_disallow &&
dtrace_priv_proc_control(state) &&
!dtrace_istoxic(kaddr, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyoutstr(kaddr, uaddr, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
}
break;
}
case DIF_SUBR_STRLEN: {
size_t sz;
uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
sz = dtrace_strlen((char *)addr,
state->dts_options[DTRACEOPT_STRSIZE]);
if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
regs[rd] = sz;
break;
}
case DIF_SUBR_STRCHR:
case DIF_SUBR_STRRCHR: {
/*
* We're going to iterate over the string looking for the
* specified character. We will iterate until we have reached
* the string length or we have found the character. If this
* is DIF_SUBR_STRRCHR, we will look for the last occurrence
* of the specified character instead of the first.
*/
uintptr_t saddr = tupregs[0].dttk_value;
uintptr_t addr = tupregs[0].dttk_value;
uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
char c, target = (char)tupregs[1].dttk_value;
for (regs[rd] = 0; addr < limit; addr++) {
if ((c = dtrace_load8(addr)) == target) {
regs[rd] = addr;
if (subr == DIF_SUBR_STRCHR)
break;
}
if (c == '\0')
break;
}
if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
regs[rd] = 0;
break;
}
break;
}
case DIF_SUBR_STRSTR:
case DIF_SUBR_INDEX:
case DIF_SUBR_RINDEX: {
/*
* We're going to iterate over the string looking for the
* specified string. We will iterate until we have reached
* the string length or we have found the string. (Yes, this
* is done in the most naive way possible -- but considering
* that the string we're searching for is likely to be
* relatively short, the complexity of Rabin-Karp or similar
* hardly seems merited.)
*/
char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
size_t len = dtrace_strlen(addr, size);
size_t sublen = dtrace_strlen(substr, size);
char *limit = addr + len, *orig = addr;
int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
int inc = 1;
regs[rd] = notfound;
if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
vstate)) {
regs[rd] = 0;
break;
}
/*
* strstr() and index()/rindex() have similar semantics if
* both strings are the empty string: strstr() returns a
* pointer to the (empty) string, and index() and rindex()
* both return index 0 (regardless of any position argument).
*/
if (sublen == 0 && len == 0) {
if (subr == DIF_SUBR_STRSTR)
regs[rd] = (uintptr_t)addr;
else
regs[rd] = 0;
break;
}
if (subr != DIF_SUBR_STRSTR) {
if (subr == DIF_SUBR_RINDEX) {
limit = orig - 1;
addr += len;
inc = -1;
}
/*
* Both index() and rindex() take an optional position
* argument that denotes the starting position.
*/
if (nargs == 3) {
int64_t pos = (int64_t)tupregs[2].dttk_value;
/*
* If the position argument to index() is
* negative, Perl implicitly clamps it at
* zero. This semantic is a little surprising
* given the special meaning of negative
* positions to similar Perl functions like
* substr(), but it appears to reflect a
* notion that index() can start from a
* negative index and increment its way up to
* the string. Given this notion, Perl's
* rindex() is at least self-consistent in
* that it implicitly clamps positions greater
* than the string length to be the string
* length. Where Perl completely loses
* coherence, however, is when the specified
* substring is the empty string (""). In
* this case, even if the position is
* negative, rindex() returns 0 -- and even if
* the position is greater than the length,
* index() returns the string length. These
* semantics violate the notion that index()
* should never return a value less than the
* specified position and that rindex() should
* never return a value greater than the
* specified position. (One assumes that
* these semantics are artifacts of Perl's
* implementation and not the results of
* deliberate design -- it beggars belief that
* even Larry Wall could desire such oddness.)
* While in the abstract one would wish for
* consistent position semantics across
* substr(), index() and rindex() -- or at the
* very least self-consistent position
* semantics for index() and rindex() -- we
* instead opt to keep with the extant Perl
* semantics, in all their broken glory. (Do
* we have more desire to maintain Perl's
* semantics than Perl does? Probably.)
*/
if (subr == DIF_SUBR_RINDEX) {
if (pos < 0) {
if (sublen == 0)
regs[rd] = 0;
break;
}
if (pos > len)
pos = len;
} else {
if (pos < 0)
pos = 0;
if (pos >= len) {
if (sublen == 0)
regs[rd] = len;
break;
}
}
addr = orig + pos;
}
}
for (regs[rd] = notfound; addr != limit; addr += inc) {
if (dtrace_strncmp(addr, substr, sublen) == 0) {
if (subr != DIF_SUBR_STRSTR) {
/*
* As D index() and rindex() are
* modeled on Perl (and not on awk),
* we return a zero-based (and not a
* one-based) index. (For you Perl
* weenies: no, we're not going to add
* $[ -- and shouldn't you be at a con
* or something?)
*/
regs[rd] = (uintptr_t)(addr - orig);
break;
}
ASSERT(subr == DIF_SUBR_STRSTR);
regs[rd] = (uintptr_t)addr;
break;
}
}
break;
}
case DIF_SUBR_STRTOK: {
uintptr_t addr = tupregs[0].dttk_value;
uintptr_t tokaddr = tupregs[1].dttk_value;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t limit, toklimit = tokaddr + size;
uint8_t c = 0, tokmap[32]; /* 256 / 8 */
char *dest = (char *)mstate->dtms_scratch_ptr;
int i;
/*
* Check both the token buffer and (later) the input buffer,
* since both could be non-scratch addresses.
*/
if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if (addr == 0) {
/*
* If the address specified is NULL, we use our saved
* strtok pointer from the mstate. Note that this
* means that the saved strtok pointer is _only_
* valid within multiple enablings of the same probe --
* it behaves like an implicit clause-local variable.
*/
addr = mstate->dtms_strtok;
} else {
/*
* If the user-specified address is non-NULL we must
* access check it. This is the only time we have
* a chance to do so, since this address may reside
* in the string table of this clause -- future calls
* (when we fetch addr from mstate->dtms_strtok)
* would fail this access check.
*/
if (!dtrace_strcanload(addr, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
}
/*
* First, zero the token map, and then process the token
* string -- setting a bit in the map for every character
* found in the token string.
*/
for (i = 0; i < sizeof (tokmap); i++)
tokmap[i] = 0;
for (; tokaddr < toklimit; tokaddr++) {
if ((c = dtrace_load8(tokaddr)) == '\0')
break;
ASSERT((c >> 3) < sizeof (tokmap));
tokmap[c >> 3] |= (1 << (c & 0x7));
}
for (limit = addr + size; addr < limit; addr++) {
/*
* We're looking for a character that is _not_ contained
* in the token string.
*/
if ((c = dtrace_load8(addr)) == '\0')
break;
if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
break;
}
if (c == '\0') {
/*
* We reached the end of the string without finding
* any character that was not in the token string.
* We return NULL in this case, and we set the saved
* address to NULL as well.
*/
regs[rd] = 0;
mstate->dtms_strtok = 0;
break;
}
/*
* From here on, we're copying into the destination string.
*/
for (i = 0; addr < limit && i < size - 1; addr++) {
if ((c = dtrace_load8(addr)) == '\0')
break;
if (tokmap[c >> 3] & (1 << (c & 0x7)))
break;
ASSERT(i < size);
dest[i++] = c;
}
ASSERT(i < size);
dest[i] = '\0';
regs[rd] = (uintptr_t)dest;
mstate->dtms_scratch_ptr += size;
mstate->dtms_strtok = addr;
break;
}
case DIF_SUBR_SUBSTR: {
uintptr_t s = tupregs[0].dttk_value;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
char *d = (char *)mstate->dtms_scratch_ptr;
int64_t index = (int64_t)tupregs[1].dttk_value;
int64_t remaining = (int64_t)tupregs[2].dttk_value;
size_t len = dtrace_strlen((char *)s, size);
int64_t i = 0;
if (!dtrace_canload(s, len + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if (nargs <= 2)
remaining = (int64_t)size;
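/*
 * A negative index counts back from the end of the string, as with
 * Perl's substr().
 */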
if (index < 0) {
index += len;
if (index < 0 && index + remaining > 0) {
remaining += index;
index = 0;
}
}
if (index >= len || index < 0) {
remaining = 0;
} else if (remaining < 0) {
remaining += len - index;
} else if (index + remaining > size) {
remaining = size - index;
}
for (i = 0; i < remaining; i++) {
if ((d[i] = dtrace_load8(s + index + i)) == '\0')
break;
}
d[i] = '\0';
mstate->dtms_scratch_ptr += size;
regs[rd] = (uintptr_t)d;
break;
}
#if defined(sun)
case DIF_SUBR_GETMAJOR:
#ifdef _LP64
regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
#else
regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
#endif
break;
case DIF_SUBR_GETMINOR:
#ifdef _LP64
regs[rd] = tupregs[0].dttk_value & MAXMIN64;
#else
regs[rd] = tupregs[0].dttk_value & MAXMIN;
#endif
break;
case DIF_SUBR_DDI_PATHNAME: {
/*
* This one is a galactic mess. We are going to roughly
* emulate ddi_pathname(), but it's made more complicated
* by the fact that we (a) want to include the minor name and
* (b) must proceed iteratively instead of recursively.
*/
uintptr_t dest = mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
char *start = (char *)dest, *end = start + size - 1;
uintptr_t daddr = tupregs[0].dttk_value;
int64_t minor = (int64_t)tupregs[1].dttk_value;
char *s;
int i, len, depth = 0;
/*
* Due to all the pointer jumping we do and context we must
* rely upon, we just mandate that the user must have kernel
* read privileges to use this routine.
*/
if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
*flags |= CPU_DTRACE_KPRIV;
*illval = daddr;
regs[rd] = 0;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
*end = '\0';
/*
* We want to have a name for the minor. In order to do this,
* we need to walk the minor list from the devinfo. We want
* to be sure that we don't infinitely walk a circular list,
* so we check for circularity by sending a scout pointer
* ahead two elements for every element that we iterate over;
* if the list is circular, these will ultimately point to the
* same element. You may recognize this little trick as the
* answer to a stupid interview question -- one that always
* seems to be asked by those who had to have it laboriously
* explained to them, and who can't even concisely describe
* the conditions under which one would be forced to resort to
* this technique. Needless to say, those conditions are
* found here -- and probably only here. Is this the only use
* of this infamous trick in shipping, production code? If it
* isn't, it probably should be...
*/
if (minor != -1) {
uintptr_t maddr = dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_minor));
uintptr_t next = offsetof(struct ddi_minor_data, next);
uintptr_t name = offsetof(struct ddi_minor_data,
d_minor) + offsetof(struct ddi_minor, name);
uintptr_t dev = offsetof(struct ddi_minor_data,
d_minor) + offsetof(struct ddi_minor, dev);
uintptr_t scout;
if (maddr != NULL)
scout = dtrace_loadptr(maddr + next);
while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
uint64_t m;
#ifdef _LP64
m = dtrace_load64(maddr + dev) & MAXMIN64;
#else
m = dtrace_load32(maddr + dev) & MAXMIN;
#endif
if (m != minor) {
maddr = dtrace_loadptr(maddr + next);
if (scout == NULL)
continue;
scout = dtrace_loadptr(scout + next);
if (scout == NULL)
continue;
scout = dtrace_loadptr(scout + next);
if (scout == NULL)
continue;
if (scout == maddr) {
*flags |= CPU_DTRACE_ILLOP;
break;
}
continue;
}
/*
* We have the minor data. Now we need to
* copy the minor's name into the end of the
* pathname.
*/
s = (char *)dtrace_loadptr(maddr + name);
len = dtrace_strlen(s, size);
if (*flags & CPU_DTRACE_FAULT)
break;
if (len != 0) {
if ((end -= (len + 1)) < start)
break;
*end = ':';
}
for (i = 1; i <= len; i++)
end[i] = dtrace_load8((uintptr_t)s++);
break;
}
}
while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
ddi_node_state_t devi_state;
devi_state = dtrace_load32(daddr +
offsetof(struct dev_info, devi_node_state));
if (*flags & CPU_DTRACE_FAULT)
break;
if (devi_state >= DS_INITIALIZED) {
s = (char *)dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_addr));
len = dtrace_strlen(s, size);
if (*flags & CPU_DTRACE_FAULT)
break;
if (len != 0) {
if ((end -= (len + 1)) < start)
break;
*end = '@';
}
for (i = 1; i <= len; i++)
end[i] = dtrace_load8((uintptr_t)s++);
}
/*
* Now for the node name...
*/
s = (char *)dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_node_name));
daddr = dtrace_loadptr(daddr +
offsetof(struct dev_info, devi_parent));
/*
* If our parent is NULL (that is, if we're the root
* node), we're going to use the special path
* "devices".
*/
if (daddr == 0)
s = "devices";
len = dtrace_strlen(s, size);
if (*flags & CPU_DTRACE_FAULT)
break;
if ((end -= (len + 1)) < start)
break;
for (i = 1; i <= len; i++)
end[i] = dtrace_load8((uintptr_t)s++);
*end = '/';
if (depth++ > dtrace_devdepth_max) {
*flags |= CPU_DTRACE_ILLOP;
break;
}
}
if (end < start)
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
if (daddr == 0) {
regs[rd] = (uintptr_t)end;
mstate->dtms_scratch_ptr += size;
}
break;
}
#endif
case DIF_SUBR_STRJOIN: {
char *d = (char *)mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t s1 = tupregs[0].dttk_value;
uintptr_t s2 = tupregs[1].dttk_value;
int i = 0;
if (!dtrace_strcanload(s1, size, mstate, vstate) ||
!dtrace_strcanload(s2, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
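/*
 * Copy the first string up to (but not including) its terminating
 * NUL, then append the second string (including its NUL). If the
 * concatenation would exceed the string size, flag NOSCRATCH and
 * fail.
 */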
for (;;) {
if (i >= size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if ((d[i++] = dtrace_load8(s1++)) == '\0') {
i--;
break;
}
}
for (;;) {
if (i >= size) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
if ((d[i++] = dtrace_load8(s2++)) == '\0')
break;
}
if (i < size) {
mstate->dtms_scratch_ptr += i;
regs[rd] = (uintptr_t)d;
}
break;
}
case DIF_SUBR_LLTOSTR: {
int64_t i = (int64_t)tupregs[0].dttk_value;
int64_t val = i < 0 ? i * -1 : i;
uint64_t size = 22; /* enough room for 2^64 in decimal */
char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
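/*
 * Generate the decimal digits least-significant first, working
 * backwards from the end of the scratch buffer; the result pointer
 * is the first character actually written.
 */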
for (*end-- = '\0'; val; val /= 10)
*end-- = '0' + (val % 10);
if (i == 0)
*end-- = '0';
if (i < 0)
*end-- = '-';
regs[rd] = (uintptr_t)end + 1;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_HTONS:
case DIF_SUBR_NTOHS:
#if BYTE_ORDER == BIG_ENDIAN
regs[rd] = (uint16_t)tupregs[0].dttk_value;
#else
regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
#endif
break;
case DIF_SUBR_HTONL:
case DIF_SUBR_NTOHL:
#if BYTE_ORDER == BIG_ENDIAN
regs[rd] = (uint32_t)tupregs[0].dttk_value;
#else
regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
#endif
break;
case DIF_SUBR_HTONLL:
case DIF_SUBR_NTOHLL:
#if BYTE_ORDER == BIG_ENDIAN
regs[rd] = (uint64_t)tupregs[0].dttk_value;
#else
regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
#endif
break;
case DIF_SUBR_DIRNAME:
case DIF_SUBR_BASENAME: {
char *dest = (char *)mstate->dtms_scratch_ptr;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t src = tupregs[0].dttk_value;
int i, j, len = dtrace_strlen((char *)src, size);
int lastbase = -1, firstbase = -1, lastdir = -1;
int start, end;
if (!dtrace_canload(src, len + 1, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
/*
* The basename and dirname for a zero-length string are
* defined to be "."
*/
if (len == 0) {
len = 1;
src = (uintptr_t)".";
}
/*
* Start from the back of the string, moving back toward the
* front until we see a character that isn't a slash. That
* character is the last character in the basename.
*/
for (i = len - 1; i >= 0; i--) {
if (dtrace_load8(src + i) != '/')
break;
}
if (i >= 0)
lastbase = i;
/*
* Starting from the last character in the basename, move
* towards the front until we find a slash. The character
* that we processed immediately before that is the first
* character in the basename.
*/
for (; i >= 0; i--) {
if (dtrace_load8(src + i) == '/')
break;
}
if (i >= 0)
firstbase = i + 1;
/*
* Now keep going until we find a non-slash character. That
* character is the last character in the dirname.
*/
for (; i >= 0; i--) {
if (dtrace_load8(src + i) != '/')
break;
}
if (i >= 0)
lastdir = i;
ASSERT(!(lastbase == -1 && firstbase != -1));
ASSERT(!(firstbase == -1 && lastdir != -1));
if (lastbase == -1) {
/*
* We didn't find a non-slash character. We know that
* the length is non-zero, so the whole string must be
* slashes. In either the dirname or the basename
* case, we return '/'.
*/
ASSERT(firstbase == -1);
firstbase = lastbase = lastdir = 0;
}
if (firstbase == -1) {
/*
* The entire string consists only of a basename
* component. If we're looking for dirname, we need
* to change our string to be just "."; if we're
* looking for a basename, we'll just set the first
* character of the basename to be 0.
*/
if (subr == DIF_SUBR_DIRNAME) {
ASSERT(lastdir == -1);
src = (uintptr_t)".";
lastdir = 0;
} else {
firstbase = 0;
}
}
if (subr == DIF_SUBR_DIRNAME) {
if (lastdir == -1) {
/*
* We know that we have a slash in the name --
* or lastdir would be set to 0, above. And
* because lastdir is -1, we know that this
* slash must be the first character. (That
* is, the full string must be of the form
* "/basename".) In this case, the last
* character of the directory name is 0.
*/
lastdir = 0;
}
start = 0;
end = lastdir;
} else {
ASSERT(subr == DIF_SUBR_BASENAME);
ASSERT(firstbase != -1 && lastbase != -1);
start = firstbase;
end = lastbase;
}
for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
dest[j] = dtrace_load8(src + i);
dest[j] = '\0';
regs[rd] = (uintptr_t)dest;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_CLEANPATH: {
char *dest = (char *)mstate->dtms_scratch_ptr, c;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t src = tupregs[0].dttk_value;
int i = 0, j = 0;
if (!dtrace_strcanload(src, size, mstate, vstate)) {
regs[rd] = 0;
break;
}
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
/*
* Move forward, loading each character.
*/
do {
c = dtrace_load8(src + i++);
next:
if (j + 5 >= size) /* 5 = sizeof ("/..c") */
break;
if (c != '/') {
dest[j++] = c;
continue;
}
c = dtrace_load8(src + i++);
if (c == '/') {
/*
* We have two slashes -- we can just advance
* to the next character.
*/
goto next;
}
if (c != '.') {
/*
* This is not "." and it's not ".." -- we can
* just store the "/" and this character and
* drive on.
*/
dest[j++] = '/';
dest[j++] = c;
continue;
}
c = dtrace_load8(src + i++);
if (c == '/') {
/*
* This is a "/./" component. We're not going
* to store anything in the destination buffer;
* we're just going to go to the next component.
*/
goto next;
}
if (c != '.') {
/*
* This is not ".." -- we can just store the
* "/." and this character and continue
* processing.
*/
dest[j++] = '/';
dest[j++] = '.';
dest[j++] = c;
continue;
}
c = dtrace_load8(src + i++);
if (c != '/' && c != '\0') {
/*
* This is not ".." -- it's "..[mumble]".
* We'll store the "/.." and this character
* and continue processing.
*/
dest[j++] = '/';
dest[j++] = '.';
dest[j++] = '.';
dest[j++] = c;
continue;
}
/*
* This is "/../" or "/..\0". We need to back up
* our destination pointer until we find a "/".
*/
i--;
while (j != 0 && dest[--j] != '/')
continue;
if (c == '\0')
dest[++j] = '/';
} while (c != '\0');
dest[j] = '\0';
regs[rd] = (uintptr_t)dest;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_INET_NTOA:
case DIF_SUBR_INET_NTOA6:
case DIF_SUBR_INET_NTOP: {
size_t size;
int af, argi, i;
char *base, *end;
if (subr == DIF_SUBR_INET_NTOP) {
af = (int)tupregs[0].dttk_value;
argi = 1;
} else {
af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
argi = 0;
}
if (af == AF_INET) {
ipaddr_t ip4;
uint8_t *ptr8, val;
/*
* Safely load the IPv4 address.
*/
ip4 = dtrace_load32(tupregs[argi].dttk_value);
/*
* Check an IPv4 string will fit in scratch.
*/
size = INET_ADDRSTRLEN;
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
base = (char *)mstate->dtms_scratch_ptr;
end = (char *)mstate->dtms_scratch_ptr + size - 1;
/*
* Stringify as a dotted decimal quad.
*/
*end-- = '\0';
ptr8 = (uint8_t *)&ip4;
for (i = 3; i >= 0; i--) {
val = ptr8[i];
if (val == 0) {
*end-- = '0';
} else {
for (; val; val /= 10) {
*end-- = '0' + (val % 10);
}
}
if (i > 0)
*end-- = '.';
}
ASSERT(end + 1 >= base);
} else if (af == AF_INET6) {
struct in6_addr ip6;
int firstzero, tryzero, numzero, v6end;
uint16_t val;
const char digits[] = "0123456789abcdef";
/*
* Stringify using RFC 1884 convention 2 - 16 bit
* hexadecimal values with a zero-run compression.
* Lower case hexadecimal digits are used.
* eg, fe80::214:4fff:fe0b:76c8.
* The IPv4 embedded form is returned for inet_ntop,
* just the IPv4 string is returned for inet_ntoa6.
*/
/*
* Safely load the IPv6 address.
*/
dtrace_bcopy(
(void *)(uintptr_t)tupregs[argi].dttk_value,
(void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
/*
* Check an IPv6 string will fit in scratch.
*/
size = INET6_ADDRSTRLEN;
if (!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
base = (char *)mstate->dtms_scratch_ptr;
end = (char *)mstate->dtms_scratch_ptr + size - 1;
*end-- = '\0';
/*
* Find the longest run of 16 bit zero values
* for the single allowed zero compression - "::".
*/
firstzero = -1;
tryzero = -1;
numzero = 1;
for (i = 0; i < sizeof (struct in6_addr); i++) {
#if defined(sun)
if (ip6._S6_un._S6_u8[i] == 0 &&
#else
if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
#endif
tryzero == -1 && i % 2 == 0) {
tryzero = i;
continue;
}
if (tryzero != -1 &&
#if defined(sun)
(ip6._S6_un._S6_u8[i] != 0 ||
#else
(ip6.__u6_addr.__u6_addr8[i] != 0 ||
#endif
i == sizeof (struct in6_addr) - 1)) {
if (i - tryzero <= numzero) {
tryzero = -1;
continue;
}
firstzero = tryzero;
numzero = i - i % 2 - tryzero;
tryzero = -1;
#if defined(sun)
if (ip6._S6_un._S6_u8[i] == 0 &&
#else
if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
#endif
i == sizeof (struct in6_addr) - 1)
numzero += 2;
}
}
ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
/*
* Check for an IPv4 embedded address.
*/
v6end = sizeof (struct in6_addr) - 2;
if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
IN6_IS_ADDR_V4COMPAT(&ip6)) {
for (i = sizeof (struct in6_addr) - 1;
i >= DTRACE_V4MAPPED_OFFSET; i--) {
ASSERT(end >= base);
#if defined(sun)
val = ip6._S6_un._S6_u8[i];
#else
val = ip6.__u6_addr.__u6_addr8[i];
#endif
if (val == 0) {
*end-- = '0';
} else {
for (; val; val /= 10) {
*end-- = '0' + val % 10;
}
}
if (i > DTRACE_V4MAPPED_OFFSET)
*end-- = '.';
}
if (subr == DIF_SUBR_INET_NTOA6)
goto inetout;
/*
* Set v6end to skip the IPv4 address that
* we have already stringified.
*/
v6end = 10;
}
/*
* Build the IPv6 string by working through the
* address in reverse.
*/
for (i = v6end; i >= 0; i -= 2) {
ASSERT(end >= base);
if (i == firstzero + numzero - 2) {
*end-- = ':';
*end-- = ':';
i -= numzero - 2;
continue;
}
if (i < 14 && i != firstzero - 2)
*end-- = ':';
#if defined(sun)
val = (ip6._S6_un._S6_u8[i] << 8) +
ip6._S6_un._S6_u8[i + 1];
#else
val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
ip6.__u6_addr.__u6_addr8[i + 1];
#endif
if (val == 0) {
*end-- = '0';
} else {
for (; val; val /= 16) {
*end-- = digits[val % 16];
}
}
}
ASSERT(end + 1 >= base);
} else {
/*
* The user didn't use AF_INET or AF_INET6.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
regs[rd] = 0;
break;
}
inetout: regs[rd] = (uintptr_t)end + 1;
mstate->dtms_scratch_ptr += size;
break;
}
case DIF_SUBR_MEMREF: {
uintptr_t size = 2 * sizeof(uintptr_t);
uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
/* address and length */
memref[0] = tupregs[0].dttk_value;
memref[1] = tupregs[1].dttk_value;
regs[rd] = (uintptr_t) memref;
mstate->dtms_scratch_ptr += scratch_size;
break;
}
case DIF_SUBR_TYPEREF: {
uintptr_t size = 4 * sizeof(uintptr_t);
uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
/* address, num_elements, type_str, type_len */
typeref[0] = tupregs[0].dttk_value;
typeref[1] = tupregs[1].dttk_value;
typeref[2] = tupregs[2].dttk_value;
typeref[3] = tupregs[3].dttk_value;
regs[rd] = (uintptr_t) typeref;
mstate->dtms_scratch_ptr += scratch_size;
break;
}
}
}
/*
* Emulate the execution of DTrace IR instructions specified by the given
* DIF object. This function is deliberately void of assertions as all of
* the necessary checks are handled by a call to dtrace_difo_validate().
*/
static uint64_t
dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
dtrace_vstate_t *vstate, dtrace_state_t *state)
{
const dif_instr_t *text = difo->dtdo_buf;
const uint_t textlen = difo->dtdo_len;
const char *strtab = difo->dtdo_strtab;
const uint64_t *inttab = difo->dtdo_inttab;
uint64_t rval = 0;
dtrace_statvar_t *svar;
dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
dtrace_difv_t *v;
volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
uint64_t regs[DIF_DIR_NREGS];
uint64_t *tmp;
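/*
 * Condition-code flags (negative, zero, overflow, carry) set by the
 * cmp, tst and scmp opcodes and consumed by the conditional branches.
 */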
uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
int64_t cc_r;
uint_t pc = 0, id, opc = 0;
uint8_t ttop = 0;
dif_instr_t instr;
uint_t r1, r2, rd;
/*
* We stash the current DIF object into the machine state: we need it
* for subsequent access checking.
*/
mstate->dtms_difo = difo;
regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
opc = pc;
instr = text[pc++];
r1 = DIF_INSTR_R1(instr);
r2 = DIF_INSTR_R2(instr);
rd = DIF_INSTR_RD(instr);
switch (DIF_INSTR_OP(instr)) {
case DIF_OP_OR:
regs[rd] = regs[r1] | regs[r2];
break;
case DIF_OP_XOR:
regs[rd] = regs[r1] ^ regs[r2];
break;
case DIF_OP_AND:
regs[rd] = regs[r1] & regs[r2];
break;
case DIF_OP_SLL:
regs[rd] = regs[r1] << regs[r2];
break;
case DIF_OP_SRL:
regs[rd] = regs[r1] >> regs[r2];
break;
case DIF_OP_SUB:
regs[rd] = regs[r1] - regs[r2];
break;
case DIF_OP_ADD:
regs[rd] = regs[r1] + regs[r2];
break;
case DIF_OP_MUL:
regs[rd] = regs[r1] * regs[r2];
break;
case DIF_OP_SDIV:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = (int64_t)regs[r1] /
(int64_t)regs[r2];
}
break;
case DIF_OP_UDIV:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = regs[r1] / regs[r2];
}
break;
case DIF_OP_SREM:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = (int64_t)regs[r1] %
(int64_t)regs[r2];
}
break;
case DIF_OP_UREM:
if (regs[r2] == 0) {
regs[rd] = 0;
*flags |= CPU_DTRACE_DIVZERO;
} else {
regs[rd] = regs[r1] % regs[r2];
}
break;
case DIF_OP_NOT:
regs[rd] = ~regs[r1];
break;
case DIF_OP_MOV:
regs[rd] = regs[r1];
break;
case DIF_OP_CMP:
cc_r = regs[r1] - regs[r2];
cc_n = cc_r < 0;
cc_z = cc_r == 0;
cc_v = 0;
cc_c = regs[r1] < regs[r2];
break;
case DIF_OP_TST:
cc_n = cc_v = cc_c = 0;
cc_z = regs[r1] == 0;
break;
case DIF_OP_BA:
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BE:
if (cc_z)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BNE:
if (cc_z == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BG:
if ((cc_z | (cc_n ^ cc_v)) == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BGU:
if ((cc_c | cc_z) == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BGE:
if ((cc_n ^ cc_v) == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BGEU:
if (cc_c == 0)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BL:
if (cc_n ^ cc_v)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BLU:
if (cc_c)
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BLE:
if (cc_z | (cc_n ^ cc_v))
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_BLEU:
if (cc_c | cc_z)
pc = DIF_INSTR_LABEL(instr);
break;
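/*
 * The RLD* variants first verify that the source address lies within
 * memory that DTrace itself controls; on failure they flag KPRIV and
 * record the offending address, otherwise they fall through to the
 * corresponding unchecked load.
 */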
case DIF_OP_RLDSB:
if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDSB:
regs[rd] = (int8_t)dtrace_load8(regs[r1]);
break;
case DIF_OP_RLDSH:
if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDSH:
regs[rd] = (int16_t)dtrace_load16(regs[r1]);
break;
case DIF_OP_RLDSW:
if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDSW:
regs[rd] = (int32_t)dtrace_load32(regs[r1]);
break;
case DIF_OP_RLDUB:
if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDUB:
regs[rd] = dtrace_load8(regs[r1]);
break;
case DIF_OP_RLDUH:
if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDUH:
regs[rd] = dtrace_load16(regs[r1]);
break;
case DIF_OP_RLDUW:
if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDUW:
regs[rd] = dtrace_load32(regs[r1]);
break;
case DIF_OP_RLDX:
if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
*flags |= CPU_DTRACE_KPRIV;
*illval = regs[r1];
break;
}
/*FALLTHROUGH*/
case DIF_OP_LDX:
regs[rd] = dtrace_load64(regs[r1]);
break;
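/*
 * The ULD* opcodes load from the user address space via the
 * dtrace_fuword*() routines.
 */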
case DIF_OP_ULDSB:
regs[rd] = (int8_t)
dtrace_fuword8((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDSH:
regs[rd] = (int16_t)
dtrace_fuword16((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDSW:
regs[rd] = (int32_t)
dtrace_fuword32((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDUB:
regs[rd] =
dtrace_fuword8((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDUH:
regs[rd] =
dtrace_fuword16((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDUW:
regs[rd] =
dtrace_fuword32((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_ULDX:
regs[rd] =
dtrace_fuword64((void *)(uintptr_t)regs[r1]);
break;
case DIF_OP_RET:
rval = regs[rd];
pc = textlen;
break;
case DIF_OP_NOP:
break;
case DIF_OP_SETX:
regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
break;
case DIF_OP_SETS:
regs[rd] = (uint64_t)(uintptr_t)
(strtab + DIF_INSTR_STRING(instr));
break;
case DIF_OP_SCMP: {
size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t s1 = regs[r1];
uintptr_t s2 = regs[r2];
if (s1 != 0 &&
!dtrace_strcanload(s1, sz, mstate, vstate))
break;
if (s2 != 0 &&
!dtrace_strcanload(s2, sz, mstate, vstate))
break;
cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
cc_n = cc_r < 0;
cc_z = cc_r == 0;
cc_v = cc_c = 0;
break;
}
case DIF_OP_LDGA:
regs[rd] = dtrace_dif_variable(mstate, state,
r1, regs[r2]);
break;
case DIF_OP_LDGS:
id = DIF_INSTR_VAR(instr);
if (id >= DIF_VAR_OTHER_UBASE) {
uintptr_t a;
id -= DIF_VAR_OTHER_UBASE;
svar = vstate->dtvs_globals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
regs[rd] = svar->dtsv_data;
break;
}
a = (uintptr_t)svar->dtsv_data;
if (*(uint8_t *)a == UINT8_MAX) {
/*
* If the 0th byte is set to UINT8_MAX
* then this is to be treated as a
* reference to a NULL variable.
*/
regs[rd] = 0;
} else {
regs[rd] = a + sizeof (uint64_t);
}
break;
}
regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
break;
case DIF_OP_STGS:
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
svar = vstate->dtvs_globals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
uintptr_t a = (uintptr_t)svar->dtsv_data;
ASSERT(a != 0);
ASSERT(svar->dtsv_size != 0);
if (regs[rd] == 0) {
*(uint8_t *)a = UINT8_MAX;
break;
} else {
*(uint8_t *)a = 0;
a += sizeof (uint64_t);
}
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd], &v->dtdv_type,
mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
(void *)a, &v->dtdv_type);
break;
}
svar->dtsv_data = regs[rd];
break;
case DIF_OP_LDTA:
/*
* There are no DTrace built-in thread-local arrays at
* present. This opcode is saved for future work.
*/
*flags |= CPU_DTRACE_ILLOP;
regs[rd] = 0;
break;
case DIF_OP_LDLS:
id = DIF_INSTR_VAR(instr);
if (id < DIF_VAR_OTHER_UBASE) {
/*
* For now, this has no meaning.
*/
regs[rd] = 0;
break;
}
id -= DIF_VAR_OTHER_UBASE;
ASSERT(id < vstate->dtvs_nlocals);
ASSERT(vstate->dtvs_locals != NULL);
svar = vstate->dtvs_locals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
uintptr_t a = (uintptr_t)svar->dtsv_data;
size_t sz = v->dtdv_type.dtdt_size;
sz += sizeof (uint64_t);
ASSERT(svar->dtsv_size == NCPU * sz);
a += curcpu * sz;
if (*(uint8_t *)a == UINT8_MAX) {
/*
* If the 0th byte is set to UINT8_MAX
* then this is to be treated as a
* reference to a NULL variable.
*/
regs[rd] = 0;
} else {
regs[rd] = a + sizeof (uint64_t);
}
break;
}
ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
regs[rd] = tmp[curcpu];
break;
case DIF_OP_STLS:
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
ASSERT(id < vstate->dtvs_nlocals);
ASSERT(vstate->dtvs_locals != NULL);
svar = vstate->dtvs_locals[id];
ASSERT(svar != NULL);
v = &svar->dtsv_var;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
uintptr_t a = (uintptr_t)svar->dtsv_data;
size_t sz = v->dtdv_type.dtdt_size;
sz += sizeof (uint64_t);
ASSERT(svar->dtsv_size == NCPU * sz);
a += curcpu * sz;
if (regs[rd] == 0) {
*(uint8_t *)a = UINT8_MAX;
break;
} else {
*(uint8_t *)a = 0;
a += sizeof (uint64_t);
}
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd], &v->dtdv_type,
mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
(void *)a, &v->dtdv_type);
break;
}
ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
tmp[curcpu] = regs[rd];
break;
case DIF_OP_LDTS: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
v = &vstate->dtvs_tlocals[id];
key = &tupregs[DIF_DTR_NREGS];
key[0].dttk_value = (uint64_t)id;
key[0].dttk_size = 0;
DTRACE_TLS_THRKEY(key[1].dttk_value);
key[1].dttk_size = 0;
dvar = dtrace_dynvar(dstate, 2, key,
sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
mstate, vstate);
if (dvar == NULL) {
regs[rd] = 0;
break;
}
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
} else {
regs[rd] = *((uint64_t *)dvar->dtdv_data);
}
break;
}
case DIF_OP_STTS: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
key = &tupregs[DIF_DTR_NREGS];
key[0].dttk_value = (uint64_t)id;
key[0].dttk_size = 0;
DTRACE_TLS_THRKEY(key[1].dttk_value);
key[1].dttk_size = 0;
v = &vstate->dtvs_tlocals[id];
dvar = dtrace_dynvar(dstate, 2, key,
v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
v->dtdv_type.dtdt_size : sizeof (uint64_t),
regs[rd] ? DTRACE_DYNVAR_ALLOC :
DTRACE_DYNVAR_DEALLOC, mstate, vstate);
/*
* Given that we're storing to thread-local data,
* we need to flush our predicate cache.
*/
curthread->t_predcache = 0;
if (dvar == NULL)
break;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd],
&v->dtdv_type, mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
dvar->dtdv_data, &v->dtdv_type);
} else {
*((uint64_t *)dvar->dtdv_data) = regs[rd];
}
break;
}
case DIF_OP_SRA:
regs[rd] = (int64_t)regs[r1] >> regs[r2];
break;
case DIF_OP_CALL:
dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
regs, tupregs, ttop, mstate, state);
break;
case DIF_OP_PUSHTR:
if (ttop == DIF_DTR_NREGS) {
*flags |= CPU_DTRACE_TUPOFLOW;
break;
}
if (r1 == DIF_TYPE_STRING) {
/*
* If this is a string type and the size is 0,
* we'll use the system-wide default string
* size. Note that we are _not_ looking at
* the value of the DTRACEOPT_STRSIZE option;
* had this been set, we would expect to have
* a non-zero size value in the "pushtr".
*/
tupregs[ttop].dttk_size =
dtrace_strlen((char *)(uintptr_t)regs[rd],
regs[r2] ? regs[r2] :
dtrace_strsize_default) + 1;
} else {
tupregs[ttop].dttk_size = regs[r2];
}
tupregs[ttop++].dttk_value = regs[rd];
break;
case DIF_OP_PUSHTV:
if (ttop == DIF_DTR_NREGS) {
*flags |= CPU_DTRACE_TUPOFLOW;
break;
}
tupregs[ttop].dttk_value = regs[rd];
tupregs[ttop++].dttk_size = 0;
break;
case DIF_OP_POPTS:
if (ttop != 0)
ttop--;
break;
case DIF_OP_FLUSHTS:
ttop = 0;
break;
case DIF_OP_LDGAA:
case DIF_OP_LDTAA: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key = tupregs;
uint_t nkeys = ttop;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
key[nkeys].dttk_value = (uint64_t)id;
key[nkeys++].dttk_size = 0;
if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
key[nkeys++].dttk_size = 0;
v = &vstate->dtvs_tlocals[id];
} else {
v = &vstate->dtvs_globals[id]->dtsv_var;
}
dvar = dtrace_dynvar(dstate, nkeys, key,
v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
v->dtdv_type.dtdt_size : sizeof (uint64_t),
DTRACE_DYNVAR_NOALLOC, mstate, vstate);
if (dvar == NULL) {
regs[rd] = 0;
break;
}
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
} else {
regs[rd] = *((uint64_t *)dvar->dtdv_data);
}
break;
}
case DIF_OP_STGAA:
case DIF_OP_STTAA: {
dtrace_dynvar_t *dvar;
dtrace_key_t *key = tupregs;
uint_t nkeys = ttop;
id = DIF_INSTR_VAR(instr);
ASSERT(id >= DIF_VAR_OTHER_UBASE);
id -= DIF_VAR_OTHER_UBASE;
key[nkeys].dttk_value = (uint64_t)id;
key[nkeys++].dttk_size = 0;
if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
key[nkeys++].dttk_size = 0;
v = &vstate->dtvs_tlocals[id];
} else {
v = &vstate->dtvs_globals[id]->dtsv_var;
}
dvar = dtrace_dynvar(dstate, nkeys, key,
v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
v->dtdv_type.dtdt_size : sizeof (uint64_t),
regs[rd] ? DTRACE_DYNVAR_ALLOC :
DTRACE_DYNVAR_DEALLOC, mstate, vstate);
if (dvar == NULL)
break;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
if (!dtrace_vcanload(
(void *)(uintptr_t)regs[rd], &v->dtdv_type,
mstate, vstate))
break;
dtrace_vcopy((void *)(uintptr_t)regs[rd],
dvar->dtdv_data, &v->dtdv_type);
} else {
*((uint64_t *)dvar->dtdv_data) = regs[rd];
}
break;
}
case DIF_OP_ALLOCS: {
uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
/*
* Rounding up the user allocation size could have
* overflowed large, bogus allocations (like -1ULL) to
* 0.
*/
if (size < regs[r1] ||
!DTRACE_INSCRATCH(mstate, size)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
regs[rd] = 0;
break;
}
dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
mstate->dtms_scratch_ptr += size;
regs[rd] = ptr;
break;
}
case DIF_OP_COPYS:
if (!dtrace_canstore(regs[rd], regs[r2],
mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
break;
dtrace_bcopy((void *)(uintptr_t)regs[r1],
(void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
break;
case DIF_OP_STB:
if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
*((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
break;
case DIF_OP_STH:
if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (regs[rd] & 1) {
*flags |= CPU_DTRACE_BADALIGN;
*illval = regs[rd];
break;
}
*((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
break;
case DIF_OP_STW:
if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (regs[rd] & 3) {
*flags |= CPU_DTRACE_BADALIGN;
*illval = regs[rd];
break;
}
*((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
break;
case DIF_OP_STX:
if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
*flags |= CPU_DTRACE_BADADDR;
*illval = regs[rd];
break;
}
if (regs[rd] & 7) {
*flags |= CPU_DTRACE_BADALIGN;
*illval = regs[rd];
break;
}
*((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
break;
}
}
if (!(*flags & CPU_DTRACE_FAULT))
return (rval);
mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
return (0);
}
static void
dtrace_action_breakpoint(dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
dtrace_provider_t *prov = probe->dtpr_provider;
char c[DTRACE_FULLNAMELEN + 80], *str;
char *msg = "dtrace: breakpoint action at probe ";
char *ecbmsg = " (ecb ";
uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
uintptr_t val = (uintptr_t)ecb;
int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
if (dtrace_destructive_disallow)
return;
/*
* It's impossible to be taking action on the NULL probe.
*/
ASSERT(probe != NULL);
/*
* This is a poor man's (destitute man's?) sprintf(): we want to
* print the provider name, module name, function name and name of
* the probe, along with the hex address of the ECB with the breakpoint
* action -- all of which we must place in the character buffer by
* hand.
*/
while (*msg != '\0')
c[i++] = *msg++;
for (str = prov->dtpv_name; *str != '\0'; str++)
c[i++] = *str;
c[i++] = ':';
for (str = probe->dtpr_mod; *str != '\0'; str++)
c[i++] = *str;
c[i++] = ':';
for (str = probe->dtpr_func; *str != '\0'; str++)
c[i++] = *str;
c[i++] = ':';
for (str = probe->dtpr_name; *str != '\0'; str++)
c[i++] = *str;
while (*ecbmsg != '\0')
c[i++] = *ecbmsg++;
while (shift >= 0) {
mask = (uintptr_t)0xf << shift;
if (val >= ((uintptr_t)1 << shift))
c[i++] = "0123456789abcdef"[(val & mask) >> shift];
shift -= 4;
}
c[i++] = ')';
c[i] = '\0';
#if defined(sun)
debug_enter(c);
#else
kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
#endif
}
static void
dtrace_action_panic(dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
/*
* It's impossible to be taking action on the NULL probe.
*/
ASSERT(probe != NULL);
if (dtrace_destructive_disallow)
return;
if (dtrace_panicked != NULL)
return;
if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
return;
/*
* We won the right to panic. (We want to be sure that only one
* thread calls panic() from dtrace_probe(), and that panic() is
* called exactly once.)
*/
dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
probe->dtpr_func, probe->dtpr_name, (void *)ecb);
}
static void
dtrace_action_raise(uint64_t sig)
{
if (dtrace_destructive_disallow)
return;
if (sig >= NSIG) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
return;
}
#if defined(sun)
/*
* raise() has a queue depth of 1 -- we ignore all subsequent
* invocations of the raise() action.
*/
if (curthread->t_dtrace_sig == 0)
curthread->t_dtrace_sig = (uint8_t)sig;
curthread->t_sig_check = 1;
aston(curthread);
#else
struct proc *p = curproc;
PROC_LOCK(p);
- psignal(p, sig);
+ kern_psignal(p, sig);
PROC_UNLOCK(p);
#endif
}
static void
dtrace_action_stop(void)
{
if (dtrace_destructive_disallow)
return;
#if defined(sun)
if (!curthread->t_dtrace_stop) {
curthread->t_dtrace_stop = 1;
curthread->t_sig_check = 1;
aston(curthread);
}
#else
struct proc *p = curproc;
PROC_LOCK(p);
- psignal(p, SIGSTOP);
+ kern_psignal(p, SIGSTOP);
PROC_UNLOCK(p);
#endif
}
static void
dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
{
hrtime_t now;
volatile uint16_t *flags;
#if defined(sun)
cpu_t *cpu = CPU;
#else
cpu_t *cpu = &solaris_cpu[curcpu];
#endif
if (dtrace_destructive_disallow)
return;
flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
now = dtrace_gethrtime();
if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
/*
* We need to advance the mark to the current time.
*/
cpu->cpu_dtrace_chillmark = now;
cpu->cpu_dtrace_chilled = 0;
}
/*
* Now check to see if the requested chill time would take us over
* the maximum amount of time allowed in the chill interval. (Or
* worse, if the calculation itself induces overflow.)
*/
if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
*flags |= CPU_DTRACE_ILLOP;
return;
}
while (dtrace_gethrtime() - now < val)
continue;
/*
* Normally, we assure that the value of the variable "timestamp" does
* not change within an ECB. The presence of chill() represents an
* exception to this rule, however.
*/
mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
cpu->cpu_dtrace_chilled += val;
}
static void
dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
uint64_t *buf, uint64_t arg)
{
int nframes = DTRACE_USTACK_NFRAMES(arg);
int strsize = DTRACE_USTACK_STRSIZE(arg);
uint64_t *pcs = &buf[1], *fps;
char *str = (char *)&pcs[nframes];
int size, offs = 0, i, j;
uintptr_t old = mstate->dtms_scratch_ptr, saved;
uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
char *sym;
/*
* Should be taking a faster path if string space has not been
* allocated.
*/
ASSERT(strsize != 0);
/*
* We will first allocate some temporary space for the frame pointers.
*/
fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
(nframes * sizeof (uint64_t));
if (!DTRACE_INSCRATCH(mstate, size)) {
/*
* Not enough room for our frame pointers -- need to indicate
* that we ran out of scratch space.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
return;
}
mstate->dtms_scratch_ptr += size;
saved = mstate->dtms_scratch_ptr;
/*
* Now get a stack with both program counters and frame pointers.
*/
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_getufpstack(buf, fps, nframes + 1);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
/*
* If that faulted, we're cooked.
*/
if (*flags & CPU_DTRACE_FAULT)
goto out;
/*
* Now we want to walk up the stack, calling the USTACK helper. For
* each iteration, we restore the scratch pointer.
*/
for (i = 0; i < nframes; i++) {
mstate->dtms_scratch_ptr = saved;
if (offs >= strsize)
break;
sym = (char *)(uintptr_t)dtrace_helper(
DTRACE_HELPER_ACTION_USTACK,
mstate, state, pcs[i], fps[i]);
/*
* If we faulted while running the helper, we're going to
* clear the fault and null out the corresponding string.
*/
if (*flags & CPU_DTRACE_FAULT) {
*flags &= ~CPU_DTRACE_FAULT;
str[offs++] = '\0';
continue;
}
if (sym == NULL) {
str[offs++] = '\0';
continue;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
/*
* Now copy in the string that the helper returned to us.
*/
for (j = 0; offs + j < strsize; j++) {
if ((str[offs + j] = sym[j]) == '\0')
break;
}
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
offs += j + 1;
}
if (offs >= strsize) {
/*
* If we didn't have room for all of the strings, we don't
* abort processing -- this needn't be a fatal error -- but we
* still want to increment a counter (dts_stkstroverflows) to
* allow this condition to be warned about. (If this is from
* a jstack() action, it is easily tuned via jstackstrsize.)
*/
dtrace_error(&state->dts_stkstroverflows);
}
while (offs < strsize)
str[offs++] = '\0';
out:
mstate->dtms_scratch_ptr = old;
}
/*
* If you're looking for the epicenter of DTrace, you just found it. This
* is the function called by the provider to fire a probe -- from which all
* subsequent probe-context DTrace activity emanates.
*/
void
dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
{
processorid_t cpuid;
dtrace_icookie_t cookie;
dtrace_probe_t *probe;
dtrace_mstate_t mstate;
dtrace_ecb_t *ecb;
dtrace_action_t *act;
intptr_t offs;
size_t size;
int vtime, onintr;
volatile uint16_t *flags;
hrtime_t now;
#if defined(sun)
/*
* Kick out immediately if this CPU is still being born (in which case
* curthread will be set to -1) or the current thread can't allow
* probes in its current context.
*/
if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
return;
#endif
cookie = dtrace_interrupt_disable();
probe = dtrace_probes[id - 1];
cpuid = curcpu;
onintr = CPU_ON_INTR(CPU);
if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
probe->dtpr_predcache == curthread->t_predcache) {
/*
* We have hit in the predicate cache; we know that
* this predicate would evaluate to be false.
*/
dtrace_interrupt_enable(cookie);
return;
}
#if defined(sun)
if (panic_quiesce) {
#else
if (panicstr != NULL) {
#endif
/*
* We don't trace anything if we're panicking.
*/
dtrace_interrupt_enable(cookie);
return;
}
now = dtrace_gethrtime();
vtime = dtrace_vtime_references != 0;
if (vtime && curthread->t_dtrace_start)
curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
mstate.dtms_difo = NULL;
mstate.dtms_probe = probe;
mstate.dtms_strtok = 0;
mstate.dtms_arg[0] = arg0;
mstate.dtms_arg[1] = arg1;
mstate.dtms_arg[2] = arg2;
mstate.dtms_arg[3] = arg3;
mstate.dtms_arg[4] = arg4;
flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
dtrace_predicate_t *pred = ecb->dte_predicate;
dtrace_state_t *state = ecb->dte_state;
dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
dtrace_vstate_t *vstate = &state->dts_vstate;
dtrace_provider_t *prov = probe->dtpr_provider;
int committed = 0;
caddr_t tomax;
/*
* A little subtlety with the following (seemingly innocuous)
* declaration of the automatic 'val': by looking at the
* code, you might think that it could be declared in the
* action processing loop, below. (That is, it's only used in
* the action processing loop.) However, it must be declared
* out of that scope because in the case of DIF expression
* arguments to aggregating actions, one iteration of the
* action loop will use the last iteration's value.
*/
uint64_t val = 0;
mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
*flags &= ~CPU_DTRACE_ERROR;
if (prov == dtrace_provider) {
/*
* If dtrace itself is the provider of this probe,
* we're only going to continue processing the ECB if
* arg0 (the dtrace_state_t) is equal to the ECB's
* creating state. (This prevents disjoint consumers
* from seeing one another's metaprobes.)
*/
if (arg0 != (uint64_t)(uintptr_t)state)
continue;
}
if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
/*
* We're not currently active. If our provider isn't
* the dtrace pseudo provider, we're not interested.
*/
if (prov != dtrace_provider)
continue;
/*
* Now we must further check if we are in the BEGIN
* probe. If we are, we will only continue processing
* if we're still in WARMUP -- if one BEGIN enabling
* has invoked the exit() action, we don't want to
* evaluate subsequent BEGIN enablings.
*/
if (probe->dtpr_id == dtrace_probeid_begin &&
state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
ASSERT(state->dts_activity ==
DTRACE_ACTIVITY_DRAINING);
continue;
}
}
if (ecb->dte_cond) {
/*
* If the dte_cond bits indicate that this
* consumer is only allowed to see user-mode firings
* of this probe, call the provider's dtps_usermode()
* entry point to check that the probe was fired
* while in a user context. Skip this ECB if that's
* not the case.
*/
if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg) == 0)
continue;
#if defined(sun)
/*
* This is more subtle than it looks. We have to be
* absolutely certain that CRED() isn't going to
* change out from under us so it's only legit to
* examine that structure if we're in constrained
* situations. Currently, the only time we'll perform this
* check is if a non-super-user has enabled the
* profile or syscall providers -- providers that
* allow visibility of all processes. For the
* profile case, the check above will ensure that
* we're examining a user context.
*/
if (ecb->dte_cond & DTRACE_COND_OWNER) {
cred_t *cr;
cred_t *s_cr =
ecb->dte_state->dts_cred.dcr_cred;
proc_t *proc;
ASSERT(s_cr != NULL);
if ((cr = CRED()) == NULL ||
s_cr->cr_uid != cr->cr_uid ||
s_cr->cr_uid != cr->cr_ruid ||
s_cr->cr_uid != cr->cr_suid ||
s_cr->cr_gid != cr->cr_gid ||
s_cr->cr_gid != cr->cr_rgid ||
s_cr->cr_gid != cr->cr_sgid ||
(proc = ttoproc(curthread)) == NULL ||
(proc->p_flag & SNOCD))
continue;
}
if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
cred_t *cr;
cred_t *s_cr =
ecb->dte_state->dts_cred.dcr_cred;
ASSERT(s_cr != NULL);
if ((cr = CRED()) == NULL ||
s_cr->cr_zone->zone_id !=
cr->cr_zone->zone_id)
continue;
}
#endif
}
if (now - state->dts_alive > dtrace_deadman_timeout) {
/*
* We seem to be dead. Unless we (a) have kernel
* destructive permissions, (b) have explicitly enabled
* destructive actions and (c) destructive actions have
* not been disabled, we're going to transition into
* the KILLED state, from which no further processing
* on this state will be performed.
*/
if (!dtrace_priv_kernel_destructive(state) ||
!state->dts_cred.dcr_destructive ||
dtrace_destructive_disallow) {
void *activity = &state->dts_activity;
dtrace_activity_t current;
do {
current = state->dts_activity;
} while (dtrace_cas32(activity, current,
DTRACE_ACTIVITY_KILLED) != current);
continue;
}
}
if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
ecb->dte_alignment, state, &mstate)) < 0)
continue;
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
if (ecb->dte_size != 0)
DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
mstate.dtms_epid = ecb->dte_epid;
mstate.dtms_present |= DTRACE_MSTATE_EPID;
if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
mstate.dtms_access = DTRACE_ACCESS_KERNEL;
else
mstate.dtms_access = 0;
if (pred != NULL) {
dtrace_difo_t *dp = pred->dtp_difo;
int rval;
rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
dtrace_cacheid_t cid = probe->dtpr_predcache;
if (cid != DTRACE_CACHEIDNONE && !onintr) {
/*
* Update the predicate cache...
*/
ASSERT(cid == pred->dtp_cacheid);
curthread->t_predcache = cid;
}
continue;
}
}
for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
act != NULL; act = act->dta_next) {
size_t valoffs;
dtrace_difo_t *dp;
dtrace_recdesc_t *rec = &act->dta_rec;
size = rec->dtrd_size;
valoffs = offs + rec->dtrd_offset;
if (DTRACEACT_ISAGG(act->dta_kind)) {
uint64_t v = 0xbad;
dtrace_aggregation_t *agg;
agg = (dtrace_aggregation_t *)act;
if ((dp = act->dta_difo) != NULL)
v = dtrace_dif_emulate(dp,
&mstate, vstate, state);
if (*flags & CPU_DTRACE_ERROR)
continue;
/*
* Note that we always pass the expression
* value from the previous iteration of the
* action loop. This value will only be used
* if there is an expression argument to the
* aggregating action, denoted by the
* dtag_hasarg field.
*/
dtrace_aggregate(agg, buf,
offs, aggbuf, v, val);
continue;
}
switch (act->dta_kind) {
case DTRACEACT_STOP:
if (dtrace_priv_proc_destructive(state))
dtrace_action_stop();
continue;
case DTRACEACT_BREAKPOINT:
if (dtrace_priv_kernel_destructive(state))
dtrace_action_breakpoint(ecb);
continue;
case DTRACEACT_PANIC:
if (dtrace_priv_kernel_destructive(state))
dtrace_action_panic(ecb);
continue;
case DTRACEACT_STACK:
if (!dtrace_priv_kernel(state))
continue;
dtrace_getpcstack((pc_t *)(tomax + valoffs),
size / sizeof (pc_t), probe->dtpr_aframes,
DTRACE_ANCHORED(probe) ? NULL :
(uint32_t *)arg0);
continue;
case DTRACEACT_JSTACK:
case DTRACEACT_USTACK:
if (!dtrace_priv_proc(state))
continue;
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate.dtms_probe) &&
CPU_ON_INTR(CPU)) {
int depth = DTRACE_USTACK_NFRAMES(
rec->dtrd_arg) + 1;
dtrace_bzero((void *)(tomax + valoffs),
DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
+ depth * sizeof (uint64_t));
continue;
}
if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
curproc->p_dtrace_helpers != NULL) {
/*
* This is the slow path -- we have
* allocated string space, and we're
* getting the stack of a process that
* has helpers. Call into a separate
* routine to perform this processing.
*/
dtrace_action_ustack(&mstate, state,
(uint64_t *)(tomax + valoffs),
rec->dtrd_arg);
continue;
}
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_getupcstack((uint64_t *)
(tomax + valoffs),
DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
continue;
default:
break;
}
dp = act->dta_difo;
ASSERT(dp != NULL);
val = dtrace_dif_emulate(dp, &mstate, vstate, state);
if (*flags & CPU_DTRACE_ERROR)
continue;
switch (act->dta_kind) {
case DTRACEACT_SPECULATE:
ASSERT(buf == &state->dts_buffer[cpuid]);
buf = dtrace_speculation_buffer(state,
cpuid, val);
if (buf == NULL) {
*flags |= CPU_DTRACE_DROP;
continue;
}
offs = dtrace_buffer_reserve(buf,
ecb->dte_needed, ecb->dte_alignment,
state, NULL);
if (offs < 0) {
*flags |= CPU_DTRACE_DROP;
continue;
}
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
if (ecb->dte_size != 0)
DTRACE_STORE(uint32_t, tomax, offs,
ecb->dte_epid);
continue;
case DTRACEACT_PRINTM: {
/* The DIF returns a 'memref'. */
uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
/* Get the size from the memref. */
size = memref[1];
/*
* Check if the size exceeds the allocated
* buffer size.
*/
if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
/* Flag a drop! */
*flags |= CPU_DTRACE_DROP;
continue;
}
/* Store the size in the buffer first. */
DTRACE_STORE(uintptr_t, tomax,
valoffs, size);
/*
* Offset the buffer address to the start
* of the data.
*/
valoffs += sizeof(uintptr_t);
/*
* Reset to the memory address rather than
* the memref array, then let the BYREF
* code below do the work to store the
* memory data in the buffer.
*/
val = memref[0];
break;
}
case DTRACEACT_PRINTT: {
/* The DIF returns a 'typeref'. */
uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
char c = '\0' + 1;
size_t s;
/*
* Get the type string length and round it
* up so that the data that follows is
* aligned for easy access.
*/
size_t typs = strlen((char *) typeref[2]) + 1;
typs = roundup(typs, sizeof(uintptr_t));
/*
* Get the size from the typeref using the
* number of elements and the type size.
*/
size = typeref[1] * typeref[3];
/*
* Check if the size exceeds the allocated
* buffer size.
*/
if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
/* Flag a drop! */
*flags |= CPU_DTRACE_DROP;
}
/* Store the size in the buffer first. */
DTRACE_STORE(uintptr_t, tomax,
valoffs, size);
valoffs += sizeof(uintptr_t);
/* Store the type size in the buffer. */
DTRACE_STORE(uintptr_t, tomax,
valoffs, typeref[3]);
valoffs += sizeof(uintptr_t);
val = typeref[2];
for (s = 0; s < typs; s++) {
if (c != '\0')
c = dtrace_load8(val++);
DTRACE_STORE(uint8_t, tomax,
valoffs++, c);
}
/*
* Reset to the memory address rather than
* the typeref array, then let the BYREF
* code below do the work to store the
* memory data in the buffer.
*/
val = typeref[0];
break;
}
case DTRACEACT_CHILL:
if (dtrace_priv_kernel_destructive(state))
dtrace_action_chill(&mstate, val);
continue;
case DTRACEACT_RAISE:
if (dtrace_priv_proc_destructive(state))
dtrace_action_raise(val);
continue;
case DTRACEACT_COMMIT:
ASSERT(!committed);
/*
* We need to commit our buffer state.
*/
if (ecb->dte_size)
buf->dtb_offset = offs + ecb->dte_size;
buf = &state->dts_buffer[cpuid];
dtrace_speculation_commit(state, cpuid, val);
committed = 1;
continue;
case DTRACEACT_DISCARD:
dtrace_speculation_discard(state, cpuid, val);
continue;
case DTRACEACT_DIFEXPR:
case DTRACEACT_LIBACT:
case DTRACEACT_PRINTF:
case DTRACEACT_PRINTA:
case DTRACEACT_SYSTEM:
case DTRACEACT_FREOPEN:
break;
case DTRACEACT_SYM:
case DTRACEACT_MOD:
if (!dtrace_priv_kernel(state))
continue;
break;
case DTRACEACT_USYM:
case DTRACEACT_UMOD:
case DTRACEACT_UADDR: {
#if defined(sun)
struct pid *pid = curthread->t_procp->p_pidp;
#endif
if (!dtrace_priv_proc(state))
continue;
DTRACE_STORE(uint64_t, tomax,
#if defined(sun)
valoffs, (uint64_t)pid->pid_id);
#else
valoffs, (uint64_t) curproc->p_pid);
#endif
DTRACE_STORE(uint64_t, tomax,
valoffs + sizeof (uint64_t), val);
continue;
}
case DTRACEACT_EXIT: {
/*
* For the exit action, we are going to attempt
* to atomically set our activity to be
* draining. If this fails (either because
* another CPU has beat us to the exit action,
* or because our current activity is something
* other than ACTIVE or WARMUP), we will
* continue. This assures that the exit action
* can be successfully recorded at most once
* when we're in the ACTIVE state. If we're
* encountering the exit() action while in
* COOLDOWN, however, we want to honor the new
* status code. (We know that we're the only
* thread in COOLDOWN, so there is no race.)
*/
void *activity = &state->dts_activity;
dtrace_activity_t current = state->dts_activity;
if (current == DTRACE_ACTIVITY_COOLDOWN)
break;
if (current != DTRACE_ACTIVITY_WARMUP)
current = DTRACE_ACTIVITY_ACTIVE;
if (dtrace_cas32(activity, current,
DTRACE_ACTIVITY_DRAINING) != current) {
*flags |= CPU_DTRACE_DROP;
continue;
}
break;
}
default:
ASSERT(0);
}
if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
uintptr_t end = valoffs + size;
if (!dtrace_vcanload((void *)(uintptr_t)val,
&dp->dtdo_rtype, &mstate, vstate))
continue;
/*
* If this is a string, we're going to only
* load until we find the zero byte -- after
* which we'll store zero bytes.
*/
if (dp->dtdo_rtype.dtdt_kind ==
DIF_TYPE_STRING) {
char c = '\0' + 1;
int intuple = act->dta_intuple;
size_t s;
for (s = 0; s < size; s++) {
if (c != '\0')
c = dtrace_load8(val++);
DTRACE_STORE(uint8_t, tomax,
valoffs++, c);
if (c == '\0' && intuple)
break;
}
continue;
}
while (valoffs < end) {
DTRACE_STORE(uint8_t, tomax, valoffs++,
dtrace_load8(val++));
}
continue;
}
switch (size) {
case 0:
break;
case sizeof (uint8_t):
DTRACE_STORE(uint8_t, tomax, valoffs, val);
break;
case sizeof (uint16_t):
DTRACE_STORE(uint16_t, tomax, valoffs, val);
break;
case sizeof (uint32_t):
DTRACE_STORE(uint32_t, tomax, valoffs, val);
break;
case sizeof (uint64_t):
DTRACE_STORE(uint64_t, tomax, valoffs, val);
break;
default:
/*
* Any other size should have been returned by
* reference, not by value.
*/
ASSERT(0);
break;
}
}
if (*flags & CPU_DTRACE_DROP)
continue;
if (*flags & CPU_DTRACE_FAULT) {
int ndx;
dtrace_action_t *err;
buf->dtb_errors++;
if (probe->dtpr_id == dtrace_probeid_error) {
/*
* There's nothing we can do -- we had an
* error on the error probe. We bump an
* error counter to at least indicate that
* this condition happened.
*/
dtrace_error(&state->dts_dblerrors);
continue;
}
if (vtime) {
/*
* Before recursing on dtrace_probe(), we
* need to explicitly clear out our start
* time to prevent it from being accumulated
* into t_dtrace_vtime.
*/
curthread->t_dtrace_start = 0;
}
/*
* Iterate over the actions to figure out which action
* we were processing when we experienced the error.
* Note that act points _past_ the faulting action; if
* act is ecb->dte_action, the fault was in the
* predicate, if it's ecb->dte_action->dta_next it's
* in action #1, and so on.
*/
for (err = ecb->dte_action, ndx = 0;
err != act; err = err->dta_next, ndx++)
continue;
dtrace_probe_error(state, ecb->dte_epid, ndx,
(mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
cpu_core[cpuid].cpuc_dtrace_illval);
continue;
}
if (!committed)
buf->dtb_offset = offs + ecb->dte_size;
}
if (vtime)
curthread->t_dtrace_start = dtrace_gethrtime();
dtrace_interrupt_enable(cookie);
}
/*
* DTrace Probe Hashing Functions
*
* The functions in this section (and indeed, the functions in remaining
* sections) are not _called_ from probe context. (Any exceptions to this are
* marked with a "Note:".) Rather, they are called from elsewhere in the
* DTrace framework to look up probes in, add probes to, and remove probes from
* the DTrace probe hashes. (Each probe is hashed by each element of the
* probe tuple -- allowing for fast lookups, regardless of what was
* specified.)
*/
static uint_t
dtrace_hash_str(const char *p)
{
unsigned int g;
uint_t hval = 0;
while (*p) {
hval = (hval << 4) + *p++;
if ((g = (hval & 0xf0000000)) != 0)
hval ^= g >> 24;
hval &= ~g;
}
return (hval);
}
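/*
 * Editorial sketch, not part of the original source: dtrace_hash_str() is
 * essentially the classic PJW/ELF string hash -- shift in four bits per
 * character and fold any bits that spill into the top nibble back down.
 * For short strings the fold never fires, e.g.:
 *
 *	dtrace_hash_str("ab")
 *	    'a' (97):  hval = (0 << 4) + 97  = 97
 *	    'b' (98):  hval = (97 << 4) + 98 = 1650
 *
 * The fold (hval ^= g >> 24; hval &= ~g) only matters once the accumulated
 * value reaches bit 28 or above.
 */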
static dtrace_hash_t *
dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
{
dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
hash->dth_stroffs = stroffs;
hash->dth_nextoffs = nextoffs;
hash->dth_prevoffs = prevoffs;
hash->dth_size = 1;
hash->dth_mask = hash->dth_size - 1;
hash->dth_tab = kmem_zalloc(hash->dth_size *
sizeof (dtrace_hashbucket_t *), KM_SLEEP);
return (hash);
}
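/*
 * Hedged sketch (editorial addition): the stroffs/nextoffs/prevoffs
 * arguments are byte offsets into dtrace_probe_t, so the per-module hash
 * would be created along these lines (dtpr_prevmod is assumed by analogy
 * with the dtpr_nextmod field used elsewhere in this file):
 *
 *	dtrace_bymod = dtrace_hash_create(
 *	    offsetof(dtrace_probe_t, dtpr_mod),
 *	    offsetof(dtrace_probe_t, dtpr_nextmod),
 *	    offsetof(dtrace_probe_t, dtpr_prevmod));
 *
 * DTRACE_HASHSTR() and DTRACE_HASHNEXT()/DTRACE_HASHPREV() then use those
 * offsets to reach the string and the chain pointers inside each probe.
 */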
static void
dtrace_hash_destroy(dtrace_hash_t *hash)
{
#ifdef DEBUG
int i;
for (i = 0; i < hash->dth_size; i++)
ASSERT(hash->dth_tab[i] == NULL);
#endif
kmem_free(hash->dth_tab,
hash->dth_size * sizeof (dtrace_hashbucket_t *));
kmem_free(hash, sizeof (dtrace_hash_t));
}
static void
dtrace_hash_resize(dtrace_hash_t *hash)
{
int size = hash->dth_size, i, ndx;
int new_size = hash->dth_size << 1;
int new_mask = new_size - 1;
dtrace_hashbucket_t **new_tab, *bucket, *next;
ASSERT((new_size & new_mask) == 0);
new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
for (i = 0; i < size; i++) {
for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
dtrace_probe_t *probe = bucket->dthb_chain;
ASSERT(probe != NULL);
ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
next = bucket->dthb_next;
bucket->dthb_next = new_tab[ndx];
new_tab[ndx] = bucket;
}
}
kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
hash->dth_tab = new_tab;
hash->dth_size = new_size;
hash->dth_mask = new_mask;
}
static void
dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
{
int hashval = DTRACE_HASHSTR(hash, new);
int ndx = hashval & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
dtrace_probe_t **nextp, **prevp;
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
goto add;
}
if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
dtrace_hash_resize(hash);
dtrace_hash_add(hash, new);
return;
}
bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
bucket->dthb_next = hash->dth_tab[ndx];
hash->dth_tab[ndx] = bucket;
hash->dth_nbuckets++;
add:
nextp = DTRACE_HASHNEXT(hash, new);
ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
*nextp = bucket->dthb_chain;
if (bucket->dthb_chain != NULL) {
prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
ASSERT(*prevp == NULL);
*prevp = new;
}
bucket->dthb_chain = new;
bucket->dthb_len++;
}
static dtrace_probe_t *
dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
{
int hashval = DTRACE_HASHSTR(hash, template);
int ndx = hashval & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
return (bucket->dthb_chain);
}
return (NULL);
}
static int
dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
{
int hashval = DTRACE_HASHSTR(hash, template);
int ndx = hashval & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
return (bucket->dthb_len);
}
return (0);
}
static void
dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
{
int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
/*
* Find the bucket that we're removing this probe from.
*/
for (; bucket != NULL; bucket = bucket->dthb_next) {
if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
break;
}
ASSERT(bucket != NULL);
if (*prevp == NULL) {
if (*nextp == NULL) {
/*
* The removed probe was the only probe on this
* bucket; we need to remove the bucket.
*/
dtrace_hashbucket_t *b = hash->dth_tab[ndx];
ASSERT(bucket->dthb_chain == probe);
ASSERT(b != NULL);
if (b == bucket) {
hash->dth_tab[ndx] = bucket->dthb_next;
} else {
while (b->dthb_next != bucket)
b = b->dthb_next;
b->dthb_next = bucket->dthb_next;
}
ASSERT(hash->dth_nbuckets > 0);
hash->dth_nbuckets--;
kmem_free(bucket, sizeof (dtrace_hashbucket_t));
return;
}
bucket->dthb_chain = *nextp;
} else {
*(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
}
if (*nextp != NULL)
*(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
}
/*
* DTrace Utility Functions
*
* These are random utility functions that are _not_ called from probe context.
*/
static int
dtrace_badattr(const dtrace_attribute_t *a)
{
return (a->dtat_name > DTRACE_STABILITY_MAX ||
a->dtat_data > DTRACE_STABILITY_MAX ||
a->dtat_class > DTRACE_CLASS_MAX);
}
/*
* Return a duplicate copy of a string. If the specified string is NULL,
* this function returns a zero-length string.
*/
static char *
dtrace_strdup(const char *str)
{
char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
if (str != NULL)
(void) strcpy(new, str);
return (new);
}
#define DTRACE_ISALPHA(c) \
(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
static int
dtrace_badname(const char *s)
{
char c;
if (s == NULL || (c = *s++) == '\0')
return (0);
if (!DTRACE_ISALPHA(c) && c != '-' && c != '_' && c != '.')
return (1);
while ((c = *s++) != '\0') {
if (!DTRACE_ISALPHA(c) && (c < '0' || c > '9') &&
c != '-' && c != '_' && c != '.' && c != '`')
return (1);
}
return (0);
}
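/*
 * A few hedged examples (editorial addition) of what dtrace_badname()
 * accepts and rejects:
 *
 *	dtrace_badname("fbt")        == 0	valid name
 *	dtrace_badname("my_probe-1") == 0	digits allowed after the
 *						first character
 *	dtrace_badname("1probe")     == 1	must start with a letter,
 *						'-', '_' or '.'
 *	dtrace_badname("bad name")   == 1	spaces are not permitted
 *
 * NULL and the empty string are, by design, not considered bad names.
 */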
static void
dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
{
uint32_t priv;
#if defined(sun)
if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
/*
* For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
*/
priv = DTRACE_PRIV_ALL;
} else {
*uidp = crgetuid(cr);
*zoneidp = crgetzoneid(cr);
priv = 0;
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
priv |= DTRACE_PRIV_USER;
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
priv |= DTRACE_PRIV_PROC;
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
priv |= DTRACE_PRIV_OWNER;
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
priv |= DTRACE_PRIV_ZONEOWNER;
}
#else
priv = DTRACE_PRIV_ALL;
#endif
*privp = priv;
}
#ifdef DTRACE_ERRDEBUG
static void
dtrace_errdebug(const char *str)
{
int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
int occupied = 0;
mutex_enter(&dtrace_errlock);
dtrace_errlast = str;
dtrace_errthread = curthread;
while (occupied++ < DTRACE_ERRHASHSZ) {
if (dtrace_errhash[hval].dter_msg == str) {
dtrace_errhash[hval].dter_count++;
goto out;
}
if (dtrace_errhash[hval].dter_msg != NULL) {
hval = (hval + 1) % DTRACE_ERRHASHSZ;
continue;
}
dtrace_errhash[hval].dter_msg = str;
dtrace_errhash[hval].dter_count = 1;
goto out;
}
panic("dtrace: undersized error hash");
out:
mutex_exit(&dtrace_errlock);
}
#endif
/*
* DTrace Matching Functions
*
* These functions are used to match groups of probes, given some elements of
* a probe tuple, or some globbed expressions for elements of a probe tuple.
*/
static int
dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
zoneid_t zoneid)
{
if (priv != DTRACE_PRIV_ALL) {
uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
uint32_t match = priv & ppriv;
/*
* No PRIV_DTRACE_* privileges...
*/
if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
DTRACE_PRIV_KERNEL)) == 0)
return (0);
/*
* No matching bits, but there were bits to match...
*/
if (match == 0 && ppriv != 0)
return (0);
/*
* Need to have permissions to the process, but don't...
*/
if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
return (0);
}
/*
* Need to be in the same zone unless we possess the
* privilege to examine all zones.
*/
if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
return (0);
}
}
return (1);
}
/*
* dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
* consists of input pattern strings and an ops-vector to evaluate them.
* This function returns >0 for match, 0 for no match, and <0 for error.
*/
static int
dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
uint32_t priv, uid_t uid, zoneid_t zoneid)
{
dtrace_provider_t *pvp = prp->dtpr_provider;
int rv;
if (pvp->dtpv_defunct)
return (0);
if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
return (rv);
if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
return (rv);
if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
return (rv);
if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
return (rv);
if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
return (0);
return (rv);
}
/*
* dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
* interface for matching a glob pattern 'p' to an input string 's'. Unlike
* libc's version, the kernel version only applies to 8-bit ASCII strings.
* In addition, all of the recursion cases except for '*' matching have been
* unwound. For '*', we still implement recursive evaluation, but a depth
* counter is maintained and matching is aborted if we recurse too deep.
* The function returns 0 if no match, >0 if match, and <0 if recursion error.
*/
static int
dtrace_match_glob(const char *s, const char *p, int depth)
{
const char *olds;
char s1, c;
int gs;
if (depth > DTRACE_PROBEKEY_MAXDEPTH)
return (-1);
if (s == NULL)
s = ""; /* treat NULL as empty string */
top:
olds = s;
s1 = *s++;
if (p == NULL)
return (0);
if ((c = *p++) == '\0')
return (s1 == '\0');
switch (c) {
case '[': {
int ok = 0, notflag = 0;
char lc = '\0';
if (s1 == '\0')
return (0);
if (*p == '!') {
notflag = 1;
p++;
}
if ((c = *p++) == '\0')
return (0);
do {
if (c == '-' && lc != '\0' && *p != ']') {
if ((c = *p++) == '\0')
return (0);
if (c == '\\' && (c = *p++) == '\0')
return (0);
if (notflag) {
if (s1 < lc || s1 > c)
ok++;
else
return (0);
} else if (lc <= s1 && s1 <= c)
ok++;
} else if (c == '\\' && (c = *p++) == '\0')
return (0);
lc = c; /* save left-hand 'c' for next iteration */
if (notflag) {
if (s1 != c)
ok++;
else
return (0);
} else if (s1 == c)
ok++;
if ((c = *p++) == '\0')
return (0);
} while (c != ']');
if (ok)
goto top;
return (0);
}
case '\\':
if ((c = *p++) == '\0')
return (0);
/*FALLTHRU*/
default:
if (c != s1)
return (0);
/*FALLTHRU*/
case '?':
if (s1 != '\0')
goto top;
return (0);
case '*':
while (*p == '*')
p++; /* consecutive *'s are identical to a single one */
if (*p == '\0')
return (1);
for (s = olds; *s != '\0'; s++) {
if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
return (gs);
}
return (0);
}
}
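/*
 * Hedged examples (editorial addition) of the matcher's semantics as
 * described above -- >0 for match, 0 for no match, <0 on recursion error:
 *
 *	dtrace_match_glob("read", "read", 0)      > 0
 *	dtrace_match_glob("readv", "read", 0)    == 0	no implicit '*'
 *	dtrace_match_glob("readv", "read*", 0)    > 0
 *	dtrace_match_glob("read", "r?a[cd]", 0)   > 0	'?' is any char,
 *							'[cd]' one of a set
 *	dtrace_match_glob(NULL, "*", 0)           > 0	NULL treated as ""
 */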
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
return (s != NULL && strcmp(s, p) == 0);
}
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
return (1); /* always match the empty pattern */
}
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
return (s != NULL && s[0] != '\0');
}
static int
dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
{
dtrace_probe_t template, *probe;
dtrace_hash_t *hash = NULL;
int len, best = INT_MAX, nmatched = 0;
dtrace_id_t i;
ASSERT(MUTEX_HELD(&dtrace_lock));
/*
* If the probe ID is specified in the key, just lookup by ID and
* invoke the match callback once if a matching probe is found.
*/
if (pkp->dtpk_id != DTRACE_IDNONE) {
if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
(void) (*matched)(probe, arg);
nmatched++;
}
return (nmatched);
}
template.dtpr_mod = (char *)pkp->dtpk_mod;
template.dtpr_func = (char *)pkp->dtpk_func;
template.dtpr_name = (char *)pkp->dtpk_name;
/*
* We want to find the most distinct of the module name, function
* name, and name. So for each one that is not a glob pattern or
* empty string, we perform a lookup in the corresponding hash and
* use the hash table with the fewest collisions to do our search.
*/
if (pkp->dtpk_mmatch == &dtrace_match_string &&
(len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
best = len;
hash = dtrace_bymod;
}
if (pkp->dtpk_fmatch == &dtrace_match_string &&
(len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
best = len;
hash = dtrace_byfunc;
}
if (pkp->dtpk_nmatch == &dtrace_match_string &&
(len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
best = len;
hash = dtrace_byname;
}
/*
* If we did not select a hash table, iterate over every probe and
* invoke our callback for each one that matches our input probe key.
*/
if (hash == NULL) {
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL ||
dtrace_match_probe(probe, pkp, priv, uid,
zoneid) <= 0)
continue;
nmatched++;
if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
break;
}
return (nmatched);
}
/*
* If we selected a hash table, iterate over each probe of the same key
* name and invoke the callback for every probe that matches the other
* attributes of our input probe key.
*/
for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
probe = *(DTRACE_HASHNEXT(hash, probe))) {
if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
continue;
nmatched++;
if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
break;
}
return (nmatched);
}
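/*
 * Editorial sketch of the hash selection above, with made-up collision
 * counts: for a description like fbt:kernel:*:entry, the module ("kernel")
 * and name ("entry") keys are plain strings while the function key is a
 * glob, so only dtrace_bymod and dtrace_byname are candidates.  If, say,
 * dtrace_hash_collisions() reports thousands of probes named "entry" but
 * far fewer in module "kernel", dtrace_bymod wins and the remaining key
 * fields are matched only against that (shorter) chain.
 */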
/*
* Return the function pointer dtrace_probecmp() should use to compare the
* specified pattern with a string. For NULL or empty patterns, we select
* dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
* For non-empty non-glob strings, we use dtrace_match_string().
*/
static dtrace_probekey_f *
dtrace_probekey_func(const char *p)
{
char c;
if (p == NULL || *p == '\0')
return (&dtrace_match_nul);
while ((c = *p++) != '\0') {
if (c == '[' || c == '?' || c == '*' || c == '\\')
return (&dtrace_match_glob);
}
return (&dtrace_match_string);
}
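/*
 * Hedged examples (editorial addition):
 *
 *	dtrace_probekey_func(NULL)       == &dtrace_match_nul
 *	dtrace_probekey_func("")         == &dtrace_match_nul
 *	dtrace_probekey_func("read*")    == &dtrace_match_glob
 *	dtrace_probekey_func("malloc")   == &dtrace_match_string
 */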
/*
* Build a probe comparison key for use with dtrace_match_probe() from the
* given probe description. By convention, a null key only matches anchored
* probes: if each field is the empty string, reset dtpk_fmatch to
* dtrace_match_nonzero().
*/
static void
dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
{
pkp->dtpk_prov = pdp->dtpd_provider;
pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
pkp->dtpk_mod = pdp->dtpd_mod;
pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
pkp->dtpk_func = pdp->dtpd_func;
pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
pkp->dtpk_name = pdp->dtpd_name;
pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
pkp->dtpk_id = pdp->dtpd_id;
if (pkp->dtpk_id == DTRACE_IDNONE &&
pkp->dtpk_pmatch == &dtrace_match_nul &&
pkp->dtpk_mmatch == &dtrace_match_nul &&
pkp->dtpk_fmatch == &dtrace_match_nul &&
pkp->dtpk_nmatch == &dtrace_match_nul)
pkp->dtpk_fmatch = &dtrace_match_nonzero;
}
/*
* DTrace Provider-to-Framework API Functions
*
* These functions implement much of the Provider-to-Framework API, as
* described in <sys/dtrace.h>. The parts of the API not in this section are
* the functions in the API for probe management (found below), and
* dtrace_probe() itself (found above).
*/
/*
* Register the calling provider with the DTrace framework. This should
* generally be called by DTrace providers in their attach(9E) entry point.
*/
int
dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
{
dtrace_provider_t *provider;
if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"arguments", name ? name : "<NULL>");
return (EINVAL);
}
if (name[0] == '\0' || dtrace_badname(name)) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"provider name", name);
return (EINVAL);
}
if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
pops->dtps_destroy == NULL ||
((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"provider ops", name);
return (EINVAL);
}
if (dtrace_badattr(&pap->dtpa_provider) ||
dtrace_badattr(&pap->dtpa_mod) ||
dtrace_badattr(&pap->dtpa_func) ||
dtrace_badattr(&pap->dtpa_name) ||
dtrace_badattr(&pap->dtpa_args)) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"provider attributes", name);
return (EINVAL);
}
if (priv & ~DTRACE_PRIV_ALL) {
cmn_err(CE_WARN, "failed to register provider '%s': invalid "
"privilege attributes", name);
return (EINVAL);
}
if ((priv & DTRACE_PRIV_KERNEL) &&
(priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
pops->dtps_usermode == NULL) {
cmn_err(CE_WARN, "failed to register provider '%s': need "
"dtps_usermode() op for given privilege attributes", name);
return (EINVAL);
}
provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
(void) strcpy(provider->dtpv_name, name);
provider->dtpv_attr = *pap;
provider->dtpv_priv.dtpp_flags = priv;
if (cr != NULL) {
provider->dtpv_priv.dtpp_uid = crgetuid(cr);
provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
}
provider->dtpv_pops = *pops;
if (pops->dtps_provide == NULL) {
ASSERT(pops->dtps_provide_module != NULL);
provider->dtpv_pops.dtps_provide =
(void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
}
if (pops->dtps_provide_module == NULL) {
ASSERT(pops->dtps_provide != NULL);
provider->dtpv_pops.dtps_provide_module =
(void (*)(void *, modctl_t *))dtrace_nullop;
}
if (pops->dtps_suspend == NULL) {
ASSERT(pops->dtps_resume == NULL);
provider->dtpv_pops.dtps_suspend =
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
provider->dtpv_pops.dtps_resume =
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
}
provider->dtpv_arg = arg;
*idp = (dtrace_provider_id_t)provider;
if (pops == &dtrace_provider_ops) {
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dtrace_anon.dta_enabling == NULL);
/*
* We make sure that the DTrace provider is at the head of
* the provider chain.
*/
provider->dtpv_next = dtrace_provider;
dtrace_provider = provider;
return (0);
}
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
/*
* If there is at least one provider registered, we'll add this
* provider after the first provider.
*/
if (dtrace_provider != NULL) {
provider->dtpv_next = dtrace_provider->dtpv_next;
dtrace_provider->dtpv_next = provider;
} else {
dtrace_provider = provider;
}
if (dtrace_retained != NULL) {
dtrace_enabling_provide(provider);
/*
* Now we need to call dtrace_enabling_matchall() -- which
* will acquire cpu_lock and dtrace_lock. We therefore need
* to drop all of our locks before calling into it...
*/
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
dtrace_enabling_matchall();
return (0);
}
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
return (0);
}
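/*
 * Hedged sketch (editorial addition) of how a provider would typically
 * register itself from its attach routine.  The "noop_*" names and the
 * attribute structure are hypothetical; only the ops validated above are
 * shown, and a real provider would supply its other ops (argument
 * description and the like) as needed:
 *
 *	static dtrace_pops_t noop_pops = {
 *		.dtps_provide = noop_provide,
 *		.dtps_enable  = noop_enable,
 *		.dtps_disable = noop_disable,
 *		.dtps_destroy = noop_destroy,
 *	};
 *	static dtrace_provider_id_t noop_id;
 *
 *	error = dtrace_register("noop", &noop_attr, DTRACE_PRIV_USER,
 *	    NULL, &noop_pops, NULL, &noop_id);
 *
 * A matching dtrace_unregister(noop_id) belongs in the provider's detach
 * path.
 */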
/*
* Unregister the specified provider from the DTrace framework. This should
* generally be called by DTrace providers in their detach(9E) entry point.
*/
int
dtrace_unregister(dtrace_provider_id_t id)
{
dtrace_provider_t *old = (dtrace_provider_t *)id;
dtrace_provider_t *prev = NULL;
int i, self = 0;
dtrace_probe_t *probe, *first = NULL;
if (old->dtpv_pops.dtps_enable ==
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
/*
* If DTrace itself is the provider, we're called with locks
* already held.
*/
ASSERT(old == dtrace_provider);
#if defined(sun)
ASSERT(dtrace_devi != NULL);
#endif
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
self = 1;
if (dtrace_provider->dtpv_next != NULL) {
/*
* There's another provider here; return failure.
*/
return (EBUSY);
}
} else {
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
mutex_enter(&dtrace_lock);
}
/*
* If anyone has /dev/dtrace open, or if there are anonymous enabled
* probes, we refuse to let providers slither away, unless this
* provider has already been explicitly invalidated.
*/
if (!old->dtpv_defunct &&
(dtrace_opens || (dtrace_anon.dta_state != NULL &&
dtrace_anon.dta_state->dts_necbs > 0))) {
if (!self) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
return (EBUSY);
}
/*
* Attempt to destroy the probes associated with this provider.
*/
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_provider != old)
continue;
if (probe->dtpr_ecb == NULL)
continue;
/*
* We have at least one ECB; we can't remove this provider.
*/
if (!self) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
return (EBUSY);
}
/*
* All of the probes for this provider are disabled; we can safely
* remove all of them from their hash chains and from the probe array.
*/
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_provider != old)
continue;
dtrace_probes[i] = NULL;
dtrace_hash_remove(dtrace_bymod, probe);
dtrace_hash_remove(dtrace_byfunc, probe);
dtrace_hash_remove(dtrace_byname, probe);
if (first == NULL) {
first = probe;
probe->dtpr_nextmod = NULL;
} else {
probe->dtpr_nextmod = first;
first = probe;
}
}
/*
* The provider's probes have been removed from the hash chains and
* from the probe array. Now issue a dtrace_sync() to be sure that
* everyone has cleared out from any probe array processing.
*/
dtrace_sync();
for (probe = first; probe != NULL; probe = first) {
first = probe->dtpr_nextmod;
old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
probe->dtpr_arg);
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
#if defined(sun)
vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
#else
free_unr(dtrace_arena, probe->dtpr_id);
#endif
kmem_free(probe, sizeof (dtrace_probe_t));
}
if ((prev = dtrace_provider) == old) {
#if defined(sun)
ASSERT(self || dtrace_devi == NULL);
ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
#endif
dtrace_provider = old->dtpv_next;
} else {
while (prev != NULL && prev->dtpv_next != old)
prev = prev->dtpv_next;
if (prev == NULL) {
panic("attempt to unregister non-existent "
"dtrace provider %p\n", (void *)id);
}
prev->dtpv_next = old->dtpv_next;
}
if (!self) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
kmem_free(old, sizeof (dtrace_provider_t));
return (0);
}
/*
* Invalidate the specified provider. All subsequent probe lookups for the
* specified provider will fail, but its probes will not be removed.
*/
void
dtrace_invalidate(dtrace_provider_id_t id)
{
dtrace_provider_t *pvp = (dtrace_provider_t *)id;
ASSERT(pvp->dtpv_pops.dtps_enable !=
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
pvp->dtpv_defunct = 1;
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
}
/*
* Indicate whether or not DTrace has attached.
*/
int
dtrace_attached(void)
{
/*
* dtrace_provider will be non-NULL iff the DTrace driver has
* attached. (It's non-NULL because DTrace is always itself a
* provider.)
*/
return (dtrace_provider != NULL);
}
/*
* Remove all the unenabled probes for the given provider. This function is
* not unlike dtrace_unregister(), except that it doesn't remove the provider
* -- just as many of its associated probes as it can.
*/
int
dtrace_condense(dtrace_provider_id_t id)
{
dtrace_provider_t *prov = (dtrace_provider_t *)id;
int i;
dtrace_probe_t *probe;
/*
* Make sure this isn't the dtrace provider itself.
*/
ASSERT(prov->dtpv_pops.dtps_enable !=
(void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
/*
* Attempt to destroy the probes associated with this provider.
*/
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_provider != prov)
continue;
if (probe->dtpr_ecb != NULL)
continue;
dtrace_probes[i] = NULL;
dtrace_hash_remove(dtrace_bymod, probe);
dtrace_hash_remove(dtrace_byfunc, probe);
dtrace_hash_remove(dtrace_byname, probe);
prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
probe->dtpr_arg);
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
kmem_free(probe, sizeof (dtrace_probe_t));
#if defined(sun)
vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
#else
free_unr(dtrace_arena, i + 1);
#endif
}
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
return (0);
}
/*
* DTrace Probe Management Functions
*
* The functions in this section perform the DTrace probe management,
* including functions to create probes, look-up probes, and call into the
* providers to request that probes be provided. Some of these functions are
* in the Provider-to-Framework API; these functions can be identified by the
* fact that they are not declared "static".
*/
/*
* Create a probe with the specified module name, function name, and name.
*/
dtrace_id_t
dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
const char *func, const char *name, int aframes, void *arg)
{
dtrace_probe_t *probe, **probes;
dtrace_provider_t *provider = (dtrace_provider_t *)prov;
dtrace_id_t id;
if (provider == dtrace_provider) {
ASSERT(MUTEX_HELD(&dtrace_lock));
} else {
mutex_enter(&dtrace_lock);
}
#if defined(sun)
id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
VM_BESTFIT | VM_SLEEP);
#else
id = alloc_unr(dtrace_arena);
#endif
probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
probe->dtpr_id = id;
probe->dtpr_gen = dtrace_probegen++;
probe->dtpr_mod = dtrace_strdup(mod);
probe->dtpr_func = dtrace_strdup(func);
probe->dtpr_name = dtrace_strdup(name);
probe->dtpr_arg = arg;
probe->dtpr_aframes = aframes;
probe->dtpr_provider = provider;
dtrace_hash_add(dtrace_bymod, probe);
dtrace_hash_add(dtrace_byfunc, probe);
dtrace_hash_add(dtrace_byname, probe);
if (id - 1 >= dtrace_nprobes) {
size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
size_t nsize = osize << 1;
if (nsize == 0) {
ASSERT(osize == 0);
ASSERT(dtrace_probes == NULL);
nsize = sizeof (dtrace_probe_t *);
}
probes = kmem_zalloc(nsize, KM_SLEEP);
if (dtrace_probes == NULL) {
ASSERT(osize == 0);
dtrace_probes = probes;
dtrace_nprobes = 1;
} else {
dtrace_probe_t **oprobes = dtrace_probes;
bcopy(oprobes, probes, osize);
dtrace_membar_producer();
dtrace_probes = probes;
dtrace_sync();
/*
* All CPUs are now seeing the new probes array; we can
* safely free the old array.
*/
kmem_free(oprobes, osize);
dtrace_nprobes <<= 1;
}
ASSERT(id - 1 < dtrace_nprobes);
}
ASSERT(dtrace_probes[id - 1] == NULL);
dtrace_probes[id - 1] = probe;
if (provider != dtrace_provider)
mutex_exit(&dtrace_lock);
return (id);
}
static dtrace_probe_t *
dtrace_probe_lookup_id(dtrace_id_t id)
{
ASSERT(MUTEX_HELD(&dtrace_lock));
if (id == 0 || id > dtrace_nprobes)
return (NULL);
return (dtrace_probes[id - 1]);
}
static int
dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
{
*((dtrace_id_t *)arg) = probe->dtpr_id;
return (DTRACE_MATCH_DONE);
}
/*
* Look up a probe based on provider and one or more of module name, function
* name and probe name.
*/
dtrace_id_t
dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
char *func, char *name)
{
dtrace_probekey_t pkey;
dtrace_id_t id;
int match;
pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
pkey.dtpk_pmatch = &dtrace_match_string;
pkey.dtpk_mod = mod;
pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
pkey.dtpk_func = func;
pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
pkey.dtpk_name = name;
pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
pkey.dtpk_id = DTRACE_IDNONE;
mutex_enter(&dtrace_lock);
match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
dtrace_probe_lookup_match, &id);
mutex_exit(&dtrace_lock);
ASSERT(match == 1 || match == 0);
return (match ? id : 0);
}
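/*
 * Editor's note: a minimal sketch (not part of the original source) of how a
 * provider might use the two Provider-to-Framework calls above from its
 * provide entry point.  The provider id myprov_id, the function name and the
 * probe strings are hypothetical; only dtrace_probe_lookup() and
 * dtrace_probe_create() are real framework calls.
 */
#if 0
static void
myprov_provide(void *arg, dtrace_probedesc_t *desc)
{
        /*
         * Avoid duplicates: dtrace_probe_lookup() returns a non-zero id if a
         * probe with this identity already exists.
         */
        if (dtrace_probe_lookup(myprov_id, "mymod", "myfunc", "entry") != 0)
                return;

        /*
         * The returned id is handed back to the provider later, e.g. in its
         * dtps_enable() and dtps_destroy() callbacks.
         */
        (void) dtrace_probe_create(myprov_id, "mymod", "myfunc", "entry",
            0 /* aframes */, NULL /* provider-private argument */);
}
#endif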
/*
* Returns the probe argument associated with the specified probe.
*/
void *
dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
{
dtrace_probe_t *probe;
void *rval = NULL;
mutex_enter(&dtrace_lock);
if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
probe->dtpr_provider == (dtrace_provider_t *)id)
rval = probe->dtpr_arg;
mutex_exit(&dtrace_lock);
return (rval);
}
/*
* Copy a probe into a probe description.
*/
static void
dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
{
bzero(pdp, sizeof (dtrace_probedesc_t));
pdp->dtpd_id = prp->dtpr_id;
(void) strncpy(pdp->dtpd_provider,
prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
(void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
(void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
}
#if !defined(sun)
static int
dtrace_probe_provide_cb(linker_file_t lf, void *arg)
{
dtrace_provider_t *prv = (dtrace_provider_t *) arg;
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, lf);
return(0);
}
#endif
/*
* Called to indicate that a probe -- or probes -- should be provided by a
* specified provider. If the specified description is NULL, the provider will
* be told to provide all of its probes. (This is done whenever a new
* consumer comes along, or whenever a retained enabling is to be matched.) If
* the specified description is non-NULL, the provider is given the
* opportunity to dynamically provide the specified probe, allowing providers
* to support the creation of probes on-the-fly. (So-called _autocreated_
* probes.) If the provider is NULL, the operations will be applied to all
* providers; if the provider is non-NULL the operations will only be applied
* to the specified provider. The dtrace_provider_lock must be held, and the
* dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
* will need to grab the dtrace_lock when it reenters the framework through
* dtrace_probe_lookup(), dtrace_probe_create(), etc.
*/
static void
dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
{
#if defined(sun)
modctl_t *ctl;
#endif
int all = 0;
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
if (prv == NULL) {
all = 1;
prv = dtrace_provider;
}
do {
/*
* First, call the blanket provide operation.
*/
prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
/*
* Now call the per-module provide operation. We will grab
* mod_lock to prevent the list from being modified. Note
* that this also prevents the mod_busy bits from changing.
* (mod_busy can only be changed with mod_lock held.)
*/
mutex_enter(&mod_lock);
#if defined(sun)
ctl = &modules;
do {
if (ctl->mod_busy || ctl->mod_mp == NULL)
continue;
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
} while ((ctl = ctl->mod_next) != &modules);
#else
(void) linker_file_foreach(dtrace_probe_provide_cb, prv);
#endif
mutex_exit(&mod_lock);
} while (all && (prv = prv->dtpv_next) != NULL);
}
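/*
 * Editor's note: a short sketch (not part of the original source) of the
 * locking discipline documented above: callers of dtrace_probe_provide()
 * hold dtrace_provider_lock but not dtrace_lock, since providers may
 * re-enter the framework (dtrace_probe_lookup(), dtrace_probe_create())
 * while providing.
 */
#if 0
        mutex_enter(&dtrace_provider_lock);
        /* dtrace_lock must not be held here */
        dtrace_probe_provide(NULL, NULL);       /* all probes, all providers */
        mutex_exit(&dtrace_provider_lock);
#endif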
#if defined(sun)
/*
* Iterate over each probe, and call the Framework-to-Provider API function
* denoted by offs.
*/
static void
dtrace_probe_foreach(uintptr_t offs)
{
dtrace_provider_t *prov;
void (*func)(void *, dtrace_id_t, void *);
dtrace_probe_t *probe;
dtrace_icookie_t cookie;
int i;
/*
* We disable interrupts to walk through the probe array. This is
* safe -- the dtrace_sync() in dtrace_unregister() assures that we
* won't see stale data.
*/
cookie = dtrace_interrupt_disable();
for (i = 0; i < dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i]) == NULL)
continue;
if (probe->dtpr_ecb == NULL) {
/*
* This probe isn't enabled -- don't call the function.
*/
continue;
}
prov = probe->dtpr_provider;
func = *((void(**)(void *, dtrace_id_t, void *))
((uintptr_t)&prov->dtpv_pops + offs));
func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
}
dtrace_interrupt_enable(cookie);
}
#endif
static int
dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
{
dtrace_probekey_t pkey;
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
ASSERT(MUTEX_HELD(&dtrace_lock));
dtrace_ecb_create_cache = NULL;
if (desc == NULL) {
/*
* If we're passed a NULL description, we're being asked to
* create an ECB with a NULL probe.
*/
(void) dtrace_ecb_create_enable(NULL, enab);
return (0);
}
dtrace_probekey(desc, &pkey);
dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
&priv, &uid, &zoneid);
return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
enab));
}
/*
* DTrace Helper Provider Functions
*/
static void
dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
{
attr->dtat_name = DOF_ATTR_NAME(dofattr);
attr->dtat_data = DOF_ATTR_DATA(dofattr);
attr->dtat_class = DOF_ATTR_CLASS(dofattr);
}
static void
dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
const dof_provider_t *dofprov, char *strtab)
{
hprov->dthpv_provname = strtab + dofprov->dofpv_name;
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
dofprov->dofpv_provattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
dofprov->dofpv_modattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
dofprov->dofpv_funcattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
dofprov->dofpv_nameattr);
dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
dofprov->dofpv_argsattr);
}
static void
dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
dof_provider_t *provider;
dof_probe_t *probe;
uint32_t *off, *enoff;
uint8_t *arg;
char *strtab;
uint_t i, nprobes;
dtrace_helper_provdesc_t dhpv;
dtrace_helper_probedesc_t dhpb;
dtrace_meta_t *meta = dtrace_meta_pid;
dtrace_mops_t *mops = &meta->dtm_mops;
void *parg;
provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_strtab * dof->dofh_secsize);
prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_probes * dof->dofh_secsize);
arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_prargs * dof->dofh_secsize);
off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_proffs * dof->dofh_secsize);
strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
enoff = NULL;
/*
* See dtrace_helper_provider_validate().
*/
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
provider->dofpv_prenoffs != DOF_SECT_NONE) {
enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_prenoffs * dof->dofh_secsize);
enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
}
nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
/*
* Create the provider.
*/
dtrace_dofprov2hprov(&dhpv, provider, strtab);
if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
return;
meta->dtm_count++;
/*
* Create the probes.
*/
for (i = 0; i < nprobes; i++) {
probe = (dof_probe_t *)(uintptr_t)(daddr +
prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
dhpb.dthpb_mod = dhp->dofhp_mod;
dhpb.dthpb_func = strtab + probe->dofpr_func;
dhpb.dthpb_name = strtab + probe->dofpr_name;
dhpb.dthpb_base = probe->dofpr_addr;
dhpb.dthpb_offs = off + probe->dofpr_offidx;
dhpb.dthpb_noffs = probe->dofpr_noffs;
if (enoff != NULL) {
dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
} else {
dhpb.dthpb_enoffs = NULL;
dhpb.dthpb_nenoffs = 0;
}
dhpb.dthpb_args = arg + probe->dofpr_argidx;
dhpb.dthpb_nargc = probe->dofpr_nargc;
dhpb.dthpb_xargc = probe->dofpr_xargc;
dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
}
}
static void
dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
int i;
ASSERT(MUTEX_HELD(&dtrace_meta_lock));
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_PROVIDER)
continue;
dtrace_helper_provide_one(dhp, sec, pid);
}
/*
* We may have just created probes, so we must now rematch against
* any retained enablings. Note that this call will acquire both
* cpu_lock and dtrace_lock; the fact that we are holding
* dtrace_meta_lock now is what defines the ordering with respect to
* these three locks.
*/
dtrace_enabling_matchall();
}
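/*
 * Editor's note: a small helper-style sketch (not part of the original
 * source) of the DOF section arithmetic used by the loop above: the i-th
 * section header sits at a fixed stride from dofh_secoff.  The helper name
 * is hypothetical.
 */
#if 0
static dof_sec_t *
dof_sec_by_index(dof_hdr_t *dof, uint_t i)
{
        uintptr_t daddr = (uintptr_t)dof;

        return ((dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
            i * dof->dofh_secsize));
}
#endif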
static void
dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
dof_sec_t *str_sec;
dof_provider_t *provider;
char *strtab;
dtrace_helper_provdesc_t dhpv;
dtrace_meta_t *meta = dtrace_meta_pid;
dtrace_mops_t *mops = &meta->dtm_mops;
provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
provider->dofpv_strtab * dof->dofh_secsize);
strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
/*
* Create the provider.
*/
dtrace_dofprov2hprov(&dhpv, provider, strtab);
mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
meta->dtm_count--;
}
static void
dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
{
uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
dof_hdr_t *dof = (dof_hdr_t *)daddr;
int i;
ASSERT(MUTEX_HELD(&dtrace_meta_lock));
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_PROVIDER)
continue;
dtrace_helper_provider_remove_one(dhp, sec, pid);
}
}
/*
* DTrace Meta Provider-to-Framework API Functions
*
* These functions implement the Meta Provider-to-Framework API, as described
* in <sys/dtrace.h>.
*/
int
dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
dtrace_meta_provider_id_t *idp)
{
dtrace_meta_t *meta;
dtrace_helpers_t *help, *next;
int i;
*idp = DTRACE_METAPROVNONE;
/*
* We strictly don't need the name, but we hold onto it for
* debuggability. All hail error queues!
*/
if (name == NULL) {
cmn_err(CE_WARN, "failed to register meta-provider: "
"invalid name");
return (EINVAL);
}
if (mops == NULL ||
mops->dtms_create_probe == NULL ||
mops->dtms_provide_pid == NULL ||
mops->dtms_remove_pid == NULL) {
cmn_err(CE_WARN, "failed to register meta-register %s: "
"invalid ops", name);
return (EINVAL);
}
meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
meta->dtm_mops = *mops;
meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
(void) strcpy(meta->dtm_name, name);
meta->dtm_arg = arg;
mutex_enter(&dtrace_meta_lock);
mutex_enter(&dtrace_lock);
if (dtrace_meta_pid != NULL) {
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_meta_lock);
cmn_err(CE_WARN, "failed to register meta-register %s: "
"user-land meta-provider exists", name);
kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
kmem_free(meta, sizeof (dtrace_meta_t));
return (EINVAL);
}
dtrace_meta_pid = meta;
*idp = (dtrace_meta_provider_id_t)meta;
/*
* If there are providers and probes ready to go, pass them
* off to the new meta provider now.
*/
help = dtrace_deferred_pid;
dtrace_deferred_pid = NULL;
mutex_exit(&dtrace_lock);
while (help != NULL) {
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
help->dthps_pid);
}
next = help->dthps_next;
help->dthps_next = NULL;
help->dthps_prev = NULL;
help->dthps_deferred = 0;
help = next;
}
mutex_exit(&dtrace_meta_lock);
return (0);
}
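/*
 * Editor's note: a hedged sketch (not part of the original source) of how a
 * meta-provider might register itself.  The mymeta_* names are hypothetical;
 * the callback shapes follow the call sites in dtrace_helper_provide_one()
 * and dtrace_helper_provider_remove_one(), and all three ops must be
 * non-NULL or registration fails with EINVAL.
 */
#if 0
static void *mymeta_provide_pid(void *, dtrace_helper_provdesc_t *, pid_t);
static void mymeta_remove_pid(void *, dtrace_helper_provdesc_t *, pid_t);
static void mymeta_create_probe(void *, void *, dtrace_helper_probedesc_t *);

static dtrace_mops_t mymeta_mops = {
        .dtms_create_probe = mymeta_create_probe,
        .dtms_provide_pid = mymeta_provide_pid,
        .dtms_remove_pid = mymeta_remove_pid,
};

static dtrace_meta_provider_id_t mymeta_id;

static int
mymeta_attach(void)
{
        return (dtrace_meta_register("mymeta", &mymeta_mops, NULL,
            &mymeta_id));
}
#endif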
int
dtrace_meta_unregister(dtrace_meta_provider_id_t id)
{
dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
mutex_enter(&dtrace_meta_lock);
mutex_enter(&dtrace_lock);
if (old == dtrace_meta_pid) {
pp = &dtrace_meta_pid;
} else {
panic("attempt to unregister non-existent "
"dtrace meta-provider %p\n", (void *)old);
}
if (old->dtm_count != 0) {
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_meta_lock);
return (EBUSY);
}
*pp = NULL;
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_meta_lock);
kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
kmem_free(old, sizeof (dtrace_meta_t));
return (0);
}
/*
* DTrace DIF Object Functions
*/
static int
dtrace_difo_err(uint_t pc, const char *format, ...)
{
if (dtrace_err_verbose) {
va_list alist;
(void) uprintf("dtrace DIF object error: [%u]: ", pc);
va_start(alist, format);
(void) vuprintf(format, alist);
va_end(alist);
}
#ifdef DTRACE_ERRDEBUG
dtrace_errdebug(format);
#endif
return (1);
}
/*
* Validate a DTrace DIF object by checking the IR instructions. The following
* rules are currently enforced by dtrace_difo_validate():
*
* 1. Each instruction must have a valid opcode
* 2. Each register, string, variable, or subroutine reference must be valid
* 3. No instruction can modify register %r0 (must be zero)
* 4. All instruction reserved bits must be set to zero
* 5. The last instruction must be a "ret" instruction
* 6. All branch targets must reference a valid instruction _after_ the branch
*/
static int
dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
cred_t *cr)
{
int err = 0, i;
int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
int kcheckload;
uint_t pc;
kcheckload = cr == NULL ||
(vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
dp->dtdo_destructive = 0;
for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
dif_instr_t instr = dp->dtdo_buf[pc];
uint_t r1 = DIF_INSTR_R1(instr);
uint_t r2 = DIF_INSTR_R2(instr);
uint_t rd = DIF_INSTR_RD(instr);
uint_t rs = DIF_INSTR_RS(instr);
uint_t label = DIF_INSTR_LABEL(instr);
uint_t v = DIF_INSTR_VAR(instr);
uint_t subr = DIF_INSTR_SUBR(instr);
uint_t type = DIF_INSTR_TYPE(instr);
uint_t op = DIF_INSTR_OP(instr);
switch (op) {
case DIF_OP_OR:
case DIF_OP_XOR:
case DIF_OP_AND:
case DIF_OP_SLL:
case DIF_OP_SRL:
case DIF_OP_SRA:
case DIF_OP_SUB:
case DIF_OP_ADD:
case DIF_OP_MUL:
case DIF_OP_SDIV:
case DIF_OP_UDIV:
case DIF_OP_SREM:
case DIF_OP_UREM:
case DIF_OP_COPYS:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_NOT:
case DIF_OP_MOV:
case DIF_OP_ALLOCS:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_LDSB:
case DIF_OP_LDSH:
case DIF_OP_LDSW:
case DIF_OP_LDUB:
case DIF_OP_LDUH:
case DIF_OP_LDUW:
case DIF_OP_LDX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
if (kcheckload)
dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
break;
case DIF_OP_RLDSB:
case DIF_OP_RLDSH:
case DIF_OP_RLDSW:
case DIF_OP_RLDUB:
case DIF_OP_RLDUH:
case DIF_OP_RLDUW:
case DIF_OP_RLDX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_ULDSB:
case DIF_OP_ULDSH:
case DIF_OP_ULDSW:
case DIF_OP_ULDUB:
case DIF_OP_ULDUH:
case DIF_OP_ULDUW:
case DIF_OP_ULDX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_STB:
case DIF_OP_STH:
case DIF_OP_STW:
case DIF_OP_STX:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to 0 address\n");
break;
case DIF_OP_CMP:
case DIF_OP_SCMP:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rd != 0)
err += efunc(pc, "non-zero reserved bits\n");
break;
case DIF_OP_TST:
if (r1 >= nregs)
err += efunc(pc, "invalid register %u\n", r1);
if (r2 != 0 || rd != 0)
err += efunc(pc, "non-zero reserved bits\n");
break;
case DIF_OP_BA:
case DIF_OP_BE:
case DIF_OP_BNE:
case DIF_OP_BG:
case DIF_OP_BGU:
case DIF_OP_BGE:
case DIF_OP_BGEU:
case DIF_OP_BL:
case DIF_OP_BLU:
case DIF_OP_BLE:
case DIF_OP_BLEU:
if (label >= dp->dtdo_len) {
err += efunc(pc, "invalid branch target %u\n",
label);
}
if (label <= pc) {
err += efunc(pc, "backward branch to %u\n",
label);
}
break;
case DIF_OP_RET:
if (r1 != 0 || r2 != 0)
err += efunc(pc, "non-zero reserved bits\n");
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
break;
case DIF_OP_NOP:
case DIF_OP_POPTS:
case DIF_OP_FLUSHTS:
if (r1 != 0 || r2 != 0 || rd != 0)
err += efunc(pc, "non-zero reserved bits\n");
break;
case DIF_OP_SETX:
if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
err += efunc(pc, "invalid integer ref %u\n",
DIF_INSTR_INTEGER(instr));
}
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_SETS:
if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
err += efunc(pc, "invalid string ref %u\n",
DIF_INSTR_STRING(instr));
}
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_LDGA:
case DIF_OP_LDTA:
if (r1 > DIF_VAR_ARRAY_MAX)
err += efunc(pc, "invalid array %u\n", r1);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_LDGS:
case DIF_OP_LDTS:
case DIF_OP_LDLS:
case DIF_OP_LDGAA:
case DIF_OP_LDTAA:
if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
err += efunc(pc, "invalid variable %u\n", v);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
break;
case DIF_OP_STGS:
case DIF_OP_STTS:
case DIF_OP_STLS:
case DIF_OP_STGAA:
case DIF_OP_STTAA:
if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
err += efunc(pc, "invalid variable %u\n", v);
if (rs >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
break;
case DIF_OP_CALL:
if (subr > DIF_SUBR_MAX)
err += efunc(pc, "invalid subr %u\n", subr);
if (rd >= nregs)
err += efunc(pc, "invalid register %u\n", rd);
if (rd == 0)
err += efunc(pc, "cannot write to %r0\n");
if (subr == DIF_SUBR_COPYOUT ||
subr == DIF_SUBR_COPYOUTSTR) {
dp->dtdo_destructive = 1;
}
break;
case DIF_OP_PUSHTR:
if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
err += efunc(pc, "invalid ref type %u\n", type);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rs >= nregs)
err += efunc(pc, "invalid register %u\n", rs);
break;
case DIF_OP_PUSHTV:
if (type != DIF_TYPE_CTF)
err += efunc(pc, "invalid val type %u\n", type);
if (r2 >= nregs)
err += efunc(pc, "invalid register %u\n", r2);
if (rs >= nregs)
err += efunc(pc, "invalid register %u\n", rs);
break;
default:
err += efunc(pc, "invalid opcode %u\n",
DIF_INSTR_OP(instr));
}
}
if (dp->dtdo_len != 0 &&
DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
err += efunc(dp->dtdo_len - 1,
"expected 'ret' as last DIF instruction\n");
}
if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
/*
* If we're not returning by reference, the size must be either
* 0 or the size of one of the base types.
*/
switch (dp->dtdo_rtype.dtdt_size) {
case 0:
case sizeof (uint8_t):
case sizeof (uint16_t):
case sizeof (uint32_t):
case sizeof (uint64_t):
break;
default:
err += efunc(dp->dtdo_len - 1, "bad return size");
}
}
for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
dtrace_diftype_t *vt, *et;
uint_t id, ndx;
if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
v->dtdv_scope != DIFV_SCOPE_THREAD &&
v->dtdv_scope != DIFV_SCOPE_LOCAL) {
err += efunc(i, "unrecognized variable scope %d\n",
v->dtdv_scope);
break;
}
if (v->dtdv_kind != DIFV_KIND_ARRAY &&
v->dtdv_kind != DIFV_KIND_SCALAR) {
err += efunc(i, "unrecognized variable type %d\n",
v->dtdv_kind);
break;
}
if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
err += efunc(i, "%d exceeds variable id limit\n", id);
break;
}
if (id < DIF_VAR_OTHER_UBASE)
continue;
/*
* For user-defined variables, we need to check that this
* definition is identical to any previous definition that we
* encountered.
*/
ndx = id - DIF_VAR_OTHER_UBASE;
switch (v->dtdv_scope) {
case DIFV_SCOPE_GLOBAL:
if (ndx < vstate->dtvs_nglobals) {
dtrace_statvar_t *svar;
if ((svar = vstate->dtvs_globals[ndx]) != NULL)
existing = &svar->dtsv_var;
}
break;
case DIFV_SCOPE_THREAD:
if (ndx < vstate->dtvs_ntlocals)
existing = &vstate->dtvs_tlocals[ndx];
break;
case DIFV_SCOPE_LOCAL:
if (ndx < vstate->dtvs_nlocals) {
dtrace_statvar_t *svar;
if ((svar = vstate->dtvs_locals[ndx]) != NULL)
existing = &svar->dtsv_var;
}
break;
}
vt = &v->dtdv_type;
if (vt->dtdt_flags & DIF_TF_BYREF) {
if (vt->dtdt_size == 0) {
err += efunc(i, "zero-sized variable\n");
break;
}
if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
vt->dtdt_size > dtrace_global_maxsize) {
err += efunc(i, "oversized by-ref global\n");
break;
}
}
if (existing == NULL || existing->dtdv_id == 0)
continue;
ASSERT(existing->dtdv_id == v->dtdv_id);
ASSERT(existing->dtdv_scope == v->dtdv_scope);
if (existing->dtdv_kind != v->dtdv_kind)
err += efunc(i, "%d changed variable kind\n", id);
et = &existing->dtdv_type;
if (vt->dtdt_flags != et->dtdt_flags) {
err += efunc(i, "%d changed variable type flags\n", id);
break;
}
if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
err += efunc(i, "%d changed variable type size\n", id);
break;
}
}
return (err);
}
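/*
 * Editor's note: a minimal sketch (not part of the original source) showing
 * how an individual DIF instruction is decoded into the fields checked
 * above.  The helper name is hypothetical; the DIF_INSTR_*() accessors are
 * the same ones used by dtrace_difo_validate().
 */
#if 0
static int
dif_alu_regs_ok(dif_instr_t instr, uint_t nregs)
{
        uint_t r1 = DIF_INSTR_R1(instr);
        uint_t r2 = DIF_INSTR_R2(instr);
        uint_t rd = DIF_INSTR_RD(instr);

        /*
         * Rules 2 and 3 for a simple ALU opcode such as DIF_OP_ADD: all
         * registers must be in range and %r0 must not be written.
         */
        return (r1 < nregs && r2 < nregs && rd < nregs && rd != 0);
}
#endif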
/*
* Validate a DTrace DIF object that is to be used as a helper. Helpers
* are much more constrained than normal DIFOs. Specifically, they may
* not:
*
* 1. Make calls to subroutines other than copyin(), copyinstr() or
* miscellaneous string routines
* 2. Access DTrace variables other than the args[] array, and the
* curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
* 3. Have thread-local variables.
* 4. Have dynamic variables.
*/
static int
dtrace_difo_validate_helper(dtrace_difo_t *dp)
{
int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
int err = 0;
uint_t pc;
for (pc = 0; pc < dp->dtdo_len; pc++) {
dif_instr_t instr = dp->dtdo_buf[pc];
uint_t v = DIF_INSTR_VAR(instr);
uint_t subr = DIF_INSTR_SUBR(instr);
uint_t op = DIF_INSTR_OP(instr);
switch (op) {
case DIF_OP_OR:
case DIF_OP_XOR:
case DIF_OP_AND:
case DIF_OP_SLL:
case DIF_OP_SRL:
case DIF_OP_SRA:
case DIF_OP_SUB:
case DIF_OP_ADD:
case DIF_OP_MUL:
case DIF_OP_SDIV:
case DIF_OP_UDIV:
case DIF_OP_SREM:
case DIF_OP_UREM:
case DIF_OP_COPYS:
case DIF_OP_NOT:
case DIF_OP_MOV:
case DIF_OP_RLDSB:
case DIF_OP_RLDSH:
case DIF_OP_RLDSW:
case DIF_OP_RLDUB:
case DIF_OP_RLDUH:
case DIF_OP_RLDUW:
case DIF_OP_RLDX:
case DIF_OP_ULDSB:
case DIF_OP_ULDSH:
case DIF_OP_ULDSW:
case DIF_OP_ULDUB:
case DIF_OP_ULDUH:
case DIF_OP_ULDUW:
case DIF_OP_ULDX:
case DIF_OP_STB:
case DIF_OP_STH:
case DIF_OP_STW:
case DIF_OP_STX:
case DIF_OP_ALLOCS:
case DIF_OP_CMP:
case DIF_OP_SCMP:
case DIF_OP_TST:
case DIF_OP_BA:
case DIF_OP_BE:
case DIF_OP_BNE:
case DIF_OP_BG:
case DIF_OP_BGU:
case DIF_OP_BGE:
case DIF_OP_BGEU:
case DIF_OP_BL:
case DIF_OP_BLU:
case DIF_OP_BLE:
case DIF_OP_BLEU:
case DIF_OP_RET:
case DIF_OP_NOP:
case DIF_OP_POPTS:
case DIF_OP_FLUSHTS:
case DIF_OP_SETX:
case DIF_OP_SETS:
case DIF_OP_LDGA:
case DIF_OP_LDLS:
case DIF_OP_STGS:
case DIF_OP_STLS:
case DIF_OP_PUSHTR:
case DIF_OP_PUSHTV:
break;
case DIF_OP_LDGS:
if (v >= DIF_VAR_OTHER_UBASE)
break;
if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
break;
if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
v == DIF_VAR_PPID || v == DIF_VAR_TID ||
v == DIF_VAR_EXECARGS ||
v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
v == DIF_VAR_UID || v == DIF_VAR_GID)
break;
err += efunc(pc, "illegal variable %u\n", v);
break;
case DIF_OP_LDTA:
case DIF_OP_LDTS:
case DIF_OP_LDGAA:
case DIF_OP_LDTAA:
err += efunc(pc, "illegal dynamic variable load\n");
break;
case DIF_OP_STTS:
case DIF_OP_STGAA:
case DIF_OP_STTAA:
err += efunc(pc, "illegal dynamic variable store\n");
break;
case DIF_OP_CALL:
if (subr == DIF_SUBR_ALLOCA ||
subr == DIF_SUBR_BCOPY ||
subr == DIF_SUBR_COPYIN ||
subr == DIF_SUBR_COPYINTO ||
subr == DIF_SUBR_COPYINSTR ||
subr == DIF_SUBR_INDEX ||
subr == DIF_SUBR_INET_NTOA ||
subr == DIF_SUBR_INET_NTOA6 ||
subr == DIF_SUBR_INET_NTOP ||
subr == DIF_SUBR_LLTOSTR ||
subr == DIF_SUBR_RINDEX ||
subr == DIF_SUBR_STRCHR ||
subr == DIF_SUBR_STRJOIN ||
subr == DIF_SUBR_STRRCHR ||
subr == DIF_SUBR_STRSTR ||
subr == DIF_SUBR_HTONS ||
subr == DIF_SUBR_HTONL ||
subr == DIF_SUBR_HTONLL ||
subr == DIF_SUBR_NTOHS ||
subr == DIF_SUBR_NTOHL ||
subr == DIF_SUBR_NTOHLL ||
subr == DIF_SUBR_MEMREF ||
subr == DIF_SUBR_TYPEREF)
break;
err += efunc(pc, "invalid subr %u\n", subr);
break;
default:
err += efunc(pc, "invalid opcode %u\n",
DIF_INSTR_OP(instr));
}
}
return (err);
}
/*
* Returns 1 if the expression in the DIF object can be cached on a per-thread
* basis; 0 if not.
*/
static int
dtrace_difo_cacheable(dtrace_difo_t *dp)
{
int i;
if (dp == NULL)
return (0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
continue;
switch (v->dtdv_id) {
case DIF_VAR_CURTHREAD:
case DIF_VAR_PID:
case DIF_VAR_TID:
case DIF_VAR_EXECARGS:
case DIF_VAR_EXECNAME:
case DIF_VAR_ZONENAME:
break;
default:
return (0);
}
}
/*
* This DIF object may be cacheable. Now we need to look for any
* array loading instructions, any memory loading instructions, or
* any stores to thread-local variables.
*/
for (i = 0; i < dp->dtdo_len; i++) {
uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
(op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
(op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
op == DIF_OP_LDGA || op == DIF_OP_STTS)
return (0);
}
return (1);
}
static void
dtrace_difo_hold(dtrace_difo_t *dp)
{
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
dp->dtdo_refcnt++;
ASSERT(dp->dtdo_refcnt != 0);
/*
* We need to check this DIF object for references to the variable
* DIF_VAR_VTIMESTAMP.
*/
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
continue;
if (dtrace_vtime_references++ == 0)
dtrace_vtime_enable();
}
}
/*
* This routine calculates the dynamic variable chunksize for a given DIF
* object. The calculation is not fool-proof, and can probably be tricked by
* malicious DIF -- but it works for all compiler-generated DIF. Because this
* calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
* if a dynamic variable size exceeds the chunksize.
*/
static void
dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
uint64_t sval = 0;
dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
const dif_instr_t *text = dp->dtdo_buf;
uint_t pc, srd = 0;
uint_t ttop = 0;
size_t size, ksize;
uint_t id, i;
for (pc = 0; pc < dp->dtdo_len; pc++) {
dif_instr_t instr = text[pc];
uint_t op = DIF_INSTR_OP(instr);
uint_t rd = DIF_INSTR_RD(instr);
uint_t r1 = DIF_INSTR_R1(instr);
uint_t nkeys = 0;
uchar_t scope = 0;
dtrace_key_t *key = tupregs;
switch (op) {
case DIF_OP_SETX:
sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
srd = rd;
continue;
case DIF_OP_STTS:
key = &tupregs[DIF_DTR_NREGS];
key[0].dttk_size = 0;
key[1].dttk_size = 0;
nkeys = 2;
scope = DIFV_SCOPE_THREAD;
break;
case DIF_OP_STGAA:
case DIF_OP_STTAA:
nkeys = ttop;
if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
key[nkeys++].dttk_size = 0;
key[nkeys++].dttk_size = 0;
if (op == DIF_OP_STTAA) {
scope = DIFV_SCOPE_THREAD;
} else {
scope = DIFV_SCOPE_GLOBAL;
}
break;
case DIF_OP_PUSHTR:
if (ttop == DIF_DTR_NREGS)
return;
if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
/*
* If the register for the size of the "pushtr"
* is %r0 (or the value is 0) and the type is
* a string, we'll use the system-wide default
* string size.
*/
tupregs[ttop++].dttk_size =
dtrace_strsize_default;
} else {
if (srd == 0)
return;
tupregs[ttop++].dttk_size = sval;
}
break;
case DIF_OP_PUSHTV:
if (ttop == DIF_DTR_NREGS)
return;
tupregs[ttop++].dttk_size = 0;
break;
case DIF_OP_FLUSHTS:
ttop = 0;
break;
case DIF_OP_POPTS:
if (ttop != 0)
ttop--;
break;
}
sval = 0;
srd = 0;
if (nkeys == 0)
continue;
/*
* We have a dynamic variable allocation; calculate its size.
*/
for (ksize = 0, i = 0; i < nkeys; i++)
ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
size = sizeof (dtrace_dynvar_t);
size += sizeof (dtrace_key_t) * (nkeys - 1);
size += ksize;
/*
* Now we need to determine the size of the stored data.
*/
id = DIF_INSTR_VAR(instr);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_id == id && v->dtdv_scope == scope) {
size += v->dtdv_type.dtdt_size;
break;
}
}
if (i == dp->dtdo_varlen)
return;
/*
* We have the size. If this is larger than the chunk size
* for our dynamic variable state, reset the chunk size.
*/
size = P2ROUNDUP(size, sizeof (uint64_t));
if (size > vstate->dtvs_dynvars.dtds_chunksize)
vstate->dtvs_dynvars.dtds_chunksize = size;
}
}
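/*
 * Editor's note: a worked trace (not part of the original source) of the
 * size computed above for a plain thread-local store (DIF_OP_STTS, e.g.
 * "self->x = ..."), whose two keys -- the thread pointer and the variable
 * id -- contribute no key data.  The variable v stands for the matching
 * vartab entry, as in the loop above.
 */
#if 0
        size_t vsize = v->dtdv_type.dtdt_size; /* stored-value size for self->x */
        size_t size;

        size = sizeof (dtrace_dynvar_t) +
            sizeof (dtrace_key_t) * (2 - 1) +   /* nkeys - 1; nkeys == 2 */
            0 +                                 /* ksize: both key sizes are 0 */
            vsize;
        size = P2ROUNDUP(size, sizeof (uint64_t));      /* candidate chunksize */
#endif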
static void
dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
int i, oldsvars, osz, nsz, otlocals, ntlocals;
uint_t id;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
dtrace_statvar_t *svar, ***svarp = NULL;
size_t dsize = 0;
uint8_t scope = v->dtdv_scope;
int *np = NULL;
if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
continue;
id -= DIF_VAR_OTHER_UBASE;
switch (scope) {
case DIFV_SCOPE_THREAD:
while (id >= (otlocals = vstate->dtvs_ntlocals)) {
dtrace_difv_t *tlocals;
if ((ntlocals = (otlocals << 1)) == 0)
ntlocals = 1;
osz = otlocals * sizeof (dtrace_difv_t);
nsz = ntlocals * sizeof (dtrace_difv_t);
tlocals = kmem_zalloc(nsz, KM_SLEEP);
if (osz != 0) {
bcopy(vstate->dtvs_tlocals,
tlocals, osz);
kmem_free(vstate->dtvs_tlocals, osz);
}
vstate->dtvs_tlocals = tlocals;
vstate->dtvs_ntlocals = ntlocals;
}
vstate->dtvs_tlocals[id] = *v;
continue;
case DIFV_SCOPE_LOCAL:
np = &vstate->dtvs_nlocals;
svarp = &vstate->dtvs_locals;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
dsize = NCPU * (v->dtdv_type.dtdt_size +
sizeof (uint64_t));
else
dsize = NCPU * sizeof (uint64_t);
break;
case DIFV_SCOPE_GLOBAL:
np = &vstate->dtvs_nglobals;
svarp = &vstate->dtvs_globals;
if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
dsize = v->dtdv_type.dtdt_size +
sizeof (uint64_t);
break;
default:
ASSERT(0);
}
while (id >= (oldsvars = *np)) {
dtrace_statvar_t **statics;
int newsvars, oldsize, newsize;
if ((newsvars = (oldsvars << 1)) == 0)
newsvars = 1;
oldsize = oldsvars * sizeof (dtrace_statvar_t *);
newsize = newsvars * sizeof (dtrace_statvar_t *);
statics = kmem_zalloc(newsize, KM_SLEEP);
if (oldsize != 0) {
bcopy(*svarp, statics, oldsize);
kmem_free(*svarp, oldsize);
}
*svarp = statics;
*np = newsvars;
}
if ((svar = (*svarp)[id]) == NULL) {
svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
svar->dtsv_var = *v;
if ((svar->dtsv_size = dsize) != 0) {
svar->dtsv_data = (uint64_t)(uintptr_t)
kmem_zalloc(dsize, KM_SLEEP);
}
(*svarp)[id] = svar;
}
svar->dtsv_refcnt++;
}
dtrace_difo_chunksize(dp, vstate);
dtrace_difo_hold(dp);
}
static dtrace_difo_t *
dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
dtrace_difo_t *new;
size_t sz;
ASSERT(dp->dtdo_buf != NULL);
ASSERT(dp->dtdo_refcnt != 0);
new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
ASSERT(dp->dtdo_buf != NULL);
sz = dp->dtdo_len * sizeof (dif_instr_t);
new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
new->dtdo_len = dp->dtdo_len;
if (dp->dtdo_strtab != NULL) {
ASSERT(dp->dtdo_strlen != 0);
new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
new->dtdo_strlen = dp->dtdo_strlen;
}
if (dp->dtdo_inttab != NULL) {
ASSERT(dp->dtdo_intlen != 0);
sz = dp->dtdo_intlen * sizeof (uint64_t);
new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
new->dtdo_intlen = dp->dtdo_intlen;
}
if (dp->dtdo_vartab != NULL) {
ASSERT(dp->dtdo_varlen != 0);
sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
new->dtdo_varlen = dp->dtdo_varlen;
}
dtrace_difo_init(new, vstate);
return (new);
}
static void
dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
int i;
ASSERT(dp->dtdo_refcnt == 0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
dtrace_statvar_t *svar, **svarp = NULL;
uint_t id;
uint8_t scope = v->dtdv_scope;
int *np = NULL;
switch (scope) {
case DIFV_SCOPE_THREAD:
continue;
case DIFV_SCOPE_LOCAL:
np = &vstate->dtvs_nlocals;
svarp = vstate->dtvs_locals;
break;
case DIFV_SCOPE_GLOBAL:
np = &vstate->dtvs_nglobals;
svarp = vstate->dtvs_globals;
break;
default:
ASSERT(0);
}
if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
continue;
id -= DIF_VAR_OTHER_UBASE;
ASSERT(id < *np);
svar = svarp[id];
ASSERT(svar != NULL);
ASSERT(svar->dtsv_refcnt > 0);
if (--svar->dtsv_refcnt > 0)
continue;
if (svar->dtsv_size != 0) {
ASSERT(svar->dtsv_data != 0);
kmem_free((void *)(uintptr_t)svar->dtsv_data,
svar->dtsv_size);
}
kmem_free(svar, sizeof (dtrace_statvar_t));
svarp[id] = NULL;
}
if (dp->dtdo_buf != NULL)
kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
if (dp->dtdo_inttab != NULL)
kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
if (dp->dtdo_strtab != NULL)
kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
if (dp->dtdo_vartab != NULL)
kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
kmem_free(dp, sizeof (dtrace_difo_t));
}
static void
dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
{
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp->dtdo_refcnt != 0);
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
continue;
ASSERT(dtrace_vtime_references > 0);
if (--dtrace_vtime_references == 0)
dtrace_vtime_disable();
}
if (--dp->dtdo_refcnt == 0)
dtrace_difo_destroy(dp, vstate);
}
/*
* DTrace Format Functions
*/
static uint16_t
dtrace_format_add(dtrace_state_t *state, char *str)
{
char *fmt, **new;
uint16_t ndx, len = strlen(str) + 1;
fmt = kmem_zalloc(len, KM_SLEEP);
bcopy(str, fmt, len);
for (ndx = 0; ndx < state->dts_nformats; ndx++) {
if (state->dts_formats[ndx] == NULL) {
state->dts_formats[ndx] = fmt;
return (ndx + 1);
}
}
if (state->dts_nformats == USHRT_MAX) {
/*
* This is only likely if a denial-of-service attack is being
* attempted. As such, it's okay to fail silently here.
*/
kmem_free(fmt, len);
return (0);
}
/*
* For simplicity, we always resize the formats array to be exactly the
* number of formats.
*/
ndx = state->dts_nformats++;
new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
if (state->dts_formats != NULL) {
ASSERT(ndx != 0);
bcopy(state->dts_formats, new, ndx * sizeof (char *));
kmem_free(state->dts_formats, ndx * sizeof (char *));
}
state->dts_formats = new;
state->dts_formats[ndx] = fmt;
return (ndx + 1);
}
static void
dtrace_format_remove(dtrace_state_t *state, uint16_t format)
{
char *fmt;
ASSERT(state->dts_formats != NULL);
ASSERT(format <= state->dts_nformats);
ASSERT(state->dts_formats[format - 1] != NULL);
fmt = state->dts_formats[format - 1];
kmem_free(fmt, strlen(fmt) + 1);
state->dts_formats[format - 1] = NULL;
}
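/*
 * Editor's note: a short usage sketch (not part of the original source) for
 * the two routines above.  Format handles are 1-based indices into
 * dts_formats; 0 means dtrace_format_add() failed (table full).
 */
#if 0
        uint16_t fmt;

        if ((fmt = dtrace_format_add(state, "cpu %d took %d usecs\n")) == 0)
                return (ENOSPC);        /* hypothetical error handling */
        /* ... record fmt in the action, as dtrace_ecb_action_add() does ... */
        dtrace_format_remove(state, fmt);
#endif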
static void
dtrace_format_destroy(dtrace_state_t *state)
{
int i;
if (state->dts_nformats == 0) {
ASSERT(state->dts_formats == NULL);
return;
}
ASSERT(state->dts_formats != NULL);
for (i = 0; i < state->dts_nformats; i++) {
char *fmt = state->dts_formats[i];
if (fmt == NULL)
continue;
kmem_free(fmt, strlen(fmt) + 1);
}
kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
state->dts_nformats = 0;
state->dts_formats = NULL;
}
/*
* DTrace Predicate Functions
*/
static dtrace_predicate_t *
dtrace_predicate_create(dtrace_difo_t *dp)
{
dtrace_predicate_t *pred;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp->dtdo_refcnt != 0);
pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
pred->dtp_difo = dp;
pred->dtp_refcnt = 1;
if (!dtrace_difo_cacheable(dp))
return (pred);
if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
/*
* This is only theoretically possible -- we have had 2^32
* cacheable predicates on this machine. We cannot allow any
* more predicates to become cacheable: as unlikely as it is,
* there may be a thread caching a (now stale) predicate cache
* ID. (N.B.: the temptation is being successfully resisted to
* have this cmn_err() "Holy shit -- we executed this code!")
*/
return (pred);
}
pred->dtp_cacheid = dtrace_predcache_id++;
return (pred);
}
static void
dtrace_predicate_hold(dtrace_predicate_t *pred)
{
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
ASSERT(pred->dtp_refcnt > 0);
pred->dtp_refcnt++;
}
static void
dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
{
dtrace_difo_t *dp = pred->dtp_difo;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
ASSERT(pred->dtp_refcnt > 0);
if (--pred->dtp_refcnt == 0) {
dtrace_difo_release(pred->dtp_difo, vstate);
kmem_free(pred, sizeof (dtrace_predicate_t));
}
}
/*
* DTrace Action Description Functions
*/
static dtrace_actdesc_t *
dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
uint64_t uarg, uint64_t arg)
{
dtrace_actdesc_t *act;
#if defined(sun)
ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
#endif
act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
act->dtad_kind = kind;
act->dtad_ntuple = ntuple;
act->dtad_uarg = uarg;
act->dtad_arg = arg;
act->dtad_refcnt = 1;
return (act);
}
static void
dtrace_actdesc_hold(dtrace_actdesc_t *act)
{
ASSERT(act->dtad_refcnt >= 1);
act->dtad_refcnt++;
}
static void
dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
{
dtrace_actkind_t kind = act->dtad_kind;
dtrace_difo_t *dp;
ASSERT(act->dtad_refcnt >= 1);
if (--act->dtad_refcnt != 0)
return;
if ((dp = act->dtad_difo) != NULL)
dtrace_difo_release(dp, vstate);
if (DTRACEACT_ISPRINTFLIKE(kind)) {
char *str = (char *)(uintptr_t)act->dtad_arg;
#if defined(sun)
ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
(str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
#endif
if (str != NULL)
kmem_free(str, strlen(str) + 1);
}
kmem_free(act, sizeof (dtrace_actdesc_t));
}
/*
* DTrace ECB Functions
*/
static dtrace_ecb_t *
dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
{
dtrace_ecb_t *ecb;
dtrace_epid_t epid;
ASSERT(MUTEX_HELD(&dtrace_lock));
ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
ecb->dte_predicate = NULL;
ecb->dte_probe = probe;
/*
* The default size is the size of the default action: recording
* the epid.
*/
ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
ecb->dte_alignment = sizeof (dtrace_epid_t);
epid = state->dts_epid++;
if (epid - 1 >= state->dts_necbs) {
dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
int necbs = state->dts_necbs << 1;
ASSERT(epid == state->dts_necbs + 1);
if (necbs == 0) {
ASSERT(oecbs == NULL);
necbs = 1;
}
ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
if (oecbs != NULL)
bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
dtrace_membar_producer();
state->dts_ecbs = ecbs;
if (oecbs != NULL) {
/*
* If this state is active, we must dtrace_sync()
* before we can free the old dts_ecbs array: we're
* coming in hot, and there may be active ring
* buffer processing (which indexes into the dts_ecbs
* array) on another CPU.
*/
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
dtrace_sync();
kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
}
dtrace_membar_producer();
state->dts_necbs = necbs;
}
ecb->dte_state = state;
ASSERT(state->dts_ecbs[epid - 1] == NULL);
dtrace_membar_producer();
state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
return (ecb);
}
static void
dtrace_ecb_enable(dtrace_ecb_t *ecb)
{
dtrace_probe_t *probe = ecb->dte_probe;
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(ecb->dte_next == NULL);
if (probe == NULL) {
/*
* This is the NULL probe -- there's nothing to do.
*/
return;
}
if (probe->dtpr_ecb == NULL) {
dtrace_provider_t *prov = probe->dtpr_provider;
/*
* We're the first ECB on this probe.
*/
probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
if (ecb->dte_predicate != NULL)
probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg);
} else {
/*
* This probe is already active. Swing the last pointer to
* point to the new ECB, and issue a dtrace_sync() to assure
* that all CPUs have seen the change.
*/
ASSERT(probe->dtpr_ecb_last != NULL);
probe->dtpr_ecb_last->dte_next = ecb;
probe->dtpr_ecb_last = ecb;
probe->dtpr_predcache = 0;
dtrace_sync();
}
}
static void
dtrace_ecb_resize(dtrace_ecb_t *ecb)
{
uint32_t maxalign = sizeof (dtrace_epid_t);
uint32_t align = sizeof (uint8_t), offs, diff;
dtrace_action_t *act;
int wastuple = 0;
uint32_t aggbase = UINT32_MAX;
dtrace_state_t *state = ecb->dte_state;
/*
* If we record anything, we always record the epid. (And we always
* record it first.)
*/
offs = sizeof (dtrace_epid_t);
ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
dtrace_recdesc_t *rec = &act->dta_rec;
if ((align = rec->dtrd_alignment) > maxalign)
maxalign = align;
if (!wastuple && act->dta_intuple) {
/*
* This is the first record in a tuple. Align the
* offset to be at offset 4 in an 8-byte aligned
* block.
*/
diff = offs + sizeof (dtrace_aggid_t);
if ((diff = (diff & (sizeof (uint64_t) - 1))))
offs += sizeof (uint64_t) - diff;
aggbase = offs - sizeof (dtrace_aggid_t);
ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
}
/*LINTED*/
if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
/*
* The current offset is not properly aligned; align it.
*/
offs += align - diff;
}
rec->dtrd_offset = offs;
if (offs + rec->dtrd_size > ecb->dte_needed) {
ecb->dte_needed = offs + rec->dtrd_size;
if (ecb->dte_needed > state->dts_needed)
state->dts_needed = ecb->dte_needed;
}
if (DTRACEACT_ISAGG(act->dta_kind)) {
dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
dtrace_action_t *first = agg->dtag_first, *prev;
ASSERT(rec->dtrd_size != 0 && first != NULL);
ASSERT(wastuple);
ASSERT(aggbase != UINT32_MAX);
agg->dtag_base = aggbase;
while ((prev = first->dta_prev) != NULL &&
DTRACEACT_ISAGG(prev->dta_kind)) {
agg = (dtrace_aggregation_t *)prev;
first = agg->dtag_first;
}
if (prev != NULL) {
offs = prev->dta_rec.dtrd_offset +
prev->dta_rec.dtrd_size;
} else {
offs = sizeof (dtrace_epid_t);
}
wastuple = 0;
} else {
if (!act->dta_intuple)
ecb->dte_size = offs + rec->dtrd_size;
offs += rec->dtrd_size;
}
wastuple = act->dta_intuple;
}
if ((act = ecb->dte_action) != NULL &&
!(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
ecb->dte_size == sizeof (dtrace_epid_t)) {
/*
* If the size is still sizeof (dtrace_epid_t), then all
* actions store no data; set the size to 0.
*/
ecb->dte_alignment = maxalign;
ecb->dte_size = 0;
/*
* If the needed space is still sizeof (dtrace_epid_t), then
* all actions need no additional space; set the needed
* size to 0.
*/
if (ecb->dte_needed == sizeof (dtrace_epid_t))
ecb->dte_needed = 0;
return;
}
/*
* Set our alignment, and make sure that the dte_size and dte_needed
* are aligned to the size of an EPID.
*/
ecb->dte_alignment = maxalign;
ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
~(sizeof (dtrace_epid_t) - 1);
ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
~(sizeof (dtrace_epid_t) - 1);
ASSERT(ecb->dte_size <= ecb->dte_needed);
}
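/*
 * Editor's note: a tiny sketch (not part of the original source) of the
 * round-up-to-EPID-size idiom used above, assuming a 4-byte dtrace_epid_t:
 * 13 bytes of record data round up to 16.
 */
#if 0
        uint32_t sz = 13;

        sz = (sz + (sizeof (dtrace_epid_t) - 1)) & ~(sizeof (dtrace_epid_t) - 1);
        ASSERT(sz == 16);
#endif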
static dtrace_action_t *
dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
dtrace_aggregation_t *agg;
size_t size = sizeof (uint64_t);
int ntuple = desc->dtad_ntuple;
dtrace_action_t *act;
dtrace_recdesc_t *frec;
dtrace_aggid_t aggid;
dtrace_state_t *state = ecb->dte_state;
agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
agg->dtag_ecb = ecb;
ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
switch (desc->dtad_kind) {
case DTRACEAGG_MIN:
agg->dtag_initial = INT64_MAX;
agg->dtag_aggregate = dtrace_aggregate_min;
break;
case DTRACEAGG_MAX:
agg->dtag_initial = INT64_MIN;
agg->dtag_aggregate = dtrace_aggregate_max;
break;
case DTRACEAGG_COUNT:
agg->dtag_aggregate = dtrace_aggregate_count;
break;
case DTRACEAGG_QUANTIZE:
agg->dtag_aggregate = dtrace_aggregate_quantize;
size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
sizeof (uint64_t);
break;
case DTRACEAGG_LQUANTIZE: {
uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
agg->dtag_initial = desc->dtad_arg;
agg->dtag_aggregate = dtrace_aggregate_lquantize;
if (step == 0 || levels == 0)
goto err;
size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
break;
}
case DTRACEAGG_AVG:
agg->dtag_aggregate = dtrace_aggregate_avg;
size = sizeof (uint64_t) * 2;
break;
case DTRACEAGG_STDDEV:
agg->dtag_aggregate = dtrace_aggregate_stddev;
size = sizeof (uint64_t) * 4;
break;
case DTRACEAGG_SUM:
agg->dtag_aggregate = dtrace_aggregate_sum;
break;
default:
goto err;
}
agg->dtag_action.dta_rec.dtrd_size = size;
if (ntuple == 0)
goto err;
/*
* We must make sure that we have enough actions for the n-tuple.
*/
for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
if (DTRACEACT_ISAGG(act->dta_kind))
break;
if (--ntuple == 0) {
/*
* This is the action with which our n-tuple begins.
*/
agg->dtag_first = act;
goto success;
}
}
/*
* This n-tuple is short by ntuple elements. Return failure.
*/
ASSERT(ntuple != 0);
err:
kmem_free(agg, sizeof (dtrace_aggregation_t));
return (NULL);
success:
/*
* If the last action in the tuple has a size of zero, it's actually
* an expression argument for the aggregating action.
*/
ASSERT(ecb->dte_action_last != NULL);
act = ecb->dte_action_last;
if (act->dta_kind == DTRACEACT_DIFEXPR) {
ASSERT(act->dta_difo != NULL);
if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
agg->dtag_hasarg = 1;
}
/*
* We need to allocate an id for this aggregation.
*/
#if defined(sun)
aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
VM_BESTFIT | VM_SLEEP);
#else
aggid = alloc_unr(state->dts_aggid_arena);
#endif
if (aggid - 1 >= state->dts_naggregations) {
dtrace_aggregation_t **oaggs = state->dts_aggregations;
dtrace_aggregation_t **aggs;
int naggs = state->dts_naggregations << 1;
int onaggs = state->dts_naggregations;
ASSERT(aggid == state->dts_naggregations + 1);
if (naggs == 0) {
ASSERT(oaggs == NULL);
naggs = 1;
}
aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
if (oaggs != NULL) {
bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
kmem_free(oaggs, onaggs * sizeof (*aggs));
}
state->dts_aggregations = aggs;
state->dts_naggregations = naggs;
}
ASSERT(state->dts_aggregations[aggid - 1] == NULL);
state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
frec = &agg->dtag_first->dta_rec;
if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
frec->dtrd_alignment = sizeof (dtrace_aggid_t);
for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
ASSERT(!act->dta_intuple);
act->dta_intuple = 1;
}
return (&agg->dtag_action);
}
static void
dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
{
dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
dtrace_state_t *state = ecb->dte_state;
dtrace_aggid_t aggid = agg->dtag_id;
ASSERT(DTRACEACT_ISAGG(act->dta_kind));
#if defined(sun)
vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
#else
free_unr(state->dts_aggid_arena, aggid);
#endif
ASSERT(state->dts_aggregations[aggid - 1] == agg);
state->dts_aggregations[aggid - 1] = NULL;
kmem_free(agg, sizeof (dtrace_aggregation_t));
}
static int
dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
dtrace_action_t *action, *last;
dtrace_difo_t *dp = desc->dtad_difo;
uint32_t size = 0, align = sizeof (uint8_t), mask;
uint16_t format = 0;
dtrace_recdesc_t *rec;
dtrace_state_t *state = ecb->dte_state;
dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
uint64_t arg = desc->dtad_arg;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
if (DTRACEACT_ISAGG(desc->dtad_kind)) {
/*
* If this is an aggregating action, there must be neither
* a speculate nor a commit on the action chain.
*/
dtrace_action_t *act;
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
if (act->dta_kind == DTRACEACT_COMMIT)
return (EINVAL);
if (act->dta_kind == DTRACEACT_SPECULATE)
return (EINVAL);
}
action = dtrace_ecb_aggregation_create(ecb, desc);
if (action == NULL)
return (EINVAL);
} else {
if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
(desc->dtad_kind == DTRACEACT_DIFEXPR &&
dp != NULL && dp->dtdo_destructive)) {
state->dts_destructive = 1;
}
switch (desc->dtad_kind) {
case DTRACEACT_PRINTF:
case DTRACEACT_PRINTA:
case DTRACEACT_SYSTEM:
case DTRACEACT_FREOPEN:
/*
* We know that our arg is a string -- turn it into a
* format.
*/
if (arg == 0) {
ASSERT(desc->dtad_kind == DTRACEACT_PRINTA);
format = 0;
} else {
ASSERT(arg != 0);
#if defined(sun)
ASSERT(arg > KERNELBASE);
#endif
format = dtrace_format_add(state,
(char *)(uintptr_t)arg);
}
/*FALLTHROUGH*/
case DTRACEACT_LIBACT:
case DTRACEACT_DIFEXPR:
if (dp == NULL)
return (EINVAL);
if ((size = dp->dtdo_rtype.dtdt_size) != 0)
break;
if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
size = opt[DTRACEOPT_STRSIZE];
}
break;
case DTRACEACT_STACK:
if ((nframes = arg) == 0) {
nframes = opt[DTRACEOPT_STACKFRAMES];
ASSERT(nframes > 0);
arg = nframes;
}
size = nframes * sizeof (pc_t);
break;
case DTRACEACT_JSTACK:
if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
nframes = opt[DTRACEOPT_JSTACKFRAMES];
arg = DTRACE_USTACK_ARG(nframes, strsize);
/*FALLTHROUGH*/
case DTRACEACT_USTACK:
if (desc->dtad_kind != DTRACEACT_JSTACK &&
(nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
strsize = DTRACE_USTACK_STRSIZE(arg);
nframes = opt[DTRACEOPT_USTACKFRAMES];
ASSERT(nframes > 0);
arg = DTRACE_USTACK_ARG(nframes, strsize);
}
/*
* Save a slot for the pid.
*/
size = (nframes + 1) * sizeof (uint64_t);
size += DTRACE_USTACK_STRSIZE(arg);
size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
break;
case DTRACEACT_SYM:
case DTRACEACT_MOD:
if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
sizeof (uint64_t)) ||
(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
break;
case DTRACEACT_USYM:
case DTRACEACT_UMOD:
case DTRACEACT_UADDR:
if (dp == NULL ||
(dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
/*
* We have a slot for the pid, plus a slot for the
* argument. To keep things simple (aligned with
* bitness-neutral sizing), we store each as a 64-bit
* quantity.
*/
size = 2 * sizeof (uint64_t);
break;
case DTRACEACT_STOP:
case DTRACEACT_BREAKPOINT:
case DTRACEACT_PANIC:
break;
case DTRACEACT_CHILL:
case DTRACEACT_DISCARD:
case DTRACEACT_RAISE:
if (dp == NULL)
return (EINVAL);
break;
case DTRACEACT_EXIT:
if (dp == NULL ||
(size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
return (EINVAL);
break;
case DTRACEACT_SPECULATE:
if (ecb->dte_size > sizeof (dtrace_epid_t))
return (EINVAL);
if (dp == NULL)
return (EINVAL);
state->dts_speculates = 1;
break;
case DTRACEACT_PRINTM:
size = dp->dtdo_rtype.dtdt_size;
break;
case DTRACEACT_PRINTT:
size = dp->dtdo_rtype.dtdt_size;
break;
case DTRACEACT_COMMIT: {
dtrace_action_t *act = ecb->dte_action;
for (; act != NULL; act = act->dta_next) {
if (act->dta_kind == DTRACEACT_COMMIT)
return (EINVAL);
}
if (dp == NULL)
return (EINVAL);
break;
}
default:
return (EINVAL);
}
if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
/*
* If this is a data-storing action or a speculate,
* we must be sure that there isn't a commit on the
* action chain.
*/
dtrace_action_t *act = ecb->dte_action;
for (; act != NULL; act = act->dta_next) {
if (act->dta_kind == DTRACEACT_COMMIT)
return (EINVAL);
}
}
action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
action->dta_rec.dtrd_size = size;
}
action->dta_refcnt = 1;
rec = &action->dta_rec;
size = rec->dtrd_size;
for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
if (!(size & mask)) {
align = mask + 1;
break;
}
}
action->dta_kind = desc->dtad_kind;
if ((action->dta_difo = dp) != NULL)
dtrace_difo_hold(dp);
rec->dtrd_action = action->dta_kind;
rec->dtrd_arg = arg;
rec->dtrd_uarg = desc->dtad_uarg;
rec->dtrd_alignment = (uint16_t)align;
rec->dtrd_format = format;
if ((last = ecb->dte_action_last) != NULL) {
ASSERT(ecb->dte_action != NULL);
action->dta_prev = last;
last->dta_next = action;
} else {
ASSERT(ecb->dte_action == NULL);
ecb->dte_action = action;
}
ecb->dte_action_last = action;
return (0);
}
static void
dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
{
dtrace_action_t *act = ecb->dte_action, *next;
dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
dtrace_difo_t *dp;
uint16_t format;
if (act != NULL && act->dta_refcnt > 1) {
ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
act->dta_refcnt--;
} else {
for (; act != NULL; act = next) {
next = act->dta_next;
ASSERT(next != NULL || act == ecb->dte_action_last);
ASSERT(act->dta_refcnt == 1);
if ((format = act->dta_rec.dtrd_format) != 0)
dtrace_format_remove(ecb->dte_state, format);
if ((dp = act->dta_difo) != NULL)
dtrace_difo_release(dp, vstate);
if (DTRACEACT_ISAGG(act->dta_kind)) {
dtrace_ecb_aggregation_destroy(ecb, act);
} else {
kmem_free(act, sizeof (dtrace_action_t));
}
}
}
ecb->dte_action = NULL;
ecb->dte_action_last = NULL;
ecb->dte_size = sizeof (dtrace_epid_t);
}
static void
dtrace_ecb_disable(dtrace_ecb_t *ecb)
{
/*
* We disable the ECB by removing it from its probe.
*/
dtrace_ecb_t *pecb, *prev = NULL;
dtrace_probe_t *probe = ecb->dte_probe;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (probe == NULL) {
/*
* This is the NULL probe; there is nothing to disable.
*/
return;
}
for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
if (pecb == ecb)
break;
prev = pecb;
}
ASSERT(pecb != NULL);
if (prev == NULL) {
probe->dtpr_ecb = ecb->dte_next;
} else {
prev->dte_next = ecb->dte_next;
}
if (ecb == probe->dtpr_ecb_last) {
ASSERT(ecb->dte_next == NULL);
probe->dtpr_ecb_last = prev;
}
/*
* The ECB has been disconnected from the probe; now sync to assure
* that all CPUs have seen the change before returning.
*/
dtrace_sync();
if (probe->dtpr_ecb == NULL) {
/*
* That was the last ECB on the probe; clear the predicate
* cache ID for the probe, disable it and sync one more time
* to assure that we'll never hit it again.
*/
dtrace_provider_t *prov = probe->dtpr_provider;
ASSERT(ecb->dte_next == NULL);
ASSERT(probe->dtpr_ecb_last == NULL);
probe->dtpr_predcache = DTRACE_CACHEIDNONE;
prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg);
dtrace_sync();
} else {
/*
* There is at least one ECB remaining on the probe. If there
* is _exactly_ one, set the probe's predicate cache ID to be
* the predicate cache ID of the remaining ECB.
*/
ASSERT(probe->dtpr_ecb_last != NULL);
ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
ASSERT(probe->dtpr_ecb->dte_next == NULL);
if (p != NULL)
probe->dtpr_predcache = p->dtp_cacheid;
}
ecb->dte_next = NULL;
}
}
static void
dtrace_ecb_destroy(dtrace_ecb_t *ecb)
{
dtrace_state_t *state = ecb->dte_state;
dtrace_vstate_t *vstate = &state->dts_vstate;
dtrace_predicate_t *pred;
dtrace_epid_t epid = ecb->dte_epid;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(ecb->dte_next == NULL);
ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
if ((pred = ecb->dte_predicate) != NULL)
dtrace_predicate_release(pred, vstate);
dtrace_ecb_action_remove(ecb);
ASSERT(state->dts_ecbs[epid - 1] == ecb);
state->dts_ecbs[epid - 1] = NULL;
kmem_free(ecb, sizeof (dtrace_ecb_t));
}
static dtrace_ecb_t *
dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
dtrace_enabling_t *enab)
{
dtrace_ecb_t *ecb;
dtrace_predicate_t *pred;
dtrace_actdesc_t *act;
dtrace_provider_t *prov;
dtrace_ecbdesc_t *desc = enab->dten_current;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(state != NULL);
ecb = dtrace_ecb_add(state, probe);
ecb->dte_uarg = desc->dted_uarg;
if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
dtrace_predicate_hold(pred);
ecb->dte_predicate = pred;
}
if (probe != NULL) {
/*
* If the provider shows more leg than the consumer is old
* enough to see, we need to enable the appropriate implicit
* predicate bits to prevent the ecb from activating at
* revealing times.
*
* Providers specifying DTRACE_PRIV_USER at register time
* are stating that they need the /proc-style privilege
* model to be enforced, and this is what DTRACE_COND_OWNER
* and DTRACE_COND_ZONEOWNER will then do at probe time.
*/
prov = probe->dtpr_provider;
if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
ecb->dte_cond |= DTRACE_COND_OWNER;
if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
/*
* If the provider shows us kernel innards and the user
* is lacking sufficient privilege, enable the
* DTRACE_COND_USERMODE implicit predicate.
*/
if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
(prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
ecb->dte_cond |= DTRACE_COND_USERMODE;
}
if (dtrace_ecb_create_cache != NULL) {
/*
* If we have a cached ecb, we'll use its action list instead
* of creating our own (saving both time and space).
*/
dtrace_ecb_t *cached = dtrace_ecb_create_cache;
dtrace_action_t *act = cached->dte_action;
if (act != NULL) {
ASSERT(act->dta_refcnt > 0);
act->dta_refcnt++;
ecb->dte_action = act;
ecb->dte_action_last = cached->dte_action_last;
ecb->dte_needed = cached->dte_needed;
ecb->dte_size = cached->dte_size;
ecb->dte_alignment = cached->dte_alignment;
}
return (ecb);
}
for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
dtrace_ecb_destroy(ecb);
return (NULL);
}
}
dtrace_ecb_resize(ecb);
return (dtrace_ecb_create_cache = ecb);
}
static int
dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
{
dtrace_ecb_t *ecb;
dtrace_enabling_t *enab = arg;
dtrace_state_t *state = enab->dten_vstate->dtvs_state;
ASSERT(state != NULL);
if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
/*
* This probe was created in a generation for which this
* enabling has previously created ECBs; we don't want to
* enable it again, so just kick out.
*/
return (DTRACE_MATCH_NEXT);
}
if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
return (DTRACE_MATCH_DONE);
dtrace_ecb_enable(ecb);
return (DTRACE_MATCH_NEXT);
}
static dtrace_ecb_t *
dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
{
dtrace_ecb_t *ecb;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (id == 0 || id > state->dts_necbs)
return (NULL);
ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
return (state->dts_ecbs[id - 1]);
}
static dtrace_aggregation_t *
dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
{
dtrace_aggregation_t *agg;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (id == 0 || id > state->dts_naggregations)
return (NULL);
ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
agg->dtag_id == id);
return (state->dts_aggregations[id - 1]);
}
/*
* DTrace Buffer Functions
*
* The following functions manipulate DTrace buffers. Most of these functions
* are called in the context of establishing or processing consumer state;
* exceptions are explicitly noted.
*/
/*
* Note: called from cross call context. This function switches the two
* buffers on a given CPU. The atomicity of this operation is assured by
* disabling interrupts while the actual switch takes place; the disabling of
* interrupts serializes the execution with any execution of dtrace_probe() on
* the same CPU.
*/
static void
dtrace_buffer_switch(dtrace_buffer_t *buf)
{
caddr_t tomax = buf->dtb_tomax;
caddr_t xamot = buf->dtb_xamot;
dtrace_icookie_t cookie;
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
cookie = dtrace_interrupt_disable();
buf->dtb_tomax = xamot;
buf->dtb_xamot = tomax;
buf->dtb_xamot_drops = buf->dtb_drops;
buf->dtb_xamot_offset = buf->dtb_offset;
buf->dtb_xamot_errors = buf->dtb_errors;
buf->dtb_xamot_flags = buf->dtb_flags;
buf->dtb_offset = 0;
buf->dtb_drops = 0;
buf->dtb_errors = 0;
buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
dtrace_interrupt_enable(cookie);
}
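/*
 * Purely illustrative (not a statement of every caller): the switch above
 * is typically driven from cross call context elsewhere in this file,
 * along the lines of
 *
 *	dtrace_xcall(cpu, (dtrace_xcall_t)dtrace_buffer_switch, buf);
 *
 * so that the swap of dtb_tomax and dtb_xamot runs on the CPU that owns
 * the buffer, with interrupts disabled for the duration of the swap.
 */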
/*
* Note: called from cross call context. This function activates a buffer
* on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
* is guaranteed by the disabling of interrupts.
*/
static void
dtrace_buffer_activate(dtrace_state_t *state)
{
dtrace_buffer_t *buf;
dtrace_icookie_t cookie = dtrace_interrupt_disable();
buf = &state->dts_buffer[curcpu];
if (buf->dtb_tomax != NULL) {
/*
* We might like to assert that the buffer is marked inactive,
* but this isn't necessarily true: the buffer for the CPU
* that processes the BEGIN probe has its buffer activated
* manually. In this case, we take the (harmless) action of
* re-clearing the INACTIVE bit.
*/
buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
}
dtrace_interrupt_enable(cookie);
}
static int
dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
processorid_t cpu)
{
#if defined(sun)
cpu_t *cp;
#endif
dtrace_buffer_t *buf;
#if defined(sun)
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
if (size > dtrace_nonroot_maxsize &&
!PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
return (EFBIG);
cp = cpu_list;
do {
if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
continue;
buf = &bufs[cp->cpu_id];
/*
* If there is already a buffer allocated for this CPU, it
* is only possible that this is a DR event. In this case,
* the buffer size must match our specified size.
*/
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
continue;
}
ASSERT(buf->dtb_xamot == NULL);
if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
buf->dtb_size = size;
buf->dtb_flags = flags;
buf->dtb_offset = 0;
buf->dtb_drops = 0;
if (flags & DTRACEBUF_NOSWITCH)
continue;
if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
} while ((cp = cp->cpu_next) != cpu_list);
return (0);
err:
cp = cpu_list;
do {
if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
continue;
buf = &bufs[cp->cpu_id];
if (buf->dtb_xamot != NULL) {
ASSERT(buf->dtb_tomax != NULL);
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_xamot, size);
}
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_tomax, size);
}
buf->dtb_tomax = NULL;
buf->dtb_xamot = NULL;
buf->dtb_size = 0;
} while ((cp = cp->cpu_next) != cpu_list);
return (ENOMEM);
#else
int i;
#if defined(__amd64__)
/*
* FreeBSD isn't good at limiting the amount of memory we
* ask to malloc, so let's place a limit here before trying
* to do something that might well end in tears at bedtime.
*/
if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
return(ENOMEM);
#endif
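/*
 * A rough illustration of the limit above, with hypothetical numbers:
 * given 4GB of physical memory (physmem * PAGE_SIZE) and 8 CPUs
 * (mp_maxid + 1 == 8), the largest per-CPU buffer accepted here is
 * 4GB / (128 * 8) = 4MB.
 */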
ASSERT(MUTEX_HELD(&dtrace_lock));
CPU_FOREACH(i) {
if (cpu != DTRACE_CPUALL && cpu != i)
continue;
buf = &bufs[i];
/*
* If there is already a buffer allocated for this CPU, it
* is only possible that this is a DR event. In this case,
* the buffer size must match our specified size.
*/
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
continue;
}
ASSERT(buf->dtb_xamot == NULL);
if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
buf->dtb_size = size;
buf->dtb_flags = flags;
buf->dtb_offset = 0;
buf->dtb_drops = 0;
if (flags & DTRACEBUF_NOSWITCH)
continue;
if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
goto err;
}
return (0);
err:
/*
* Error allocating memory, so free the buffers that were
* allocated before the failed allocation.
*/
CPU_FOREACH(i) {
if (cpu != DTRACE_CPUALL && cpu != i)
continue;
buf = &bufs[i];
if (buf->dtb_xamot != NULL) {
ASSERT(buf->dtb_tomax != NULL);
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_xamot, size);
}
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_tomax, size);
}
buf->dtb_tomax = NULL;
buf->dtb_xamot = NULL;
buf->dtb_size = 0;
}
return (ENOMEM);
#endif
}
/*
* Note: called from probe context. This function just increments the drop
* count on a buffer. It has been made a function to allow for the
* possibility of understanding the source of mysterious drop counts. (A
* problem for which one may be particularly disappointed that DTrace cannot
* be used to understand DTrace.)
*/
static void
dtrace_buffer_drop(dtrace_buffer_t *buf)
{
buf->dtb_drops++;
}
/*
* Note: called from probe context. This function is called to reserve space
* in a buffer. If mstate is non-NULL, sets the scratch base and size in the
* mstate. Returns the new offset in the buffer, or a negative value if an
* error has occurred.
*/
static intptr_t
dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
dtrace_state_t *state, dtrace_mstate_t *mstate)
{
intptr_t offs = buf->dtb_offset, soffs;
intptr_t woffs;
caddr_t tomax;
size_t total;
if (buf->dtb_flags & DTRACEBUF_INACTIVE)
return (-1);
if ((tomax = buf->dtb_tomax) == NULL) {
dtrace_buffer_drop(buf);
return (-1);
}
if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
while (offs & (align - 1)) {
/*
* Assert that our alignment is off by a number which
* is itself sizeof (uint32_t) aligned.
*/
ASSERT(!((align - (offs & (align - 1))) &
(sizeof (uint32_t) - 1)));
DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
offs += sizeof (uint32_t);
}
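/*
 * A worked example of the padding loop above (illustrative numbers only):
 * with offs = 12 and align = 8, a single 4-byte DTRACE_EPIDNONE pad word
 * is stored at offset 12 and offs advances to 16, at which point the
 * requested alignment is satisfied.
 */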
if ((soffs = offs + needed) > buf->dtb_size) {
dtrace_buffer_drop(buf);
return (-1);
}
if (mstate == NULL)
return (offs);
mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
mstate->dtms_scratch_size = buf->dtb_size - soffs;
mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
return (offs);
}
if (buf->dtb_flags & DTRACEBUF_FILL) {
if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
(buf->dtb_flags & DTRACEBUF_FULL))
return (-1);
goto out;
}
total = needed + (offs & (align - 1));
/*
* For a ring buffer, life is quite a bit more complicated. Before
* we can store any padding, we need to adjust our wrapping offset.
* (If we've never before wrapped or we're not about to, no adjustment
* is required.)
*/
if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
offs + total > buf->dtb_size) {
woffs = buf->dtb_xamot_offset;
if (offs + total > buf->dtb_size) {
/*
* We can't fit in the end of the buffer. First, a
* sanity check that we can fit in the buffer at all.
*/
if (total > buf->dtb_size) {
dtrace_buffer_drop(buf);
return (-1);
}
/*
* We're going to be storing at the top of the buffer,
* so now we need to deal with the wrapped offset. We
* only reset our wrapped offset to 0 if it is
* currently greater than the current offset. If it
* is less than the current offset, it is because a
* previous allocation induced a wrap -- but the
* allocation didn't subsequently take the space due
* to an error or false predicate evaluation. In this
* case, we'll just leave the wrapped offset alone: if
* the wrapped offset hasn't been advanced far enough
* for this allocation, it will be adjusted in the
* lower loop.
*/
if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
if (woffs >= offs)
woffs = 0;
} else {
woffs = 0;
}
/*
* Now we know that we're going to be storing to the
* top of the buffer and that there is room for us
* there. We need to clear the buffer from the current
* offset to the end (there may be old gunk there).
*/
while (offs < buf->dtb_size)
tomax[offs++] = 0;
/*
* We need to set our offset to zero. And because we
* are wrapping, we need to set the bit indicating as
* much. We can also adjust our needed space back
* down to the space required by the ECB -- we know
* that the top of the buffer is aligned.
*/
offs = 0;
total = needed;
buf->dtb_flags |= DTRACEBUF_WRAPPED;
} else {
/*
* There is room for us in the buffer, so we simply
* need to check the wrapped offset.
*/
if (woffs < offs) {
/*
* The wrapped offset is less than the offset.
* This can happen if we allocated buffer space
* that induced a wrap, but then we didn't
* subsequently take the space due to an error
* or false predicate evaluation. This is
* okay; we know that _this_ allocation isn't
* going to induce a wrap. We still can't
* reset the wrapped offset to be zero,
* however: the space may have been trashed in
* the previous failed probe attempt. But at
* least the wrapped offset doesn't need to
* be adjusted at all...
*/
goto out;
}
}
while (offs + total > woffs) {
dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
size_t size;
if (epid == DTRACE_EPIDNONE) {
size = sizeof (uint32_t);
} else {
ASSERT(epid <= state->dts_necbs);
ASSERT(state->dts_ecbs[epid - 1] != NULL);
size = state->dts_ecbs[epid - 1]->dte_size;
}
ASSERT(woffs + size <= buf->dtb_size);
ASSERT(size != 0);
if (woffs + size == buf->dtb_size) {
/*
* We've reached the end of the buffer; we want
* to set the wrapped offset to 0 and break
* out. However, if the offs is 0, then we're
* in a strange edge-condition: the amount of
* space that we want to reserve plus the size
* of the record that we're overwriting is
* greater than the size of the buffer. This
* is problematic because if we reserve the
* space but subsequently don't consume it (due
* to a failed predicate or error) the wrapped
* offset will be 0 -- yet the EPID at offset 0
* will not be committed. This situation is
* relatively easy to deal with: if we're in
* this case, the buffer is indistinguishable
* from one that hasn't wrapped; we need only
* finish the job by clearing the wrapped bit,
* explicitly setting the offset to be 0, and
* zero'ing out the old data in the buffer.
*/
if (offs == 0) {
buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
buf->dtb_offset = 0;
woffs = total;
while (woffs < buf->dtb_size)
tomax[woffs++] = 0;
}
woffs = 0;
break;
}
woffs += size;
}
/*
* We have a wrapped offset. It may be that the wrapped offset
* has become zero -- that's okay.
*/
buf->dtb_xamot_offset = woffs;
}
out:
/*
* Now we can plow the buffer with any necessary padding.
*/
while (offs & (align - 1)) {
/*
* Assert that our alignment is off by a number which
* is itself sizeof (uint32_t) aligned.
*/
ASSERT(!((align - (offs & (align - 1))) &
(sizeof (uint32_t) - 1)));
DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
offs += sizeof (uint32_t);
}
if (buf->dtb_flags & DTRACEBUF_FILL) {
if (offs + needed > buf->dtb_size - state->dts_reserve) {
buf->dtb_flags |= DTRACEBUF_FULL;
return (-1);
}
}
if (mstate == NULL)
return (offs);
/*
* For ring buffers and fill buffers, the scratch space is always
* the inactive buffer.
*/
mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
mstate->dtms_scratch_size = buf->dtb_size;
mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
return (offs);
}
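/*
 * An illustrative walk through the ring-buffer path above, with
 * hypothetical numbers: given dtb_size = 64, offs = 56 and an aligned
 * reservation of total = 16, the record cannot fit at the end of the
 * buffer.  The stale bytes at offsets 56 through 63 are zeroed, offs is
 * reset to 0 and DTRACEBUF_WRAPPED is set; records at the top of the
 * buffer are then consumed until the wrapped offset is at least the size
 * of the new reservation, and the resulting wrapped offset is recorded
 * in dtb_xamot_offset.
 */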
static void
dtrace_buffer_polish(dtrace_buffer_t *buf)
{
ASSERT(buf->dtb_flags & DTRACEBUF_RING);
ASSERT(MUTEX_HELD(&dtrace_lock));
if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
return;
/*
* We need to polish the ring buffer. There are three cases:
*
* - The first (and presumably most common) is that there is no gap
* between the buffer offset and the wrapped offset. In this case,
* there is nothing in the buffer that isn't valid data; we can
* mark the buffer as polished and return.
*
* - The second (less common than the first but still more common
* than the third) is that there is a gap between the buffer offset
* and the wrapped offset, and the wrapped offset is larger than the
* buffer offset. This can happen because of an alignment issue, or
* can happen because of a call to dtrace_buffer_reserve() that
* didn't subsequently consume the buffer space. In this case,
* we need to zero the data from the buffer offset to the wrapped
* offset.
*
* - The third (and least common) is that there is a gap between the
* buffer offset and the wrapped offset, but the wrapped offset is
* _less_ than the buffer offset. This can only happen because a
* call to dtrace_buffer_reserve() induced a wrap, but the space
* was not subsequently consumed. In this case, we need to zero the
* space from the offset to the end of the buffer _and_ from the
* top of the buffer to the wrapped offset.
*/
if (buf->dtb_offset < buf->dtb_xamot_offset) {
bzero(buf->dtb_tomax + buf->dtb_offset,
buf->dtb_xamot_offset - buf->dtb_offset);
}
if (buf->dtb_offset > buf->dtb_xamot_offset) {
bzero(buf->dtb_tomax + buf->dtb_offset,
buf->dtb_size - buf->dtb_offset);
bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
}
}
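/*
 * A concrete instance of the third case described above (illustrative
 * numbers): with dtb_size = 64, dtb_offset = 56 and dtb_xamot_offset = 8,
 * the second bzero() clears bytes 56 through 63 and the third clears
 * bytes 0 through 7, leaving only committed records between the wrapped
 * offset and the buffer offset.
 */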
static void
dtrace_buffer_free(dtrace_buffer_t *bufs)
{
int i;
for (i = 0; i < NCPU; i++) {
dtrace_buffer_t *buf = &bufs[i];
if (buf->dtb_tomax == NULL) {
ASSERT(buf->dtb_xamot == NULL);
ASSERT(buf->dtb_size == 0);
continue;
}
if (buf->dtb_xamot != NULL) {
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
kmem_free(buf->dtb_xamot, buf->dtb_size);
}
kmem_free(buf->dtb_tomax, buf->dtb_size);
buf->dtb_size = 0;
buf->dtb_tomax = NULL;
buf->dtb_xamot = NULL;
}
}
/*
* DTrace Enabling Functions
*/
static dtrace_enabling_t *
dtrace_enabling_create(dtrace_vstate_t *vstate)
{
dtrace_enabling_t *enab;
enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
enab->dten_vstate = vstate;
return (enab);
}
static void
dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
{
dtrace_ecbdesc_t **ndesc;
size_t osize, nsize;
/*
* We can't add to enablings after we've enabled them, or after we've
* retained them.
*/
ASSERT(enab->dten_probegen == 0);
ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
if (enab->dten_ndesc < enab->dten_maxdesc) {
enab->dten_desc[enab->dten_ndesc++] = ecb;
return;
}
osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
if (enab->dten_maxdesc == 0) {
enab->dten_maxdesc = 1;
} else {
enab->dten_maxdesc <<= 1;
}
ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
ndesc = kmem_zalloc(nsize, KM_SLEEP);
bcopy(enab->dten_desc, ndesc, osize);
if (enab->dten_desc != NULL)
kmem_free(enab->dten_desc, osize);
enab->dten_desc = ndesc;
enab->dten_desc[enab->dten_ndesc++] = ecb;
}
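/*
 * A note on the growth policy above: dten_maxdesc doubles on each
 * reallocation (1, 2, 4, 8, ...), so adding N ECB descriptions performs
 * O(N) total copy work even though an individual addition occasionally
 * has to bcopy() the entire array into a larger one.
 */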
static void
dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
dtrace_probedesc_t *pd)
{
dtrace_ecbdesc_t *new;
dtrace_predicate_t *pred;
dtrace_actdesc_t *act;
/*
* We're going to create a new ECB description that matches the
* specified ECB in every way, but has the specified probe description.
*/
new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
dtrace_predicate_hold(pred);
for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
dtrace_actdesc_hold(act);
new->dted_action = ecb->dted_action;
new->dted_pred = ecb->dted_pred;
new->dted_probe = *pd;
new->dted_uarg = ecb->dted_uarg;
dtrace_enabling_add(enab, new);
}
static void
dtrace_enabling_dump(dtrace_enabling_t *enab)
{
int i;
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
desc->dtpd_provider, desc->dtpd_mod,
desc->dtpd_func, desc->dtpd_name);
}
}
static void
dtrace_enabling_destroy(dtrace_enabling_t *enab)
{
int i;
dtrace_ecbdesc_t *ep;
dtrace_vstate_t *vstate = enab->dten_vstate;
ASSERT(MUTEX_HELD(&dtrace_lock));
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_actdesc_t *act, *next;
dtrace_predicate_t *pred;
ep = enab->dten_desc[i];
if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
dtrace_predicate_release(pred, vstate);
for (act = ep->dted_action; act != NULL; act = next) {
next = act->dtad_next;
dtrace_actdesc_release(act, vstate);
}
kmem_free(ep, sizeof (dtrace_ecbdesc_t));
}
if (enab->dten_desc != NULL)
kmem_free(enab->dten_desc,
enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
/*
* If this was a retained enabling, decrement the dts_nretained count
* and take it off of the dtrace_retained list.
*/
if (enab->dten_prev != NULL || enab->dten_next != NULL ||
dtrace_retained == enab) {
ASSERT(enab->dten_vstate->dtvs_state != NULL);
ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
enab->dten_vstate->dtvs_state->dts_nretained--;
}
if (enab->dten_prev == NULL) {
if (dtrace_retained == enab) {
dtrace_retained = enab->dten_next;
if (dtrace_retained != NULL)
dtrace_retained->dten_prev = NULL;
}
} else {
ASSERT(enab != dtrace_retained);
ASSERT(dtrace_retained != NULL);
enab->dten_prev->dten_next = enab->dten_next;
}
if (enab->dten_next != NULL) {
ASSERT(dtrace_retained != NULL);
enab->dten_next->dten_prev = enab->dten_prev;
}
kmem_free(enab, sizeof (dtrace_enabling_t));
}
static int
dtrace_enabling_retain(dtrace_enabling_t *enab)
{
dtrace_state_t *state;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
ASSERT(enab->dten_vstate != NULL);
state = enab->dten_vstate->dtvs_state;
ASSERT(state != NULL);
/*
* We only allow each state to retain dtrace_retain_max enablings.
*/
if (state->dts_nretained >= dtrace_retain_max)
return (ENOSPC);
state->dts_nretained++;
if (dtrace_retained == NULL) {
dtrace_retained = enab;
return (0);
}
enab->dten_next = dtrace_retained;
dtrace_retained->dten_prev = enab;
dtrace_retained = enab;
return (0);
}
static int
dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
dtrace_probedesc_t *create)
{
dtrace_enabling_t *new, *enab;
int found = 0, err = ENOENT;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
new = dtrace_enabling_create(&state->dts_vstate);
/*
* Iterate over all retained enablings, looking for enablings that
* match the specified state.
*/
for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
int i;
/*
* dtvs_state can only be NULL for helper enablings -- and
* helper enablings can't be retained.
*/
ASSERT(enab->dten_vstate->dtvs_state != NULL);
if (enab->dten_vstate->dtvs_state != state)
continue;
/*
* Now iterate over each probe description; we're looking for
* an exact match to the specified probe description.
*/
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_ecbdesc_t *ep = enab->dten_desc[i];
dtrace_probedesc_t *pd = &ep->dted_probe;
if (strcmp(pd->dtpd_provider, match->dtpd_provider))
continue;
if (strcmp(pd->dtpd_mod, match->dtpd_mod))
continue;
if (strcmp(pd->dtpd_func, match->dtpd_func))
continue;
if (strcmp(pd->dtpd_name, match->dtpd_name))
continue;
/*
* We have a winning probe! Add it to our growing
* enabling.
*/
found = 1;
dtrace_enabling_addlike(new, ep, create);
}
}
if (!found || (err = dtrace_enabling_retain(new)) != 0) {
dtrace_enabling_destroy(new);
return (err);
}
return (0);
}
static void
dtrace_enabling_retract(dtrace_state_t *state)
{
dtrace_enabling_t *enab, *next;
ASSERT(MUTEX_HELD(&dtrace_lock));
/*
* Iterate over all retained enablings, destroy the enablings retained
* for the specified state.
*/
for (enab = dtrace_retained; enab != NULL; enab = next) {
next = enab->dten_next;
/*
* dtvs_state can only be NULL for helper enablings -- and
* helper enablings can't be retained.
*/
ASSERT(enab->dten_vstate->dtvs_state != NULL);
if (enab->dten_vstate->dtvs_state == state) {
ASSERT(state->dts_nretained > 0);
dtrace_enabling_destroy(enab);
}
}
ASSERT(state->dts_nretained == 0);
}
static int
dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
{
int i = 0;
int matched = 0;
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_ecbdesc_t *ep = enab->dten_desc[i];
enab->dten_current = ep;
enab->dten_error = 0;
matched += dtrace_probe_enable(&ep->dted_probe, enab);
if (enab->dten_error != 0) {
/*
* If we get an error half-way through enabling the
* probes, we kick out -- perhaps with some number of
* them enabled. Leaving enabled probes enabled may
* be slightly confusing for user-level, but we expect
* that no one will attempt to actually drive on in
* the face of such errors. If this is an anonymous
* enabling (indicated with a NULL nmatched pointer),
* we cmn_err() a message. We aren't expecting to
* get such an error -- to the extent that it can exist at all,
* it would be a result of corrupted DOF in the driver
* properties.
*/
if (nmatched == NULL) {
cmn_err(CE_WARN, "dtrace_enabling_match() "
"error on %p: %d", (void *)ep,
enab->dten_error);
}
return (enab->dten_error);
}
}
enab->dten_probegen = dtrace_probegen;
if (nmatched != NULL)
*nmatched = matched;
return (0);
}
static void
dtrace_enabling_matchall(void)
{
dtrace_enabling_t *enab;
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
/*
* Iterate over all retained enablings to see if any probes match
* against them. We only perform this operation on enablings for which
* we have sufficient permissions by virtue of being in the global zone
* or in the same zone as the DTrace client. Because we can be called
* after dtrace_detach() has been called, we cannot assert that there
* are retained enablings. We can safely load from dtrace_retained,
* however: the taskq_destroy() at the end of dtrace_detach() will
* block pending our completion.
*/
for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
#if defined(sun)
cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
if (INGLOBALZONE(curproc) || getzoneid() == crgetzoneid(cr))
#endif
(void) dtrace_enabling_match(enab, NULL);
}
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
}
/*
* If an enabling is to be enabled without having matched probes (that is, if
* dtrace_state_go() is to be called on the underlying dtrace_state_t), the
* enabling must be _primed_ by creating an ECB for every ECB description.
* This must be done to assure that we know the number of speculations, the
* number of aggregations, the minimum buffer size needed, etc. before we
* transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
* enabling any probes, we create ECBs for every ECB description, but with a
* NULL probe -- which is exactly what this function does.
*/
static void
dtrace_enabling_prime(dtrace_state_t *state)
{
dtrace_enabling_t *enab;
int i;
for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
ASSERT(enab->dten_vstate->dtvs_state != NULL);
if (enab->dten_vstate->dtvs_state != state)
continue;
/*
* We don't want to prime an enabling more than once, lest
* we allow a malicious user to induce resource exhaustion.
* (The ECBs that result from priming an enabling aren't
* leaked -- but they also aren't deallocated until the
* consumer state is destroyed.)
*/
if (enab->dten_primed)
continue;
for (i = 0; i < enab->dten_ndesc; i++) {
enab->dten_current = enab->dten_desc[i];
(void) dtrace_probe_enable(NULL, enab);
}
enab->dten_primed = 1;
}
}
/*
* Called to indicate that probes should be provided due to retained
* enablings. This is implemented in terms of dtrace_probe_provide(), but it
* must take an initial lap through the enabling calling the dtps_provide()
* entry point explicitly to allow for autocreated probes.
*/
static void
dtrace_enabling_provide(dtrace_provider_t *prv)
{
int i, all = 0;
dtrace_probedesc_t desc;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
if (prv == NULL) {
all = 1;
prv = dtrace_provider;
}
do {
dtrace_enabling_t *enab = dtrace_retained;
void *parg = prv->dtpv_arg;
for (; enab != NULL; enab = enab->dten_next) {
for (i = 0; i < enab->dten_ndesc; i++) {
desc = enab->dten_desc[i]->dted_probe;
mutex_exit(&dtrace_lock);
prv->dtpv_pops.dtps_provide(parg, &desc);
mutex_enter(&dtrace_lock);
}
}
} while (all && (prv = prv->dtpv_next) != NULL);
mutex_exit(&dtrace_lock);
dtrace_probe_provide(NULL, all ? NULL : prv);
mutex_enter(&dtrace_lock);
}
/*
* DTrace DOF Functions
*/
/*ARGSUSED*/
static void
dtrace_dof_error(dof_hdr_t *dof, const char *str)
{
if (dtrace_err_verbose)
cmn_err(CE_WARN, "failed to process DOF: %s", str);
#ifdef DTRACE_ERRDEBUG
dtrace_errdebug(str);
#endif
}
/*
* Create DOF out of a currently enabled state. Right now, we only create
* DOF containing the run-time options -- but this could be expanded to create
* complete DOF representing the enabled state.
*/
static dof_hdr_t *
dtrace_dof_create(dtrace_state_t *state)
{
dof_hdr_t *dof;
dof_sec_t *sec;
dof_optdesc_t *opt;
int i, len = sizeof (dof_hdr_t) +
roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
ASSERT(MUTEX_HELD(&dtrace_lock));
dof = kmem_zalloc(len, KM_SLEEP);
dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
dof->dofh_flags = 0;
dof->dofh_hdrsize = sizeof (dof_hdr_t);
dof->dofh_secsize = sizeof (dof_sec_t);
dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
dof->dofh_secoff = sizeof (dof_hdr_t);
dof->dofh_loadsz = len;
dof->dofh_filesz = len;
dof->dofh_pad = 0;
/*
* Fill in the option section header...
*/
sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
sec->dofs_type = DOF_SECT_OPTDESC;
sec->dofs_align = sizeof (uint64_t);
sec->dofs_flags = DOF_SECF_LOAD;
sec->dofs_entsize = sizeof (dof_optdesc_t);
opt = (dof_optdesc_t *)((uintptr_t)sec +
roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
for (i = 0; i < DTRACEOPT_MAX; i++) {
opt[i].dofo_option = i;
opt[i].dofo_strtab = DOF_SECIDX_NONE;
opt[i].dofo_value = state->dts_options[i];
}
return (dof);
}
static dof_hdr_t *
dtrace_dof_copyin(uintptr_t uarg, int *errp)
{
dof_hdr_t hdr, *dof;
ASSERT(!MUTEX_HELD(&dtrace_lock));
/*
* First, we're going to copyin() the sizeof (dof_hdr_t).
*/
if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
dtrace_dof_error(NULL, "failed to copyin DOF header");
*errp = EFAULT;
return (NULL);
}
/*
* Now we'll allocate the entire DOF and copy it in -- provided
* that the length isn't outrageous.
*/
if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
dtrace_dof_error(&hdr, "load size exceeds maximum");
*errp = E2BIG;
return (NULL);
}
if (hdr.dofh_loadsz < sizeof (hdr)) {
dtrace_dof_error(&hdr, "invalid load size");
*errp = EINVAL;
return (NULL);
}
dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) {
kmem_free(dof, hdr.dofh_loadsz);
*errp = EFAULT;
return (NULL);
}
return (dof);
}
#if !defined(sun)
static __inline uchar_t
dtrace_dof_char(char c) {
switch (c) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
return (c - '0');
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
return (c - 'A' + 10);
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
return (c - 'a' + 10);
}
/* Should not reach here. */
return (0);
}
#endif
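/*
 * dtrace_dof_char() is used below to decode DOF that has been stashed in
 * the kernel environment as a hex string.  As a worked example with a
 * hypothetical input, the character pair "7f" decodes to
 *
 *	(dtrace_dof_char('7') << 4) | dtrace_dof_char('f') == 0x7f
 *
 * so an environment value of 2 * len characters yields len bytes of DOF.
 */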
static dof_hdr_t *
dtrace_dof_property(const char *name)
{
uchar_t *buf;
uint64_t loadsz;
unsigned int len, i;
dof_hdr_t *dof;
#if defined(sun)
/*
* Unfortunately, arrays of values in .conf files are always (and
* only) interpreted to be integer arrays. We must read our DOF
* as an integer array, and then squeeze it into a byte array.
*/
if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
(char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
return (NULL);
for (i = 0; i < len; i++)
buf[i] = (uchar_t)(((int *)buf)[i]);
if (len < sizeof (dof_hdr_t)) {
ddi_prop_free(buf);
dtrace_dof_error(NULL, "truncated header");
return (NULL);
}
if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
ddi_prop_free(buf);
dtrace_dof_error(NULL, "truncated DOF");
return (NULL);
}
if (loadsz >= dtrace_dof_maxsize) {
ddi_prop_free(buf);
dtrace_dof_error(NULL, "oversized DOF");
return (NULL);
}
dof = kmem_alloc(loadsz, KM_SLEEP);
bcopy(buf, dof, loadsz);
ddi_prop_free(buf);
#else
char *p;
char *p_env;
if ((p_env = getenv(name)) == NULL)
return (NULL);
len = strlen(p_env) / 2;
buf = kmem_alloc(len, KM_SLEEP);
dof = (dof_hdr_t *) buf;
p = p_env;
for (i = 0; i < len; i++) {
buf[i] = (dtrace_dof_char(p[0]) << 4) |
dtrace_dof_char(p[1]);
p += 2;
}
freeenv(p_env);
if (len < sizeof (dof_hdr_t)) {
kmem_free(buf, 0);
dtrace_dof_error(NULL, "truncated header");
return (NULL);
}
if (len < (loadsz = dof->dofh_loadsz)) {
kmem_free(buf, 0);
dtrace_dof_error(NULL, "truncated DOF");
return (NULL);
}
if (loadsz >= dtrace_dof_maxsize) {
kmem_free(buf, 0);
dtrace_dof_error(NULL, "oversized DOF");
return (NULL);
}
#endif
return (dof);
}
static void
dtrace_dof_destroy(dof_hdr_t *dof)
{
kmem_free(dof, dof->dofh_loadsz);
}
/*
* Return the dof_sec_t pointer corresponding to a given section index. If the
* index is not valid, dtrace_dof_error() is called and NULL is returned. If
* a type other than DOF_SECT_NONE is specified, the header is checked against
* this type and NULL is returned if the types do not match.
*/
static dof_sec_t *
dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
{
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
if (i >= dof->dofh_secnum) {
dtrace_dof_error(dof, "referenced section index is invalid");
return (NULL);
}
if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
dtrace_dof_error(dof, "referenced section is not loadable");
return (NULL);
}
if (type != DOF_SECT_NONE && type != sec->dofs_type) {
dtrace_dof_error(dof, "referenced section is the wrong type");
return (NULL);
}
return (sec);
}
static dtrace_probedesc_t *
dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
{
dof_probedesc_t *probe;
dof_sec_t *strtab;
uintptr_t daddr = (uintptr_t)dof;
uintptr_t str;
size_t size;
if (sec->dofs_type != DOF_SECT_PROBEDESC) {
dtrace_dof_error(dof, "invalid probe section");
return (NULL);
}
if (sec->dofs_align != sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "bad alignment in probe description");
return (NULL);
}
if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
dtrace_dof_error(dof, "truncated probe description");
return (NULL);
}
probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
if (strtab == NULL)
return (NULL);
str = daddr + strtab->dofs_offset;
size = strtab->dofs_size;
if (probe->dofp_provider >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe provider");
return (NULL);
}
(void) strncpy(desc->dtpd_provider,
(char *)(str + probe->dofp_provider),
MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
if (probe->dofp_mod >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe module");
return (NULL);
}
(void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
if (probe->dofp_func >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe function");
return (NULL);
}
(void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
if (probe->dofp_name >= strtab->dofs_size) {
dtrace_dof_error(dof, "corrupt probe name");
return (NULL);
}
(void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
return (desc);
}
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_difo_t *dp;
size_t ttl = 0;
dof_difohdr_t *dofd;
uintptr_t daddr = (uintptr_t)dof;
size_t max = dtrace_difo_maxsize;
int i, l, n;
static const struct {
int section;
int bufoffs;
int lenoffs;
int entsize;
int align;
const char *msg;
} difo[] = {
{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
sizeof (dif_instr_t), "multiple DIF sections" },
{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
sizeof (uint64_t), "multiple integer tables" },
{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
offsetof(dtrace_difo_t, dtdo_strlen), 0,
sizeof (char), "multiple string tables" },
{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
sizeof (uint_t), "multiple variable tables" },
{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
};
if (sec->dofs_type != DOF_SECT_DIFOHDR) {
dtrace_dof_error(dof, "invalid DIFO header section");
return (NULL);
}
if (sec->dofs_align != sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "bad alignment in DIFO header");
return (NULL);
}
if (sec->dofs_size < sizeof (dof_difohdr_t) ||
sec->dofs_size % sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "bad size in DIFO header");
return (NULL);
}
dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
dp->dtdo_rtype = dofd->dofd_rtype;
for (l = 0; l < n; l++) {
dof_sec_t *subsec;
void **bufp;
uint32_t *lenp;
if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
dofd->dofd_links[l])) == NULL)
goto err; /* invalid section link */
if (ttl + subsec->dofs_size > max) {
dtrace_dof_error(dof, "exceeds maximum size");
goto err;
}
ttl += subsec->dofs_size;
for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
if (subsec->dofs_type != difo[i].section)
continue;
if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
dtrace_dof_error(dof, "section not loaded");
goto err;
}
if (subsec->dofs_align != difo[i].align) {
dtrace_dof_error(dof, "bad alignment");
goto err;
}
bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
if (*bufp != NULL) {
dtrace_dof_error(dof, difo[i].msg);
goto err;
}
if (difo[i].entsize != subsec->dofs_entsize) {
dtrace_dof_error(dof, "entry size mismatch");
goto err;
}
if (subsec->dofs_entsize != 0 &&
(subsec->dofs_size % subsec->dofs_entsize) != 0) {
dtrace_dof_error(dof, "corrupt entry size");
goto err;
}
*lenp = subsec->dofs_size;
*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
*bufp, subsec->dofs_size);
if (subsec->dofs_entsize != 0)
*lenp /= subsec->dofs_entsize;
break;
}
/*
* If we encounter a loadable DIFO sub-section that is not
* known to us, assume this is a broken program and fail.
*/
if (difo[i].section == DOF_SECT_NONE &&
(subsec->dofs_flags & DOF_SECF_LOAD)) {
dtrace_dof_error(dof, "unrecognized DIFO subsection");
goto err;
}
}
if (dp->dtdo_buf == NULL) {
/*
* We can't have a DIF object without DIF text.
*/
dtrace_dof_error(dof, "missing DIF text");
goto err;
}
/*
* Before we validate the DIF object, run through the variable table
* looking for the strings -- if any of their sizes are unset, we'll set
* their size to be the system-wide default string size. Note that
* this should _not_ happen if the "strsize" option has been set --
* in this case, the compiler should have set the size to reflect the
* setting of the option.
*/
for (i = 0; i < dp->dtdo_varlen; i++) {
dtrace_difv_t *v = &dp->dtdo_vartab[i];
dtrace_diftype_t *t = &v->dtdv_type;
if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
continue;
if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
t->dtdt_size = dtrace_strsize_default;
}
if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
goto err;
dtrace_difo_init(dp, vstate);
return (dp);
err:
kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
kmem_free(dp, sizeof (dtrace_difo_t));
return (NULL);
}
static dtrace_predicate_t *
dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_difo_t *dp;
if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
return (NULL);
return (dtrace_predicate_create(dp));
}
static dtrace_actdesc_t *
dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
dof_actdesc_t *desc;
dof_sec_t *difosec;
size_t offs;
uintptr_t daddr = (uintptr_t)dof;
uint64_t arg;
dtrace_actkind_t kind;
if (sec->dofs_type != DOF_SECT_ACTDESC) {
dtrace_dof_error(dof, "invalid action section");
return (NULL);
}
if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
dtrace_dof_error(dof, "truncated action description");
return (NULL);
}
if (sec->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "bad alignment in action description");
return (NULL);
}
if (sec->dofs_size < sec->dofs_entsize) {
dtrace_dof_error(dof, "section entry size exceeds total size");
return (NULL);
}
if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
dtrace_dof_error(dof, "bad entry size in action description");
return (NULL);
}
if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
return (NULL);
}
for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
desc = (dof_actdesc_t *)(daddr +
(uintptr_t)sec->dofs_offset + offs);
kind = (dtrace_actkind_t)desc->dofa_kind;
if (DTRACEACT_ISPRINTFLIKE(kind) &&
(kind != DTRACEACT_PRINTA ||
desc->dofa_strtab != DOF_SECIDX_NONE)) {
dof_sec_t *strtab;
char *str, *fmt;
uint64_t i;
/*
* printf()-like actions must have a format string.
*/
if ((strtab = dtrace_dof_sect(dof,
DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
goto err;
str = (char *)((uintptr_t)dof +
(uintptr_t)strtab->dofs_offset);
for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
if (str[i] == '\0')
break;
}
if (i >= strtab->dofs_size) {
dtrace_dof_error(dof, "bogus format string");
goto err;
}
if (i == desc->dofa_arg) {
dtrace_dof_error(dof, "empty format string");
goto err;
}
i -= desc->dofa_arg;
fmt = kmem_alloc(i + 1, KM_SLEEP);
bcopy(&str[desc->dofa_arg], fmt, i + 1);
arg = (uint64_t)(uintptr_t)fmt;
} else {
if (kind == DTRACEACT_PRINTA) {
ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
arg = 0;
} else {
arg = desc->dofa_arg;
}
}
act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
desc->dofa_uarg, arg);
if (last != NULL) {
last->dtad_next = act;
} else {
first = act;
}
last = act;
if (desc->dofa_difo == DOF_SECIDX_NONE)
continue;
if ((difosec = dtrace_dof_sect(dof,
DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
goto err;
act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
if (act->dtad_difo == NULL)
goto err;
}
ASSERT(first != NULL);
return (first);
err:
for (act = first; act != NULL; act = next) {
next = act->dtad_next;
dtrace_actdesc_release(act, vstate);
}
return (NULL);
}
static dtrace_ecbdesc_t *
dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
cred_t *cr)
{
dtrace_ecbdesc_t *ep;
dof_ecbdesc_t *ecb;
dtrace_probedesc_t *desc;
dtrace_predicate_t *pred = NULL;
if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
dtrace_dof_error(dof, "truncated ECB description");
return (NULL);
}
if (sec->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "bad alignment in ECB description");
return (NULL);
}
ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
if (sec == NULL)
return (NULL);
ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
ep->dted_uarg = ecb->dofe_uarg;
desc = &ep->dted_probe;
if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
goto err;
if (ecb->dofe_pred != DOF_SECIDX_NONE) {
if ((sec = dtrace_dof_sect(dof,
DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
goto err;
if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
goto err;
ep->dted_pred.dtpdd_predicate = pred;
}
if (ecb->dofe_actions != DOF_SECIDX_NONE) {
if ((sec = dtrace_dof_sect(dof,
DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
goto err;
ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
if (ep->dted_action == NULL)
goto err;
}
return (ep);
err:
if (pred != NULL)
dtrace_predicate_release(pred, vstate);
kmem_free(ep, sizeof (dtrace_ecbdesc_t));
return (NULL);
}
/*
* Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
* specified DOF. At present, this amounts to simply adding 'ubase' to the
* site of any user SETX relocations to account for the load object base address.
* In the future, if we need other relocations, this function can be extended.
*/
static int
dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
{
uintptr_t daddr = (uintptr_t)dof;
dof_relohdr_t *dofr =
(dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
dof_sec_t *ss, *rs, *ts;
dof_relodesc_t *r;
uint_t i, n;
if (sec->dofs_size < sizeof (dof_relohdr_t) ||
sec->dofs_align != sizeof (dof_secidx_t)) {
dtrace_dof_error(dof, "invalid relocation header");
return (-1);
}
ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
if (ss == NULL || rs == NULL || ts == NULL)
return (-1); /* dtrace_dof_error() has been called already */
if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
rs->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "invalid relocation section");
return (-1);
}
r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
n = rs->dofs_size / rs->dofs_entsize;
for (i = 0; i < n; i++) {
uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
switch (r->dofr_type) {
case DOF_RELO_NONE:
break;
case DOF_RELO_SETX:
if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
sizeof (uint64_t) > ts->dofs_size) {
dtrace_dof_error(dof, "bad relocation offset");
return (-1);
}
if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
dtrace_dof_error(dof, "misaligned setx relo");
return (-1);
}
*(uint64_t *)taddr += ubase;
break;
default:
dtrace_dof_error(dof, "invalid relocation type");
return (-1);
}
r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
}
return (0);
}
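/*
 * For illustration, with hypothetical values: if the load object carrying
 * the DOF is mapped at ubase = 0x800000000 and a DOF_RELO_SETX entry
 * targets offset 0x10 of the target section, the 64-bit value stored at
 * that offset simply has 0x800000000 added to it, converting a link-time
 * offset into a run-time address.
 */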
/*
* The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
* header: it should be at the front of a memory region that is at least
* sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
* size. It need not be validated in any other way.
*/
static int
dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
{
uint64_t len = dof->dofh_loadsz, seclen;
uintptr_t daddr = (uintptr_t)dof;
dtrace_ecbdesc_t *ep;
dtrace_enabling_t *enab;
uint_t i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
/*
* Check the DOF header identification bytes. In addition to checking
* valid settings, we also verify that unused bits/bytes are zeroed so
* we can use them later without fear of regressing existing binaries.
*/
if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
dtrace_dof_error(dof, "DOF magic string mismatch");
return (-1);
}
if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
dtrace_dof_error(dof, "DOF has invalid data model");
return (-1);
}
if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
dtrace_dof_error(dof, "DOF encoding mismatch");
return (-1);
}
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
dtrace_dof_error(dof, "DOF version mismatch");
return (-1);
}
if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
dtrace_dof_error(dof, "DOF uses unsupported instruction set");
return (-1);
}
if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
dtrace_dof_error(dof, "DOF uses too many integer registers");
return (-1);
}
if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
dtrace_dof_error(dof, "DOF uses too many tuple registers");
return (-1);
}
for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
if (dof->dofh_ident[i] != 0) {
dtrace_dof_error(dof, "DOF has invalid ident byte set");
return (-1);
}
}
if (dof->dofh_flags & ~DOF_FL_VALID) {
dtrace_dof_error(dof, "DOF has invalid flag bits set");
return (-1);
}
if (dof->dofh_secsize == 0) {
dtrace_dof_error(dof, "zero section header size");
return (-1);
}
/*
* Check that the section headers don't exceed the amount of DOF
* data. Note that we cast the section size and number of sections
* to uint64_t's to prevent possible overflow in the multiplication.
*/
seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
if (dof->dofh_secoff > len || seclen > len ||
dof->dofh_secoff + seclen > len) {
dtrace_dof_error(dof, "truncated section headers");
return (-1);
}
if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
dtrace_dof_error(dof, "misaligned section headers");
return (-1);
}
if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
dtrace_dof_error(dof, "misaligned section size");
return (-1);
}
/*
* Take an initial pass through the section headers to be sure that
* the headers don't have stray offsets. If the 'noprobes' flag is
* set, do not permit sections relating to providers, probes, or args.
*/
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(daddr +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (noprobes) {
switch (sec->dofs_type) {
case DOF_SECT_PROVIDER:
case DOF_SECT_PROBES:
case DOF_SECT_PRARGS:
case DOF_SECT_PROFFS:
dtrace_dof_error(dof, "illegal sections "
"for enabling");
return (-1);
}
}
if (!(sec->dofs_flags & DOF_SECF_LOAD))
continue; /* just ignore non-loadable sections */
if (sec->dofs_align & (sec->dofs_align - 1)) {
dtrace_dof_error(dof, "bad section alignment");
return (-1);
}
if (sec->dofs_offset & (sec->dofs_align - 1)) {
dtrace_dof_error(dof, "misaligned section");
return (-1);
}
if (sec->dofs_offset > len || sec->dofs_size > len ||
sec->dofs_offset + sec->dofs_size > len) {
dtrace_dof_error(dof, "corrupt section header");
return (-1);
}
if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
sec->dofs_offset + sec->dofs_size - 1) != '\0') {
dtrace_dof_error(dof, "non-terminating string table");
return (-1);
}
}
/*
* Take a second pass through the sections and locate and perform any
* relocations that are present. We do this after the first pass to
* be sure that all sections have had their headers validated.
*/
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(daddr +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (!(sec->dofs_flags & DOF_SECF_LOAD))
continue; /* skip sections that are not loadable */
switch (sec->dofs_type) {
case DOF_SECT_URELHDR:
if (dtrace_dof_relocate(dof, sec, ubase) != 0)
return (-1);
break;
}
}
if ((enab = *enabp) == NULL)
enab = *enabp = dtrace_enabling_create(vstate);
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(daddr +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_ECBDESC)
continue;
if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
dtrace_enabling_destroy(enab);
*enabp = NULL;
return (-1);
}
dtrace_enabling_add(enab, ep);
}
return (0);
}
/*
* Process DOF for any options. This routine assumes that the DOF has been
* at least processed by dtrace_dof_slurp().
*/
static int
dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
{
int i, rval;
uint32_t entsize;
size_t offs;
dof_optdesc_t *desc;
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
(uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_OPTDESC)
continue;
if (sec->dofs_align != sizeof (uint64_t)) {
dtrace_dof_error(dof, "bad alignment in "
"option description");
return (EINVAL);
}
if ((entsize = sec->dofs_entsize) == 0) {
dtrace_dof_error(dof, "zeroed option entry size");
return (EINVAL);
}
if (entsize < sizeof (dof_optdesc_t)) {
dtrace_dof_error(dof, "bad option entry size");
return (EINVAL);
}
for (offs = 0; offs < sec->dofs_size; offs += entsize) {
desc = (dof_optdesc_t *)((uintptr_t)dof +
(uintptr_t)sec->dofs_offset + offs);
if (desc->dofo_strtab != DOF_SECIDX_NONE) {
dtrace_dof_error(dof, "non-zero option string");
return (EINVAL);
}
if (desc->dofo_value == DTRACEOPT_UNSET) {
dtrace_dof_error(dof, "unset option");
return (EINVAL);
}
if ((rval = dtrace_state_option(state,
desc->dofo_option, desc->dofo_value)) != 0) {
dtrace_dof_error(dof, "rejected option");
return (rval);
}
}
}
return (0);
}
/*
* DTrace Consumer State Functions
*/
static int
dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
{
size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
void *base;
uintptr_t limit;
dtrace_dynvar_t *dvar, *next, *start;
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
bzero(dstate, sizeof (dtrace_dstate_t));
if ((dstate->dtds_chunksize = chunksize) == 0)
dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
size = min;
if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
return (ENOMEM);
dstate->dtds_size = size;
dstate->dtds_base = base;
dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
if (hashsize != 1 && (hashsize & 1))
hashsize--;
dstate->dtds_hashsize = hashsize;
dstate->dtds_hash = dstate->dtds_base;
/*
* Set all of our hash buckets to point to the single sink, and (if
* it hasn't already been set), set the sink's hash value to be the
* sink sentinel value. The sink is needed for dynamic variable
* lookups to know that they have iterated over an entire, valid hash
* chain.
*/
for (i = 0; i < hashsize; i++)
dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
/*
* Determine number of active CPUs. Divide free list evenly among
* active CPUs.
*/
start = (dtrace_dynvar_t *)
((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
limit = (uintptr_t)base + size;
maxper = (limit - (uintptr_t)start) / NCPU;
maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
#if !defined(sun)
CPU_FOREACH(i) {
#else
for (i = 0; i < NCPU; i++) {
#endif
dstate->dtds_percpu[i].dtdsc_free = dvar = start;
/*
* If we don't even have enough chunks to make it once through
* NCPUs, we're just going to allocate everything to the first
* CPU. And if we're on the last CPU, we're going to allocate
* whatever is left over. In either case, we set the limit to
* be the limit of the dynamic variable space.
*/
if (maxper == 0 || i == NCPU - 1) {
limit = (uintptr_t)base + size;
start = NULL;
} else {
limit = (uintptr_t)start + maxper;
start = (dtrace_dynvar_t *)limit;
}
ASSERT(limit <= (uintptr_t)base + size);
for (;;) {
next = (dtrace_dynvar_t *)((uintptr_t)dvar +
dstate->dtds_chunksize);
if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
break;
dvar->dtdv_next = next;
dvar = next;
}
if (maxper == 0)
break;
}
return (0);
}
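/*
 * A sizing sketch for the arena carved up above, with purely hypothetical
 * numbers: assuming size = 64KB, dtds_chunksize = 256 and
 * sizeof (dtrace_dynhash_t) == 16, hashsize starts at 65536 / 272 = 240
 * and is decremented to 239 to keep it odd.  The space following the hash
 * table is then split into per-CPU free lists of dtds_chunksize-sized
 * dynamic variable chunks, with any remainder going to the last CPU.
 */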
static void
dtrace_dstate_fini(dtrace_dstate_t *dstate)
{
ASSERT(MUTEX_HELD(&cpu_lock));
if (dstate->dtds_base == NULL)
return;
kmem_free(dstate->dtds_base, dstate->dtds_size);
kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
}
static void
dtrace_vstate_fini(dtrace_vstate_t *vstate)
{
/*
* Logical XOR, where are you?
*/
ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
if (vstate->dtvs_nglobals > 0) {
kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
sizeof (dtrace_statvar_t *));
}
if (vstate->dtvs_ntlocals > 0) {
kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
sizeof (dtrace_difv_t));
}
ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
if (vstate->dtvs_nlocals > 0) {
kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
sizeof (dtrace_statvar_t *));
}
}
#if defined(sun)
static void
dtrace_state_clean(dtrace_state_t *state)
{
if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
return;
dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
dtrace_speculation_clean(state);
}
static void
dtrace_state_deadman(dtrace_state_t *state)
{
hrtime_t now;
dtrace_sync();
now = dtrace_gethrtime();
if (state != dtrace_anon.dta_state &&
now - state->dts_laststatus >= dtrace_deadman_user)
return;
/*
* We must be sure that dts_alive never appears to be less than the
* value upon entry to dtrace_state_deadman(), and because we lack a
* dtrace_cas64(), we cannot store to it atomically. We thus instead
* store INT64_MAX to it, followed by a memory barrier, followed by
* the new value. This assures that dts_alive never appears to be
* less than its true value, regardless of the order in which the
* stores to the underlying storage are issued.
*/
state->dts_alive = INT64_MAX;
dtrace_membar_producer();
state->dts_alive = now;
}
#else
static void
dtrace_state_clean(void *arg)
{
dtrace_state_t *state = arg;
dtrace_optval_t *opt = state->dts_options;
if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
return;
dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
dtrace_speculation_clean(state);
callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
dtrace_state_clean, state);
}
static void
dtrace_state_deadman(void *arg)
{
dtrace_state_t *state = arg;
hrtime_t now;
dtrace_sync();
dtrace_debug_output();
now = dtrace_gethrtime();
if (state != dtrace_anon.dta_state &&
now - state->dts_laststatus >= dtrace_deadman_user)
return;
/*
* We must be sure that dts_alive never appears to be less than the
* value upon entry to dtrace_state_deadman(), and because we lack a
* dtrace_cas64(), we cannot store to it atomically. We thus instead
* store INT64_MAX to it, followed by a memory barrier, followed by
* the new value. This assures that dts_alive never appears to be
* less than its true value, regardless of the order in which the
* stores to the underlying storage are issued.
*/
state->dts_alive = INT64_MAX;
dtrace_membar_producer();
state->dts_alive = now;
callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
dtrace_state_deadman, state);
}
#endif
static dtrace_state_t *
#if defined(sun)
dtrace_state_create(dev_t *devp, cred_t *cr)
#else
dtrace_state_create(struct cdev *dev)
#endif
{
#if defined(sun)
minor_t minor;
major_t major;
#else
cred_t *cr = NULL;
int m = 0;
#endif
char c[30];
dtrace_state_t *state;
dtrace_optval_t *opt;
int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
#if defined(sun)
minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
VM_BESTFIT | VM_SLEEP);
if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
return (NULL);
}
state = ddi_get_soft_state(dtrace_softstate, minor);
#else
if (dev != NULL) {
cr = dev->si_cred;
m = dev2unit(dev);
}
/* Allocate memory for the state. */
state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
#endif
state->dts_epid = DTRACE_EPIDNONE + 1;
(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
#if defined(sun)
state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
if (devp != NULL) {
major = getemajor(*devp);
} else {
major = ddi_driver_major(dtrace_devi);
}
state->dts_dev = makedevice(major, minor);
if (devp != NULL)
*devp = state->dts_dev;
#else
state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
state->dts_dev = dev;
#endif
/*
* We allocate NCPU buffers. On the one hand, this can be quite
* a bit of memory per instance (nearly 36K on a Starcat). On the
* other hand, it saves an additional memory reference in the probe
* path.
*/
state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
#if defined(sun)
state->dts_cleaner = CYCLIC_NONE;
state->dts_deadman = CYCLIC_NONE;
#else
callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
#endif
state->dts_vstate.dtvs_state = state;
for (i = 0; i < DTRACEOPT_MAX; i++)
state->dts_options[i] = DTRACEOPT_UNSET;
/*
* Set the default options.
*/
opt = state->dts_options;
opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
/*
* Depending on the user credentials, we set flag bits which alter probe
* visibility or the amount of destructiveness allowed. In the case of
* actual anonymous tracing, or the possession of all privileges, all of
* the normal checks are bypassed.
*/
if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
state->dts_cred.dcr_action = DTRACE_CRA_ALL;
} else {
/*
* Set up the credentials for this instantiation. We take a
* hold on the credential to prevent it from disappearing on
* us; this in turn prevents the zone_t referenced by this
* credential from disappearing. This means that we can
* examine the credential and the zone from probe context.
*/
crhold(cr);
state->dts_cred.dcr_cred = cr;
/*
* CRA_PROC means "we have *some* privilege for dtrace" and
* unlocks the use of variables like pid, zonename, etc.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
}
/*
* dtrace_user allows use of syscall and profile providers.
* If the user also has proc_owner and/or proc_zone, we
* extend the scope to include additional visibility and
* destructive power.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
state->dts_cred.dcr_visible |=
DTRACE_CRV_ALLPROC;
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
}
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
state->dts_cred.dcr_visible |=
DTRACE_CRV_ALLZONE;
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
}
/*
* If we have all privs in whatever zone this is,
* we can do destructive things to processes which
* have altered credentials.
*/
#if defined(sun)
if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
cr->cr_zone->zone_privset)) {
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
}
#endif
}
/*
* Holding the dtrace_kernel privilege also implies that
* the user has the dtrace_user privilege from a visibility
* perspective. But without further privileges, some
* destructive actions are not available.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
/*
* Make all probes in all zones visible. However,
* this doesn't mean that all actions become available
* to all zones.
*/
state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
DTRACE_CRA_PROC;
/*
* Holding proc_owner means that destructive actions
* for *this* zone are allowed.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
/*
* Holding proc_zone means that destructive actions
* for this user/group ID in all zones is allowed.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
#if defined(sun)
/*
* If we have all privs in whatever zone this is,
* we can do destructive things to processes which
* have altered credentials.
*/
if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
cr->cr_zone->zone_privset)) {
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
}
#endif
}
/*
* Holding the dtrace_proc privilege gives control over fasttrap
* and pid providers. We need to grant wider destructive
* privileges in the event that the user has proc_owner and/or
* proc_zone.
*/
if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
}
}
return (state);
}
static int
dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
{
dtrace_optval_t *opt = state->dts_options, size;
processorid_t cpu = 0;
int flags = 0, rval;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(which < DTRACEOPT_MAX);
ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
(state == dtrace_anon.dta_state &&
state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
return (0);
if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
cpu = opt[DTRACEOPT_CPU];
if (which == DTRACEOPT_SPECSIZE)
flags |= DTRACEBUF_NOSWITCH;
if (which == DTRACEOPT_BUFSIZE) {
if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
flags |= DTRACEBUF_RING;
if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
flags |= DTRACEBUF_FILL;
if (state != dtrace_anon.dta_state ||
state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
flags |= DTRACEBUF_INACTIVE;
}
for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
/*
* The size must be 8-byte aligned. If the size is not 8-byte
* aligned, drop it down by the difference.
*/
if (size & (sizeof (uint64_t) - 1))
size -= size & (sizeof (uint64_t) - 1);
if (size < state->dts_reserve) {
/*
* Buffers must always be large enough to accommodate
* their prereserved space. We return E2BIG instead
* of ENOMEM in this case to allow for user-level
* software to differentiate the cases.
*/
return (E2BIG);
}
rval = dtrace_buffer_alloc(buf, size, flags, cpu);
if (rval != ENOMEM) {
opt[which] = size;
return (rval);
}
if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
return (rval);
}
return (ENOMEM);
}
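/*
* Allocate the principal buffer, the aggregation buffer and any
* speculation buffers according to the current option values.
*/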
static int
dtrace_state_buffers(dtrace_state_t *state)
{
dtrace_speculation_t *spec = state->dts_speculations;
int rval, i;
if ((rval = dtrace_state_buffer(state, state->dts_buffer,
DTRACEOPT_BUFSIZE)) != 0)
return (rval);
if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
DTRACEOPT_AGGSIZE)) != 0)
return (rval);
for (i = 0; i < state->dts_nspeculations; i++) {
if ((rval = dtrace_state_buffer(state,
spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
return (rval);
}
return (0);
}
static void
dtrace_state_prereserve(dtrace_state_t *state)
{
dtrace_ecb_t *ecb;
dtrace_probe_t *probe;
state->dts_reserve = 0;
if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
return;
/*
* If our buffer policy is a "fill" buffer policy, we need to set the
* prereserved space to be the space required by the END probes.
*/
probe = dtrace_probes[dtrace_probeid_end - 1];
ASSERT(probe != NULL);
for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
if (ecb->dte_state != state)
continue;
state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
}
}
static int
dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
{
dtrace_optval_t *opt = state->dts_options, sz, nspec;
dtrace_speculation_t *spec;
dtrace_buffer_t *buf;
#if defined(sun)
cyc_handler_t hdlr;
cyc_time_t when;
#endif
int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
dtrace_icookie_t cookie;
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
rval = EBUSY;
goto out;
}
/*
* Before we can perform any checks, we must prime all of the
* retained enablings that correspond to this state.
*/
dtrace_enabling_prime(state);
if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
rval = EACCES;
goto out;
}
dtrace_state_prereserve(state);
/*
* Now we want to try to allocate our speculations.
* We do not automatically resize the number of speculations; if
* this fails, we will fail the operation.
*/
nspec = opt[DTRACEOPT_NSPEC];
ASSERT(nspec != DTRACEOPT_UNSET);
if (nspec > INT_MAX) {
rval = ENOMEM;
goto out;
}
spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
if (spec == NULL) {
rval = ENOMEM;
goto out;
}
state->dts_speculations = spec;
state->dts_nspeculations = (int)nspec;
for (i = 0; i < nspec; i++) {
if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
rval = ENOMEM;
goto err;
}
spec[i].dtsp_buffer = buf;
}
if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
if (dtrace_anon.dta_state == NULL) {
rval = ENOENT;
goto out;
}
if (state->dts_necbs != 0) {
rval = EALREADY;
goto out;
}
state->dts_anon = dtrace_anon_grab();
ASSERT(state->dts_anon != NULL);
state = state->dts_anon;
/*
* We want "grabanon" to be set in the grabbed state, so we'll
* copy that option value from the grabbing state into the
* grabbed state.
*/
state->dts_options[DTRACEOPT_GRABANON] =
opt[DTRACEOPT_GRABANON];
*cpu = dtrace_anon.dta_beganon;
/*
* If the anonymous state is active (as it almost certainly
* is if the anonymous enabling ultimately matched anything),
* we don't allow any further option processing -- but we
* don't return failure.
*/
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
goto out;
}
if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
opt[DTRACEOPT_AGGSIZE] != 0) {
if (state->dts_aggregations == NULL) {
/*
* We're not going to create an aggregation buffer
* because we don't have any ECBs that contain
* aggregations -- set this option to 0.
*/
opt[DTRACEOPT_AGGSIZE] = 0;
} else {
/*
* If we have an aggregation buffer, we must also have
* a buffer to use as scratch.
*/
if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
}
}
}
if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
opt[DTRACEOPT_SPECSIZE] != 0) {
if (!state->dts_speculates) {
/*
* We're not going to create speculation buffers
* because we don't have any ECBs that actually
* speculate -- set the speculation size to 0.
*/
opt[DTRACEOPT_SPECSIZE] = 0;
}
}
/*
* The bare minimum size for any buffer that we're actually going to
* do anything to is sizeof (uint64_t).
*/
sz = sizeof (uint64_t);
if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
(state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
(state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
/*
* A buffer size has been explicitly set to 0 (or to a size
* that will be adjusted to 0) and we need the space -- we
* need to return failure. We return ENOSPC to differentiate
* it from failing to allocate a buffer due to failure to meet
* the reserve (for which we return E2BIG).
*/
rval = ENOSPC;
goto out;
}
if ((rval = dtrace_state_buffers(state)) != 0)
goto err;
if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
sz = dtrace_dstate_defsize;
do {
rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
if (rval == 0)
break;
if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
goto err;
} while (sz >>= 1);
opt[DTRACEOPT_DYNVARSIZE] = sz;
if (rval != 0)
goto err;
if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
if (opt[DTRACEOPT_CLEANRATE] == 0)
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
#if defined(sun)
hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
hdlr.cyh_arg = state;
hdlr.cyh_level = CY_LOW_LEVEL;
when.cyt_when = 0;
when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
state->dts_cleaner = cyclic_add(&hdlr, &when);
hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
hdlr.cyh_arg = state;
hdlr.cyh_level = CY_LOW_LEVEL;
when.cyt_when = 0;
when.cyt_interval = dtrace_deadman_interval;
state->dts_deadman = cyclic_add(&hdlr, &when);
#else
callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
dtrace_state_clean, state);
callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
dtrace_state_deadman, state);
#endif
state->dts_activity = DTRACE_ACTIVITY_WARMUP;
/*
* Now it's time to actually fire the BEGIN probe. We need to disable
* interrupts here both to record the CPU on which we fired the BEGIN
* probe (the data from this CPU will be processed first at user
* level) and to manually activate the buffer for this CPU.
*/
cookie = dtrace_interrupt_disable();
*cpu = curcpu;
ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
dtrace_probe(dtrace_probeid_begin,
(uint64_t)(uintptr_t)state, 0, 0, 0, 0);
dtrace_interrupt_enable(cookie);
/*
* We may have had an exit action from a BEGIN probe; only change our
* state to ACTIVE if we're still in WARMUP.
*/
ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
state->dts_activity == DTRACE_ACTIVITY_DRAINING);
if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
/*
* Regardless of whether we're now in ACTIVE or DRAINING, we
* want each CPU to transition its principal buffer out of the
* INACTIVE state. Doing this assures that no CPU will suddenly begin
* processing an ECB halfway down a probe's ECB chain; all CPUs will
* atomically transition from processing none of a state's ECBs to
* processing all of them.
*/
dtrace_xcall(DTRACE_CPUALL,
(dtrace_xcall_t)dtrace_buffer_activate, state);
goto out;
err:
dtrace_buffer_free(state->dts_buffer);
dtrace_buffer_free(state->dts_aggbuffer);
if ((nspec = state->dts_nspeculations) == 0) {
ASSERT(state->dts_speculations == NULL);
goto out;
}
spec = state->dts_speculations;
ASSERT(spec != NULL);
for (i = 0; i < state->dts_nspeculations; i++) {
if ((buf = spec[i].dtsp_buffer) == NULL)
break;
dtrace_buffer_free(buf);
kmem_free(buf, bufsize);
}
kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
state->dts_nspeculations = 0;
state->dts_speculations = NULL;
out:
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
return (rval);
}
static int
dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
{
dtrace_icookie_t cookie;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
state->dts_activity != DTRACE_ACTIVITY_DRAINING)
return (EINVAL);
/*
* We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
* to be sure that every CPU has seen it. See below for the details
* on why this is done.
*/
state->dts_activity = DTRACE_ACTIVITY_DRAINING;
dtrace_sync();
/*
* By this point, it is impossible for any CPU to be still processing
* with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
* DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
* other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
* and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
* iff we're in the END probe.
*/
state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
dtrace_sync();
ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
/*
* Finally, we can release the reserve and call the END probe. We
* disable interrupts across calling the END probe to allow us to
* return the CPU on which we actually called the END probe. This
* allows user-land to be sure that this CPU's principal buffer is
* processed last.
*/
state->dts_reserve = 0;
cookie = dtrace_interrupt_disable();
*cpu = curcpu;
dtrace_probe(dtrace_probeid_end,
(uint64_t)(uintptr_t)state, 0, 0, 0, 0);
dtrace_interrupt_enable(cookie);
state->dts_activity = DTRACE_ACTIVITY_STOPPED;
dtrace_sync();
return (0);
}
static int
dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
dtrace_optval_t val)
{
ASSERT(MUTEX_HELD(&dtrace_lock));
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
return (EBUSY);
if (option >= DTRACEOPT_MAX)
return (EINVAL);
if (option != DTRACEOPT_CPU && val < 0)
return (EINVAL);
switch (option) {
case DTRACEOPT_DESTRUCTIVE:
if (dtrace_destructive_disallow)
return (EACCES);
state->dts_cred.dcr_destructive = 1;
break;
case DTRACEOPT_BUFSIZE:
case DTRACEOPT_DYNVARSIZE:
case DTRACEOPT_AGGSIZE:
case DTRACEOPT_SPECSIZE:
case DTRACEOPT_STRSIZE:
if (val < 0)
return (EINVAL);
if (val >= LONG_MAX) {
/*
* If this is an otherwise negative value, set it to
* the highest multiple of 128m less than LONG_MAX.
* Technically, we're adjusting the size without
* regard to the buffer resizing policy, but in fact,
* this has no effect -- if we set the buffer size to
* ~LONG_MAX and the buffer policy is ultimately set to
* be "manual", the buffer allocation is guaranteed to
* fail, if only because the allocation requires two
* buffers. (We set the size to the highest
* multiple of 128m because it ensures that the size
* will remain a multiple of a megabyte when
* repeatedly halved -- all the way down to 15m.)
*/
val = LONG_MAX - (1 << 27) + 1;
}
}
state->dts_options[option] = val;
return (0);
}
static void
dtrace_state_destroy(dtrace_state_t *state)
{
dtrace_ecb_t *ecb;
dtrace_vstate_t *vstate = &state->dts_vstate;
#if defined(sun)
minor_t minor = getminor(state->dts_dev);
#endif
int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
dtrace_speculation_t *spec = state->dts_speculations;
int nspec = state->dts_nspeculations;
uint32_t match;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* First, retract any retained enablings for this state.
*/
dtrace_enabling_retract(state);
ASSERT(state->dts_nretained == 0);
if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
/*
* We have managed to come into dtrace_state_destroy() on a
* hot enabling -- almost certainly because of a disorderly
* shutdown of a consumer. (That is, a consumer that is
* exiting without having called dtrace_stop().) In this case,
* we're going to set our activity to be KILLED, and then
* issue a sync to be sure that everyone is out of probe
* context before we start blowing away ECBs.
*/
state->dts_activity = DTRACE_ACTIVITY_KILLED;
dtrace_sync();
}
/*
* Release the credential hold we took in dtrace_state_create().
*/
if (state->dts_cred.dcr_cred != NULL)
crfree(state->dts_cred.dcr_cred);
/*
* Now we can safely disable and destroy any enabled probes. Because
* any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
* (especially if they're all enabled), we take two passes through the
* ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
* in the second we disable whatever is left over.
*/
for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
for (i = 0; i < state->dts_necbs; i++) {
if ((ecb = state->dts_ecbs[i]) == NULL)
continue;
if (match && ecb->dte_probe != NULL) {
dtrace_probe_t *probe = ecb->dte_probe;
dtrace_provider_t *prov = probe->dtpr_provider;
if (!(prov->dtpv_priv.dtpp_flags & match))
continue;
}
dtrace_ecb_disable(ecb);
dtrace_ecb_destroy(ecb);
}
if (!match)
break;
}
/*
* Before we free the buffers, perform one more sync to assure that
* every CPU is out of probe context.
*/
dtrace_sync();
dtrace_buffer_free(state->dts_buffer);
dtrace_buffer_free(state->dts_aggbuffer);
for (i = 0; i < nspec; i++)
dtrace_buffer_free(spec[i].dtsp_buffer);
#if defined(sun)
if (state->dts_cleaner != CYCLIC_NONE)
cyclic_remove(state->dts_cleaner);
if (state->dts_deadman != CYCLIC_NONE)
cyclic_remove(state->dts_deadman);
#else
callout_stop(&state->dts_cleaner);
callout_drain(&state->dts_cleaner);
callout_stop(&state->dts_deadman);
callout_drain(&state->dts_deadman);
#endif
dtrace_dstate_fini(&vstate->dtvs_dynvars);
dtrace_vstate_fini(vstate);
if (state->dts_ecbs != NULL)
kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
if (state->dts_aggregations != NULL) {
#ifdef DEBUG
for (i = 0; i < state->dts_naggregations; i++)
ASSERT(state->dts_aggregations[i] == NULL);
#endif
ASSERT(state->dts_naggregations > 0);
kmem_free(state->dts_aggregations,
state->dts_naggregations * sizeof (dtrace_aggregation_t *));
}
kmem_free(state->dts_buffer, bufsize);
kmem_free(state->dts_aggbuffer, bufsize);
for (i = 0; i < nspec; i++)
kmem_free(spec[i].dtsp_buffer, bufsize);
if (spec != NULL)
kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
dtrace_format_destroy(state);
if (state->dts_aggid_arena != NULL) {
#if defined(sun)
vmem_destroy(state->dts_aggid_arena);
#else
delete_unrhdr(state->dts_aggid_arena);
#endif
state->dts_aggid_arena = NULL;
}
#if defined(sun)
ddi_soft_state_free(dtrace_softstate, minor);
vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
#endif
}
/*
* DTrace Anonymous Enabling Functions
*/
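/*
* Detach the anonymous state from dtrace_anon, destroying the anonymous
* enabling and returning the state (or NULL if there is none) to the
* caller, which assumes ownership of it.
*/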
static dtrace_state_t *
dtrace_anon_grab(void)
{
dtrace_state_t *state;
ASSERT(MUTEX_HELD(&dtrace_lock));
if ((state = dtrace_anon.dta_state) == NULL) {
ASSERT(dtrace_anon.dta_enabling == NULL);
return (NULL);
}
ASSERT(dtrace_anon.dta_enabling != NULL);
ASSERT(dtrace_retained != NULL);
dtrace_enabling_destroy(dtrace_anon.dta_enabling);
dtrace_anon.dta_enabling = NULL;
dtrace_anon.dta_state = NULL;
return (state);
}
static void
dtrace_anon_property(void)
{
int i, rv;
dtrace_state_t *state;
dof_hdr_t *dof;
char c[32]; /* enough for "dof-data-" + digits */
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
for (i = 0; ; i++) {
(void) snprintf(c, sizeof (c), "dof-data-%d", i);
dtrace_err_verbose = 1;
if ((dof = dtrace_dof_property(c)) == NULL) {
dtrace_err_verbose = 0;
break;
}
#if defined(sun)
/*
* We want to create anonymous state, so we need to transition
* the kernel debugger to indicate that DTrace is active. If
* this fails (e.g. because the debugger has modified text in
* some way), we won't continue with the processing.
*/
if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
cmn_err(CE_NOTE, "kernel debugger active; anonymous "
"enabling ignored.");
dtrace_dof_destroy(dof);
break;
}
#endif
/*
* If we haven't allocated an anonymous state, we'll do so now.
*/
if ((state = dtrace_anon.dta_state) == NULL) {
#if defined(sun)
state = dtrace_state_create(NULL, NULL);
#else
state = dtrace_state_create(NULL);
#endif
dtrace_anon.dta_state = state;
if (state == NULL) {
/*
* This basically shouldn't happen: the only
* failure mode from dtrace_state_create() is a
* failure of ddi_soft_state_zalloc() that
* itself should never happen. Still, the
* interface allows for a failure mode, and
* we want to fail as gracefully as possible:
* we'll emit an error message and cease
* processing anonymous state in this case.
*/
cmn_err(CE_WARN, "failed to create "
"anonymous state");
dtrace_dof_destroy(dof);
break;
}
}
rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
&dtrace_anon.dta_enabling, 0, B_TRUE);
if (rv == 0)
rv = dtrace_dof_options(dof, state);
dtrace_err_verbose = 0;
dtrace_dof_destroy(dof);
if (rv != 0) {
/*
* This is malformed DOF; chuck any anonymous state
* that we created.
*/
ASSERT(dtrace_anon.dta_enabling == NULL);
dtrace_state_destroy(state);
dtrace_anon.dta_state = NULL;
break;
}
ASSERT(dtrace_anon.dta_enabling != NULL);
}
if (dtrace_anon.dta_enabling != NULL) {
int rval;
/*
* dtrace_enabling_retain() can only fail because we are
* trying to retain more enablings than are allowed -- but
* we only have one anonymous enabling, and we are guaranteed
* to be allowed at least one retained enabling; we assert
* that dtrace_enabling_retain() returns success.
*/
rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
ASSERT(rval == 0);
dtrace_enabling_dump(dtrace_anon.dta_enabling);
}
}
/*
* DTrace Helper Functions
*/
static void
dtrace_helper_trace(dtrace_helper_action_t *helper,
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
{
uint32_t size, next, nnext, i;
dtrace_helptrace_t *ent;
uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
if (!dtrace_helptrace_enabled)
return;
ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
/*
* What would a tracing framework be without its own tracing
* framework? (Well, a hell of a lot simpler, for starters...)
*/
size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
sizeof (uint64_t) - sizeof (uint64_t);
/*
* Iterate until we can allocate a slot in the trace buffer.
*/
do {
next = dtrace_helptrace_next;
if (next + size < dtrace_helptrace_bufsize) {
nnext = next + size;
} else {
nnext = size;
}
} while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
/*
* We have our slot; fill it in.
*/
if (nnext == size)
next = 0;
ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
ent->dtht_helper = helper;
ent->dtht_where = where;
ent->dtht_nlocals = vstate->dtvs_nlocals;
ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
mstate->dtms_fltoffs : -1;
ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
for (i = 0; i < vstate->dtvs_nlocals; i++) {
dtrace_statvar_t *svar;
if ((svar = vstate->dtvs_locals[i]) == NULL)
continue;
ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
ent->dtht_locals[i] =
((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
}
}
static uint64_t
dtrace_helper(int which, dtrace_mstate_t *mstate,
dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
{
uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
uint64_t sarg0 = mstate->dtms_arg[0];
uint64_t sarg1 = mstate->dtms_arg[1];
uint64_t rval = 0;
dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
dtrace_helper_action_t *helper;
dtrace_vstate_t *vstate;
dtrace_difo_t *pred;
int i, trace = dtrace_helptrace_enabled;
ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
if (helpers == NULL)
return (0);
if ((helper = helpers->dthps_actions[which]) == NULL)
return (0);
vstate = &helpers->dthps_vstate;
mstate->dtms_arg[0] = arg0;
mstate->dtms_arg[1] = arg1;
/*
* Now iterate over each helper. If its predicate evaluates to 'true',
* we'll call the corresponding actions. Note that the below calls
* to dtrace_dif_emulate() may set faults in machine state. This is
* okay: our caller (the outer dtrace_dif_emulate()) will simply plow
* the stored DIF offset with its own (which is the desired behavior).
* Also, note the calls to dtrace_dif_emulate() may allocate scratch
* from machine state; this is okay, too.
*/
for (; helper != NULL; helper = helper->dtha_next) {
if ((pred = helper->dtha_predicate) != NULL) {
if (trace)
dtrace_helper_trace(helper, mstate, vstate, 0);
if (!dtrace_dif_emulate(pred, mstate, vstate, state))
goto next;
if (*flags & CPU_DTRACE_FAULT)
goto err;
}
for (i = 0; i < helper->dtha_nactions; i++) {
if (trace)
dtrace_helper_trace(helper,
mstate, vstate, i + 1);
rval = dtrace_dif_emulate(helper->dtha_actions[i],
mstate, vstate, state);
if (*flags & CPU_DTRACE_FAULT)
goto err;
}
next:
if (trace)
dtrace_helper_trace(helper, mstate, vstate,
DTRACE_HELPTRACE_NEXT);
}
if (trace)
dtrace_helper_trace(helper, mstate, vstate,
DTRACE_HELPTRACE_DONE);
/*
* Restore the arg0 that we saved upon entry.
*/
mstate->dtms_arg[0] = sarg0;
mstate->dtms_arg[1] = sarg1;
return (rval);
err:
if (trace)
dtrace_helper_trace(helper, mstate, vstate,
DTRACE_HELPTRACE_ERR);
/*
* Restore the arg0 that we saved upon entry.
*/
mstate->dtms_arg[0] = sarg0;
mstate->dtms_arg[1] = sarg1;
return (0);
}
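/*
* Release the predicate and action DIFOs held by a helper action and
* free the helper action itself.
*/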
static void
dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
dtrace_vstate_t *vstate)
{
int i;
if (helper->dtha_predicate != NULL)
dtrace_difo_release(helper->dtha_predicate, vstate);
for (i = 0; i < helper->dtha_nactions; i++) {
ASSERT(helper->dtha_actions[i] != NULL);
dtrace_difo_release(helper->dtha_actions[i], vstate);
}
kmem_free(helper->dtha_actions,
helper->dtha_nactions * sizeof (dtrace_difo_t *));
kmem_free(helper, sizeof (dtrace_helper_action_t));
}
static int
dtrace_helper_destroygen(int gen)
{
proc_t *p = curproc;
dtrace_helpers_t *help = p->p_dtrace_helpers;
dtrace_vstate_t *vstate;
int i;
ASSERT(MUTEX_HELD(&dtrace_lock));
if (help == NULL || gen > help->dthps_generation)
return (EINVAL);
vstate = &help->dthps_vstate;
for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
dtrace_helper_action_t *last = NULL, *h, *next;
for (h = help->dthps_actions[i]; h != NULL; h = next) {
next = h->dtha_next;
if (h->dtha_generation == gen) {
if (last != NULL) {
last->dtha_next = next;
} else {
help->dthps_actions[i] = next;
}
dtrace_helper_action_destroy(h, vstate);
} else {
last = h;
}
}
}
/*
* Iterate until we've cleared out all helper providers with the
* given generation number.
*/
for (;;) {
dtrace_helper_provider_t *prov;
/*
* Look for a helper provider with the right generation. We
* have to start back at the beginning of the list each time
* because we drop dtrace_lock. It's unlikely that we'll make
* more than two passes.
*/
for (i = 0; i < help->dthps_nprovs; i++) {
prov = help->dthps_provs[i];
if (prov->dthp_generation == gen)
break;
}
/*
* If there were no matches, we're done.
*/
if (i == help->dthps_nprovs)
break;
/*
* Move the last helper provider into this slot.
*/
help->dthps_nprovs--;
help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
help->dthps_provs[help->dthps_nprovs] = NULL;
mutex_exit(&dtrace_lock);
/*
* If we have a meta provider, remove this helper provider.
*/
mutex_enter(&dtrace_meta_lock);
if (dtrace_meta_pid != NULL) {
ASSERT(dtrace_deferred_pid == NULL);
dtrace_helper_provider_remove(&prov->dthp_prov,
p->p_pid);
}
mutex_exit(&dtrace_meta_lock);
dtrace_helper_provider_destroy(prov);
mutex_enter(&dtrace_lock);
}
return (0);
}
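/*
* Validate that a helper's predicate and all of its actions are legal in
* helper context; returns non-zero iff the helper validates successfully.
*/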
static int
dtrace_helper_validate(dtrace_helper_action_t *helper)
{
int err = 0, i;
dtrace_difo_t *dp;
if ((dp = helper->dtha_predicate) != NULL)
err += dtrace_difo_validate_helper(dp);
for (i = 0; i < helper->dtha_nactions; i++)
err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
return (err == 0);
}
static int
dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
{
dtrace_helpers_t *help;
dtrace_helper_action_t *helper, *last;
dtrace_actdesc_t *act;
dtrace_vstate_t *vstate;
dtrace_predicate_t *pred;
int count = 0, nactions = 0, i;
if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
return (EINVAL);
help = curproc->p_dtrace_helpers;
last = help->dthps_actions[which];
vstate = &help->dthps_vstate;
for (count = 0; last != NULL; last = last->dtha_next) {
count++;
if (last->dtha_next == NULL)
break;
}
/*
* If we already have dtrace_helper_actions_max helper actions for this
* helper action type, we'll refuse to add a new one.
*/
if (count >= dtrace_helper_actions_max)
return (ENOSPC);
helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
helper->dtha_generation = help->dthps_generation;
if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
ASSERT(pred->dtp_difo != NULL);
dtrace_difo_hold(pred->dtp_difo);
helper->dtha_predicate = pred->dtp_difo;
}
for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
if (act->dtad_kind != DTRACEACT_DIFEXPR)
goto err;
if (act->dtad_difo == NULL)
goto err;
nactions++;
}
helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
(helper->dtha_nactions = nactions), KM_SLEEP);
for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
dtrace_difo_hold(act->dtad_difo);
helper->dtha_actions[i++] = act->dtad_difo;
}
if (!dtrace_helper_validate(helper))
goto err;
if (last == NULL) {
help->dthps_actions[which] = helper;
} else {
last->dtha_next = helper;
}
if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
dtrace_helptrace_next = 0;
}
return (0);
err:
dtrace_helper_action_destroy(helper, vstate);
return (EINVAL);
}
static void
dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
dof_helper_t *dofhp)
{
ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
mutex_enter(&dtrace_meta_lock);
mutex_enter(&dtrace_lock);
if (!dtrace_attached() || dtrace_meta_pid == NULL) {
/*
* If the dtrace module is loaded but not attached, or if
* there isn't a meta provider registered to deal with
* these provider descriptions, we need to postpone creating
* the actual providers until later.
*/
if (help->dthps_next == NULL && help->dthps_prev == NULL &&
dtrace_deferred_pid != help) {
help->dthps_deferred = 1;
help->dthps_pid = p->p_pid;
help->dthps_next = dtrace_deferred_pid;
help->dthps_prev = NULL;
if (dtrace_deferred_pid != NULL)
dtrace_deferred_pid->dthps_prev = help;
dtrace_deferred_pid = help;
}
mutex_exit(&dtrace_lock);
} else if (dofhp != NULL) {
/*
* If the dtrace module is loaded and we have a particular
* helper provider description, pass that off to the
* meta provider.
*/
mutex_exit(&dtrace_lock);
dtrace_helper_provide(dofhp, p->p_pid);
} else {
/*
* Otherwise, just pass all the helper provider descriptions
* off to the meta provider.
*/
int i;
mutex_exit(&dtrace_lock);
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
p->p_pid);
}
}
mutex_exit(&dtrace_meta_lock);
}
static int
dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
{
dtrace_helpers_t *help;
dtrace_helper_provider_t *hprov, **tmp_provs;
uint_t tmp_maxprovs, i;
ASSERT(MUTEX_HELD(&dtrace_lock));
help = curproc->p_dtrace_helpers;
ASSERT(help != NULL);
/*
* If we already have dtrace_helper_providers_max helper providers,
* we refuse to add a new one.
*/
if (help->dthps_nprovs >= dtrace_helper_providers_max)
return (ENOSPC);
/*
* Check to make sure this isn't a duplicate.
*/
for (i = 0; i < help->dthps_nprovs; i++) {
if (dofhp->dofhp_addr ==
help->dthps_provs[i]->dthp_prov.dofhp_addr)
return (EALREADY);
}
hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
hprov->dthp_prov = *dofhp;
hprov->dthp_ref = 1;
hprov->dthp_generation = gen;
/*
* Allocate a bigger table for helper providers if it's already full.
*/
if (help->dthps_maxprovs == help->dthps_nprovs) {
tmp_maxprovs = help->dthps_maxprovs;
tmp_provs = help->dthps_provs;
if (help->dthps_maxprovs == 0)
help->dthps_maxprovs = 2;
else
help->dthps_maxprovs *= 2;
if (help->dthps_maxprovs > dtrace_helper_providers_max)
help->dthps_maxprovs = dtrace_helper_providers_max;
ASSERT(tmp_maxprovs < help->dthps_maxprovs);
help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
sizeof (dtrace_helper_provider_t *), KM_SLEEP);
if (tmp_provs != NULL) {
bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
sizeof (dtrace_helper_provider_t *));
kmem_free(tmp_provs, tmp_maxprovs *
sizeof (dtrace_helper_provider_t *));
}
}
help->dthps_provs[help->dthps_nprovs] = hprov;
help->dthps_nprovs++;
return (0);
}
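/*
* Drop a reference on a helper provider; when the last reference is
* released, destroy the provider's DOF and free the provider itself.
*/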
static void
dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
{
mutex_enter(&dtrace_lock);
if (--hprov->dthp_ref == 0) {
dof_hdr_t *dof;
mutex_exit(&dtrace_lock);
dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
dtrace_dof_destroy(dof);
kmem_free(hprov, sizeof (dtrace_helper_provider_t));
} else {
mutex_exit(&dtrace_lock);
}
}
static int
dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
{
uintptr_t daddr = (uintptr_t)dof;
dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
dof_provider_t *provider;
dof_probe_t *probe;
uint8_t *arg;
char *strtab, *typestr;
dof_stridx_t typeidx;
size_t typesz;
uint_t nprobes, j, k;
ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
dtrace_dof_error(dof, "misaligned section offset");
return (-1);
}
/*
* The section needs to be large enough to contain the DOF provider
* structure appropriate for the given version.
*/
if (sec->dofs_size <
((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
offsetof(dof_provider_t, dofpv_prenoffs) :
sizeof (dof_provider_t))) {
dtrace_dof_error(dof, "provider section too small");
return (-1);
}
provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
if (str_sec == NULL || prb_sec == NULL ||
arg_sec == NULL || off_sec == NULL)
return (-1);
enoff_sec = NULL;
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
provider->dofpv_prenoffs != DOF_SECT_NONE &&
(enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
provider->dofpv_prenoffs)) == NULL)
return (-1);
strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
if (provider->dofpv_name >= str_sec->dofs_size ||
strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
dtrace_dof_error(dof, "invalid provider name");
return (-1);
}
if (prb_sec->dofs_entsize == 0 ||
prb_sec->dofs_entsize > prb_sec->dofs_size) {
dtrace_dof_error(dof, "invalid entry size");
return (-1);
}
if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
dtrace_dof_error(dof, "misaligned entry size");
return (-1);
}
if (off_sec->dofs_entsize != sizeof (uint32_t)) {
dtrace_dof_error(dof, "invalid entry size");
return (-1);
}
if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
dtrace_dof_error(dof, "misaligned section offset");
return (-1);
}
if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
dtrace_dof_error(dof, "invalid entry size");
return (-1);
}
arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
/*
* Take a pass through the probes to check for errors.
*/
for (j = 0; j < nprobes; j++) {
probe = (dof_probe_t *)(uintptr_t)(daddr +
prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
if (probe->dofpr_func >= str_sec->dofs_size) {
dtrace_dof_error(dof, "invalid function name");
return (-1);
}
if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
dtrace_dof_error(dof, "function name too long");
return (-1);
}
if (probe->dofpr_name >= str_sec->dofs_size ||
strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
dtrace_dof_error(dof, "invalid probe name");
return (-1);
}
/*
* The offset count must not wrap the index, and the offsets
* must also not overflow the section's data.
*/
if (probe->dofpr_offidx + probe->dofpr_noffs <
probe->dofpr_offidx ||
(probe->dofpr_offidx + probe->dofpr_noffs) *
off_sec->dofs_entsize > off_sec->dofs_size) {
dtrace_dof_error(dof, "invalid probe offset");
return (-1);
}
if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
/*
* If there's no is-enabled offset section, make sure
* there aren't any is-enabled offsets. Otherwise
* perform the same checks as for probe offsets
* (immediately above).
*/
if (enoff_sec == NULL) {
if (probe->dofpr_enoffidx != 0 ||
probe->dofpr_nenoffs != 0) {
dtrace_dof_error(dof, "is-enabled "
"offsets with null section");
return (-1);
}
} else if (probe->dofpr_enoffidx +
probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
(probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
dtrace_dof_error(dof, "invalid is-enabled "
"offset");
return (-1);
}
if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
dtrace_dof_error(dof, "zero probe and "
"is-enabled offsets");
return (-1);
}
} else if (probe->dofpr_noffs == 0) {
dtrace_dof_error(dof, "zero probe offsets");
return (-1);
}
if (probe->dofpr_argidx + probe->dofpr_xargc <
probe->dofpr_argidx ||
(probe->dofpr_argidx + probe->dofpr_xargc) *
arg_sec->dofs_entsize > arg_sec->dofs_size) {
dtrace_dof_error(dof, "invalid args");
return (-1);
}
typeidx = probe->dofpr_nargv;
typestr = strtab + probe->dofpr_nargv;
for (k = 0; k < probe->dofpr_nargc; k++) {
if (typeidx >= str_sec->dofs_size) {
dtrace_dof_error(dof, "bad "
"native argument type");
return (-1);
}
typesz = strlen(typestr) + 1;
if (typesz > DTRACE_ARGTYPELEN) {
dtrace_dof_error(dof, "native "
"argument type too long");
return (-1);
}
typeidx += typesz;
typestr += typesz;
}
typeidx = probe->dofpr_xargv;
typestr = strtab + probe->dofpr_xargv;
for (k = 0; k < probe->dofpr_xargc; k++) {
if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
dtrace_dof_error(dof, "bad "
"native argument index");
return (-1);
}
if (typeidx >= str_sec->dofs_size) {
dtrace_dof_error(dof, "bad "
"translated argument type");
return (-1);
}
typesz = strlen(typestr) + 1;
if (typesz > DTRACE_ARGTYPELEN) {
dtrace_dof_error(dof, "translated argument "
"type too long");
return (-1);
}
typeidx += typesz;
typestr += typesz;
}
}
return (0);
}
static int
dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
{
dtrace_helpers_t *help;
dtrace_vstate_t *vstate;
dtrace_enabling_t *enab = NULL;
int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
uintptr_t daddr = (uintptr_t)dof;
ASSERT(MUTEX_HELD(&dtrace_lock));
if ((help = curproc->p_dtrace_helpers) == NULL)
help = dtrace_helpers_create(curproc);
vstate = &help->dthps_vstate;
if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
dtrace_dof_destroy(dof);
return (rv);
}
/*
* Look for helper providers and validate their descriptions.
*/
if (dhp != NULL) {
for (i = 0; i < dof->dofh_secnum; i++) {
dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
dof->dofh_secoff + i * dof->dofh_secsize);
if (sec->dofs_type != DOF_SECT_PROVIDER)
continue;
if (dtrace_helper_provider_validate(dof, sec) != 0) {
dtrace_enabling_destroy(enab);
dtrace_dof_destroy(dof);
return (-1);
}
nprovs++;
}
}
/*
* Now we need to walk through the ECB descriptions in the enabling.
*/
for (i = 0; i < enab->dten_ndesc; i++) {
dtrace_ecbdesc_t *ep = enab->dten_desc[i];
dtrace_probedesc_t *desc = &ep->dted_probe;
if (strcmp(desc->dtpd_provider, "dtrace") != 0)
continue;
if (strcmp(desc->dtpd_mod, "helper") != 0)
continue;
if (strcmp(desc->dtpd_func, "ustack") != 0)
continue;
if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
ep)) != 0) {
/*
* Adding this helper action failed -- we are now going
* to rip out the entire generation and return failure.
*/
(void) dtrace_helper_destroygen(help->dthps_generation);
dtrace_enabling_destroy(enab);
dtrace_dof_destroy(dof);
return (-1);
}
nhelpers++;
}
if (nhelpers < enab->dten_ndesc)
dtrace_dof_error(dof, "unmatched helpers");
gen = help->dthps_generation++;
dtrace_enabling_destroy(enab);
if (dhp != NULL && nprovs > 0) {
dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
if (dtrace_helper_provider_add(dhp, gen) == 0) {
mutex_exit(&dtrace_lock);
dtrace_helper_provider_register(curproc, help, dhp);
mutex_enter(&dtrace_lock);
destroy = 0;
}
}
if (destroy)
dtrace_dof_destroy(dof);
return (gen);
}
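/*
* Allocate a helpers structure for the specified process and attach it to
* the process's p_dtrace_helpers.
*/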
static dtrace_helpers_t *
dtrace_helpers_create(proc_t *p)
{
dtrace_helpers_t *help;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(p->p_dtrace_helpers == NULL);
help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
DTRACE_NHELPER_ACTIONS, KM_SLEEP);
p->p_dtrace_helpers = help;
dtrace_helpers++;
return (help);
}
#if defined(sun)
static
#endif
void
dtrace_helpers_destroy(proc_t *p)
{
dtrace_helpers_t *help;
dtrace_vstate_t *vstate;
#if defined(sun)
proc_t *p = curproc;
#endif
int i;
mutex_enter(&dtrace_lock);
ASSERT(p->p_dtrace_helpers != NULL);
ASSERT(dtrace_helpers > 0);
help = p->p_dtrace_helpers;
vstate = &help->dthps_vstate;
/*
* We're now going to lose the help from this process.
*/
p->p_dtrace_helpers = NULL;
dtrace_sync();
/*
* Destroy the helper actions.
*/
for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
dtrace_helper_action_t *h, *next;
for (h = help->dthps_actions[i]; h != NULL; h = next) {
next = h->dtha_next;
dtrace_helper_action_destroy(h, vstate);
h = next;
}
}
mutex_exit(&dtrace_lock);
/*
* Destroy the helper providers.
*/
if (help->dthps_maxprovs > 0) {
mutex_enter(&dtrace_meta_lock);
if (dtrace_meta_pid != NULL) {
ASSERT(dtrace_deferred_pid == NULL);
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provider_remove(
&help->dthps_provs[i]->dthp_prov, p->p_pid);
}
} else {
mutex_enter(&dtrace_lock);
ASSERT(help->dthps_deferred == 0 ||
help->dthps_next != NULL ||
help->dthps_prev != NULL ||
help == dtrace_deferred_pid);
/*
* Remove the helper from the deferred list.
*/
if (help->dthps_next != NULL)
help->dthps_next->dthps_prev = help->dthps_prev;
if (help->dthps_prev != NULL)
help->dthps_prev->dthps_next = help->dthps_next;
if (dtrace_deferred_pid == help) {
dtrace_deferred_pid = help->dthps_next;
ASSERT(help->dthps_prev == NULL);
}
mutex_exit(&dtrace_lock);
}
mutex_exit(&dtrace_meta_lock);
for (i = 0; i < help->dthps_nprovs; i++) {
dtrace_helper_provider_destroy(help->dthps_provs[i]);
}
kmem_free(help->dthps_provs, help->dthps_maxprovs *
sizeof (dtrace_helper_provider_t *));
}
mutex_enter(&dtrace_lock);
dtrace_vstate_fini(&help->dthps_vstate);
kmem_free(help->dthps_actions,
sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
kmem_free(help, sizeof (dtrace_helpers_t));
--dtrace_helpers;
mutex_exit(&dtrace_lock);
}
#if defined(sun)
static
#endif
void
dtrace_helpers_duplicate(proc_t *from, proc_t *to)
{
dtrace_helpers_t *help, *newhelp;
dtrace_helper_action_t *helper, *new, *last;
dtrace_difo_t *dp;
dtrace_vstate_t *vstate;
int i, j, sz, hasprovs = 0;
mutex_enter(&dtrace_lock);
ASSERT(from->p_dtrace_helpers != NULL);
ASSERT(dtrace_helpers > 0);
help = from->p_dtrace_helpers;
newhelp = dtrace_helpers_create(to);
ASSERT(to->p_dtrace_helpers != NULL);
newhelp->dthps_generation = help->dthps_generation;
vstate = &newhelp->dthps_vstate;
/*
* Duplicate the helper actions.
*/
for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
if ((helper = help->dthps_actions[i]) == NULL)
continue;
for (last = NULL; helper != NULL; helper = helper->dtha_next) {
new = kmem_zalloc(sizeof (dtrace_helper_action_t),
KM_SLEEP);
new->dtha_generation = helper->dtha_generation;
if ((dp = helper->dtha_predicate) != NULL) {
dp = dtrace_difo_duplicate(dp, vstate);
new->dtha_predicate = dp;
}
new->dtha_nactions = helper->dtha_nactions;
sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
for (j = 0; j < new->dtha_nactions; j++) {
dtrace_difo_t *dp = helper->dtha_actions[j];
ASSERT(dp != NULL);
dp = dtrace_difo_duplicate(dp, vstate);
new->dtha_actions[j] = dp;
}
if (last != NULL) {
last->dtha_next = new;
} else {
newhelp->dthps_actions[i] = new;
}
last = new;
}
}
/*
* Duplicate the helper providers and register them with the
* DTrace framework.
*/
if (help->dthps_nprovs > 0) {
newhelp->dthps_nprovs = help->dthps_nprovs;
newhelp->dthps_maxprovs = help->dthps_nprovs;
newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
sizeof (dtrace_helper_provider_t *), KM_SLEEP);
for (i = 0; i < newhelp->dthps_nprovs; i++) {
newhelp->dthps_provs[i] = help->dthps_provs[i];
newhelp->dthps_provs[i]->dthp_ref++;
}
hasprovs = 1;
}
mutex_exit(&dtrace_lock);
if (hasprovs)
dtrace_helper_provider_register(to, newhelp, NULL);
}
#if defined(sun)
/*
* DTrace Hook Functions
*/
static void
dtrace_module_loaded(modctl_t *ctl)
{
dtrace_provider_t *prv;
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
ASSERT(ctl->mod_busy);
/*
* We're going to call each provider's per-module provide operation
* specifying only this module.
*/
for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
/*
* If we have any retained enablings, we need to match against them.
* Enabling probes requires that cpu_lock be held, and we cannot hold
* cpu_lock here -- it is legal for cpu_lock to be held when loading a
* module. (In particular, this happens when loading scheduling
* classes.) So if we have any retained enablings, we need to dispatch
* our task queue to do the match for us.
*/
mutex_enter(&dtrace_lock);
if (dtrace_retained == NULL) {
mutex_exit(&dtrace_lock);
return;
}
(void) taskq_dispatch(dtrace_taskq,
(task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
mutex_exit(&dtrace_lock);
/*
* And now, for a little heuristic sleaze: in general, we want to
* match modules as soon as they load. However, we cannot guarantee
* this, because it would lead us to the lock ordering violation
* outlined above. The common case, of course, is that cpu_lock is
* _not_ held -- so we delay here for a clock tick, hoping that that's
* long enough for the task queue to do its work. If it's not, it's
* not a serious problem -- it just means that the module that we
* just loaded may not be immediately instrumentable.
*/
delay(1);
}
static void
dtrace_module_unloaded(modctl_t *ctl)
{
dtrace_probe_t template, *probe, *first, *next;
dtrace_provider_t *prov;
template.dtpr_mod = ctl->mod_modname;
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
mutex_enter(&dtrace_lock);
if (dtrace_bymod == NULL) {
/*
* The DTrace module is loaded (obviously) but not attached;
* we don't have any work to do.
*/
mutex_exit(&dtrace_provider_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_lock);
return;
}
for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
probe != NULL; probe = probe->dtpr_nextmod) {
if (probe->dtpr_ecb != NULL) {
mutex_exit(&dtrace_provider_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_lock);
/*
* This shouldn't _actually_ be possible -- we're
* unloading a module that has an enabled probe in it.
* (It's normally up to the provider to make sure that
* this can't happen.) However, because dtps_enable()
* doesn't have a failure mode, there can be an
* enable/unload race. Upshot: we don't want to
* assert, but we're not going to disable the
* probe, either.
*/
if (dtrace_err_verbose) {
cmn_err(CE_WARN, "unloaded module '%s' had "
"enabled probes", ctl->mod_modname);
}
return;
}
}
probe = first;
for (first = NULL; probe != NULL; probe = next) {
ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
dtrace_probes[probe->dtpr_id - 1] = NULL;
next = probe->dtpr_nextmod;
dtrace_hash_remove(dtrace_bymod, probe);
dtrace_hash_remove(dtrace_byfunc, probe);
dtrace_hash_remove(dtrace_byname, probe);
if (first == NULL) {
first = probe;
probe->dtpr_nextmod = NULL;
} else {
probe->dtpr_nextmod = first;
first = probe;
}
}
/*
* We've removed all of the module's probes from the hash chains and
* from the probe array. Now issue a dtrace_sync() to be sure that
* everyone has cleared out from any probe array processing.
*/
dtrace_sync();
for (probe = first; probe != NULL; probe = first) {
first = probe->dtpr_nextmod;
prov = probe->dtpr_provider;
prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
probe->dtpr_arg);
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
kmem_free(probe, sizeof (dtrace_probe_t));
}
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
}
static void
dtrace_suspend(void)
{
dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
}
static void
dtrace_resume(void)
{
dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
}
#endif
static int
dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
{
ASSERT(MUTEX_HELD(&cpu_lock));
mutex_enter(&dtrace_lock);
switch (what) {
case CPU_CONFIG: {
dtrace_state_t *state;
dtrace_optval_t *opt, rs, c;
/*
* For now, we only allocate a new buffer for anonymous state.
*/
if ((state = dtrace_anon.dta_state) == NULL)
break;
if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
break;
opt = state->dts_options;
c = opt[DTRACEOPT_CPU];
if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
break;
/*
* Regardless of what the actual policy is, we're going to
* temporarily set our resize policy to be manual. We're
* also going to temporarily set our CPU option to denote
* the newly configured CPU.
*/
rs = opt[DTRACEOPT_BUFRESIZE];
opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
(void) dtrace_state_buffers(state);
opt[DTRACEOPT_BUFRESIZE] = rs;
opt[DTRACEOPT_CPU] = c;
break;
}
case CPU_UNCONFIG:
/*
* We don't free the buffer in the CPU_UNCONFIG case. (The
* buffer will be freed when the consumer exits.)
*/
break;
default:
break;
}
mutex_exit(&dtrace_lock);
return (0);
}
#if defined(sun)
static void
dtrace_cpu_setup_initial(processorid_t cpu)
{
(void) dtrace_cpu_setup(CPU_CONFIG, cpu);
}
#endif
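/*
* Record a new toxic address range, doubling the size of the toxic range
* array if it is already full.
*/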
static void
dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
{
if (dtrace_toxranges >= dtrace_toxranges_max) {
int osize, nsize;
dtrace_toxrange_t *range;
osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
if (osize == 0) {
ASSERT(dtrace_toxrange == NULL);
ASSERT(dtrace_toxranges_max == 0);
dtrace_toxranges_max = 1;
} else {
dtrace_toxranges_max <<= 1;
}
nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
range = kmem_zalloc(nsize, KM_SLEEP);
if (dtrace_toxrange != NULL) {
ASSERT(osize != 0);
bcopy(dtrace_toxrange, range, osize);
kmem_free(dtrace_toxrange, osize);
}
dtrace_toxrange = range;
}
ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
dtrace_toxrange[dtrace_toxranges].dtt_base = base;
dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
dtrace_toxranges++;
}
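/*
 * A minimal sketch of how the toxic-range array above grows, assuming
 * three consecutive calls starting from an empty table:
 *
 * call 1: max 0 -> 1, allocate 1 * sizeof (dtrace_toxrange_t)
 * call 2: max 1 -> 2, allocate 2 entries, bcopy 1, free the old array
 * call 3: max 2 -> 4, allocate 4 entries, bcopy 2, free the old array
 *
 * i.e. the capacity doubles on demand and existing entries are preserved.
 */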
/*
* DTrace Driver Cookbook Functions
*/
#if defined(sun)
/*ARGSUSED*/
static int
dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
dtrace_provider_id_t id;
dtrace_state_t *state = NULL;
dtrace_enabling_t *enab;
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
if (ddi_soft_state_init(&dtrace_softstate,
sizeof (dtrace_state_t), 0) != 0) {
cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
return (DDI_FAILURE);
}
if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
ddi_remove_minor_node(devi, NULL);
ddi_soft_state_fini(&dtrace_softstate);
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
return (DDI_FAILURE);
}
ddi_report_dev(devi);
dtrace_devi = devi;
dtrace_modload = dtrace_module_loaded;
dtrace_modunload = dtrace_module_unloaded;
dtrace_cpu_init = dtrace_cpu_setup_initial;
dtrace_helpers_cleanup = dtrace_helpers_destroy;
dtrace_helpers_fork = dtrace_helpers_duplicate;
dtrace_cpustart_init = dtrace_suspend;
dtrace_cpustart_fini = dtrace_resume;
dtrace_debugger_init = dtrace_suspend;
dtrace_debugger_fini = dtrace_resume;
register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
ASSERT(MUTEX_HELD(&cpu_lock));
dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
VM_SLEEP | VMC_IDENTIFIER);
dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
1, INT_MAX, 0);
dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
NULL, NULL, NULL, NULL, NULL, 0);
ASSERT(MUTEX_HELD(&cpu_lock));
dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
offsetof(dtrace_probe_t, dtpr_nextmod),
offsetof(dtrace_probe_t, dtpr_prevmod));
dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
offsetof(dtrace_probe_t, dtpr_nextfunc),
offsetof(dtrace_probe_t, dtpr_prevfunc));
dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
offsetof(dtrace_probe_t, dtpr_nextname),
offsetof(dtrace_probe_t, dtpr_prevname));
if (dtrace_retain_max < 1) {
cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
"setting to 1", dtrace_retain_max);
dtrace_retain_max = 1;
}
/*
* Now discover our toxic ranges.
*/
dtrace_toxic_ranges(dtrace_toxrange_add);
/*
* Before we register ourselves as a provider to our own framework,
* we would like to assert that dtrace_provider is NULL -- but that's
* not true if we were loaded as a dependency of a DTrace provider.
* Once we've registered, we can assert that dtrace_provider is our
* pseudo provider.
*/
(void) dtrace_register("dtrace", &dtrace_provider_attr,
DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
ASSERT(dtrace_provider != NULL);
ASSERT((dtrace_provider_id_t)dtrace_provider == id);
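/*
 * Create the three probes published by the framework itself:
 * dtrace:::BEGIN, dtrace:::END and dtrace:::ERROR.
 */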
dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
dtrace_provider, NULL, NULL, "END", 0, NULL);
dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
dtrace_anon_property();
mutex_exit(&cpu_lock);
/*
* If DTrace helper tracing is enabled, we need to allocate the
* trace buffer and initialize the values.
*/
if (dtrace_helptrace_enabled) {
ASSERT(dtrace_helptrace_buffer == NULL);
dtrace_helptrace_buffer =
kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
dtrace_helptrace_next = 0;
}
/*
* If there are already providers, we must ask them to provide their
* probes, and then match any anonymous enabling against them. Note
* that there should be no other retained enablings at this time: the
* only retained enabling should be the anonymous enabling.
*/
if (dtrace_anon.dta_enabling != NULL) {
ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
dtrace_enabling_provide(NULL);
state = dtrace_anon.dta_state;
/*
* We couldn't hold cpu_lock across the above call to
* dtrace_enabling_provide(), but we must hold it to actually
* enable the probes. We have to drop all of our locks, pick
* up cpu_lock, and regain our locks before matching the
* retained anonymous enabling.
*/
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
if ((enab = dtrace_anon.dta_enabling) != NULL)
(void) dtrace_enabling_match(enab, NULL);
mutex_exit(&cpu_lock);
}
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
if (state != NULL) {
/*
* If we created any anonymous state, set it going now.
*/
(void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
}
return (DDI_SUCCESS);
}
#endif
#if !defined(sun)
#if __FreeBSD_version >= 800039
static void
dtrace_dtr(void *data __unused)
{
}
#endif
#endif
/*ARGSUSED*/
static int
#if defined(sun)
dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
#else
dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
#endif
{
dtrace_state_t *state;
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
#if defined(sun)
if (getminor(*devp) == DTRACEMNRN_HELPER)
return (0);
/*
* If this wasn't an open with the "helper" minor, then it must be
* the "dtrace" minor.
*/
ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
#else
cred_t *cred_p = NULL;
#if __FreeBSD_version < 800039
/*
* The first minor device is the one that is cloned so there is
* nothing more to do here.
*/
if (dev2unit(dev) == 0)
return 0;
/*
* Devices are cloned, so if the DTrace state has already
* been allocated, that means this device belongs to a
* different client. Each client should open '/dev/dtrace'
* to get a cloned device.
*/
if (dev->si_drv1 != NULL)
return (EBUSY);
#endif
cred_p = dev->si_cred;
#endif
/*
* If no DTRACE_PRIV_* bits are set in the credential, then the
* caller lacks sufficient permission to do anything with DTrace.
*/
dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
if (priv == DTRACE_PRIV_NONE) {
#if !defined(sun)
#if __FreeBSD_version < 800039
/* Destroy the cloned device. */
destroy_dev(dev);
#endif
#endif
return (EACCES);
}
/*
* Ask all providers to provide all their probes.
*/
mutex_enter(&dtrace_provider_lock);
dtrace_probe_provide(NULL, NULL);
mutex_exit(&dtrace_provider_lock);
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
dtrace_opens++;
dtrace_membar_producer();
#if defined(sun)
/*
* If the kernel debugger is active (that is, if the kernel debugger
* modified text in some way), we won't allow the open.
*/
if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
dtrace_opens--;
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_lock);
return (EBUSY);
}
state = dtrace_state_create(devp, cred_p);
#else
state = dtrace_state_create(dev);
#if __FreeBSD_version < 800039
dev->si_drv1 = state;
#else
devfs_set_cdevpriv(state, dtrace_dtr);
#endif
#endif
mutex_exit(&cpu_lock);
if (state == NULL) {
#if defined(sun)
if (--dtrace_opens == 0)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
--dtrace_opens;
#endif
mutex_exit(&dtrace_lock);
#if !defined(sun)
#if __FreeBSD_version < 800039
/* Destroy the cloned device. */
destroy_dev(dev);
#endif
#endif
return (EAGAIN);
}
mutex_exit(&dtrace_lock);
return (0);
}
/*ARGSUSED*/
static int
#if defined(sun)
dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
#else
dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
#endif
{
#if defined(sun)
minor_t minor = getminor(dev);
dtrace_state_t *state;
if (minor == DTRACEMNRN_HELPER)
return (0);
state = ddi_get_soft_state(dtrace_softstate, minor);
#else
#if __FreeBSD_version < 800039
dtrace_state_t *state = dev->si_drv1;
/* Check if this is not a cloned device. */
if (dev2unit(dev) == 0)
return (0);
#else
dtrace_state_t *state;
devfs_get_cdevpriv((void **) &state);
#endif
#endif
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
if (state != NULL) {
if (state->dts_anon) {
/*
* There is anonymous state. Destroy that first.
*/
ASSERT(dtrace_anon.dta_state == NULL);
dtrace_state_destroy(state->dts_anon);
}
dtrace_state_destroy(state);
#if !defined(sun)
kmem_free(state, 0);
#if __FreeBSD_version < 800039
dev->si_drv1 = NULL;
#else
devfs_clear_cdevpriv();
#endif
#endif
}
ASSERT(dtrace_opens > 0);
#if defined(sun)
if (--dtrace_opens == 0)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
--dtrace_opens;
#endif
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
#if __FreeBSD_version < 800039
/* Schedule this cloned device to be destroyed. */
destroy_dev_sched(dev);
#endif
return (0);
}
#if defined(sun)
/*ARGSUSED*/
static int
dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
{
int rval;
dof_helper_t help, *dhp = NULL;
switch (cmd) {
case DTRACEHIOC_ADDDOF:
if (copyin((void *)arg, &help, sizeof (help)) != 0) {
dtrace_dof_error(NULL, "failed to copyin DOF helper");
return (EFAULT);
}
dhp = &help;
arg = (intptr_t)help.dofhp_dof;
/*FALLTHROUGH*/
case DTRACEHIOC_ADD: {
dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
if (dof == NULL)
return (rval);
mutex_enter(&dtrace_lock);
/*
* dtrace_helper_slurp() takes responsibility for the dof --
* it may free it now or it may save it and free it later.
*/
if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
*rv = rval;
rval = 0;
} else {
rval = EINVAL;
}
mutex_exit(&dtrace_lock);
return (rval);
}
case DTRACEHIOC_REMOVE: {
mutex_enter(&dtrace_lock);
rval = dtrace_helper_destroygen(arg);
mutex_exit(&dtrace_lock);
return (rval);
}
default:
break;
}
return (ENOTTY);
}
/*ARGSUSED*/
static int
dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
minor_t minor = getminor(dev);
dtrace_state_t *state;
int rval;
if (minor == DTRACEMNRN_HELPER)
return (dtrace_ioctl_helper(cmd, arg, rv));
state = ddi_get_soft_state(dtrace_softstate, minor);
if (state->dts_anon) {
ASSERT(dtrace_anon.dta_state == NULL);
state = state->dts_anon;
}
switch (cmd) {
case DTRACEIOC_PROVIDER: {
dtrace_providerdesc_t pvd;
dtrace_provider_t *pvp;
if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
return (EFAULT);
pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
mutex_enter(&dtrace_provider_lock);
for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
break;
}
mutex_exit(&dtrace_provider_lock);
if (pvp == NULL)
return (ESRCH);
bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_EPROBE: {
dtrace_eprobedesc_t epdesc;
dtrace_ecb_t *ecb;
dtrace_action_t *act;
void *buf;
size_t size;
uintptr_t dest;
int nrecs;
if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
if (ecb->dte_probe == NULL) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
epdesc.dtepd_uarg = ecb->dte_uarg;
epdesc.dtepd_size = ecb->dte_size;
nrecs = epdesc.dtepd_nrecs;
epdesc.dtepd_nrecs = 0;
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
continue;
epdesc.dtepd_nrecs++;
}
/*
* Now that we have the size, we need to allocate a temporary
* buffer in which to store the complete description. We need
* the temporary buffer to be able to drop dtrace_lock()
* across the copyout(), below.
*/
size = sizeof (dtrace_eprobedesc_t) +
(epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
buf = kmem_alloc(size, KM_SLEEP);
dest = (uintptr_t)buf;
bcopy(&epdesc, (void *)dest, sizeof (epdesc));
dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
continue;
if (nrecs-- == 0)
break;
bcopy(&act->dta_rec, (void *)dest,
sizeof (dtrace_recdesc_t));
dest += sizeof (dtrace_recdesc_t);
}
mutex_exit(&dtrace_lock);
if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
kmem_free(buf, size);
return (EFAULT);
}
kmem_free(buf, size);
return (0);
}
case DTRACEIOC_AGGDESC: {
dtrace_aggdesc_t aggdesc;
dtrace_action_t *act;
dtrace_aggregation_t *agg;
int nrecs;
uint32_t offs;
dtrace_recdesc_t *lrec;
void *buf;
size_t size;
uintptr_t dest;
if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
nrecs = aggdesc.dtagd_nrecs;
aggdesc.dtagd_nrecs = 0;
offs = agg->dtag_base;
lrec = &agg->dtag_action.dta_rec;
aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
for (act = agg->dtag_first; ; act = act->dta_next) {
ASSERT(act->dta_intuple ||
DTRACEACT_ISAGG(act->dta_kind));
/*
* If this action has a record size of zero, it
* denotes an argument to the aggregating action.
* Because the presence of this record doesn't (or
* shouldn't) affect the way the data is interpreted,
* we don't copy it out to spare user level the confusion of dealing
* confusion of dealing with a zero-length record.
*/
if (act->dta_rec.dtrd_size == 0) {
ASSERT(agg->dtag_hasarg);
continue;
}
aggdesc.dtagd_nrecs++;
if (act == &agg->dtag_action)
break;
}
/*
* Now that we have the size, we need to allocate a temporary
* buffer in which to store the complete description. We need
* the temporary buffer to be able to drop dtrace_lock()
* across the copyout(), below.
*/
size = sizeof (dtrace_aggdesc_t) +
(aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
buf = kmem_alloc(size, KM_SLEEP);
dest = (uintptr_t)buf;
bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
for (act = agg->dtag_first; ; act = act->dta_next) {
dtrace_recdesc_t rec = act->dta_rec;
/*
* See the comment in the above loop for why we pass
* over zero-length records.
*/
if (rec.dtrd_size == 0) {
ASSERT(agg->dtag_hasarg);
continue;
}
if (nrecs-- == 0)
break;
rec.dtrd_offset -= offs;
bcopy(&rec, (void *)dest, sizeof (rec));
dest += sizeof (dtrace_recdesc_t);
if (act == &agg->dtag_action)
break;
}
mutex_exit(&dtrace_lock);
if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
kmem_free(buf, size);
return (EFAULT);
}
kmem_free(buf, size);
return (0);
}
case DTRACEIOC_ENABLE: {
dof_hdr_t *dof;
dtrace_enabling_t *enab = NULL;
dtrace_vstate_t *vstate;
int err = 0;
*rv = 0;
/*
* If a NULL argument has been passed, we take this as our
* cue to reevaluate our enablings.
*/
if (arg == NULL) {
dtrace_enabling_matchall();
return (0);
}
if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
return (rval);
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_lock);
vstate = &state->dts_vstate;
if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
dtrace_dof_destroy(dof);
return (EBUSY);
}
if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
dtrace_dof_destroy(dof);
return (EINVAL);
}
if ((rval = dtrace_dof_options(dof, state)) != 0) {
dtrace_enabling_destroy(enab);
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
dtrace_dof_destroy(dof);
return (rval);
}
if ((err = dtrace_enabling_match(enab, rv)) == 0) {
err = dtrace_enabling_retain(enab);
} else {
dtrace_enabling_destroy(enab);
}
mutex_exit(&cpu_lock);
mutex_exit(&dtrace_lock);
dtrace_dof_destroy(dof);
return (err);
}
case DTRACEIOC_REPLICATE: {
dtrace_repldesc_t desc;
dtrace_probedesc_t *match = &desc.dtrpd_match;
dtrace_probedesc_t *create = &desc.dtrpd_create;
int err;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
mutex_enter(&dtrace_lock);
err = dtrace_enabling_replicate(state, match, create);
mutex_exit(&dtrace_lock);
return (err);
}
case DTRACEIOC_PROBEMATCH:
case DTRACEIOC_PROBES: {
dtrace_probe_t *probe = NULL;
dtrace_probedesc_t desc;
dtrace_probekey_t pkey;
dtrace_id_t i;
int m = 0;
uint32_t priv;
uid_t uid;
zoneid_t zoneid;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
/*
* Before we attempt to match this probe, we want to give
* all providers the opportunity to provide it.
*/
if (desc.dtpd_id == DTRACE_IDNONE) {
mutex_enter(&dtrace_provider_lock);
dtrace_probe_provide(&desc, NULL);
mutex_exit(&dtrace_provider_lock);
desc.dtpd_id++;
}
if (cmd == DTRACEIOC_PROBEMATCH) {
dtrace_probekey(&desc, &pkey);
pkey.dtpk_id = DTRACE_IDNONE;
}
dtrace_cred2priv(cr, &priv, &uid, &zoneid);
mutex_enter(&dtrace_lock);
if (cmd == DTRACEIOC_PROBEMATCH) {
for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i - 1]) != NULL &&
(m = dtrace_match_probe(probe, &pkey,
priv, uid, zoneid)) != 0)
break;
}
if (m < 0) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
} else {
for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
if ((probe = dtrace_probes[i - 1]) != NULL &&
dtrace_match_priv(probe, priv, uid, zoneid))
break;
}
}
if (probe == NULL) {
mutex_exit(&dtrace_lock);
return (ESRCH);
}
dtrace_probe_description(probe, &desc);
mutex_exit(&dtrace_lock);
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_PROBEARG: {
dtrace_argdesc_t desc;
dtrace_probe_t *probe;
dtrace_provider_t *prov;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
if (desc.dtargd_id == DTRACE_IDNONE)
return (EINVAL);
if (desc.dtargd_ndx == DTRACE_ARGNONE)
return (EINVAL);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&mod_lock);
mutex_enter(&dtrace_lock);
if (desc.dtargd_id > dtrace_nprobes) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
return (EINVAL);
}
if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
mutex_exit(&dtrace_lock);
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
return (EINVAL);
}
mutex_exit(&dtrace_lock);
prov = probe->dtpr_provider;
if (prov->dtpv_pops.dtps_getargdesc == NULL) {
/*
* There isn't any typed information for this probe.
* Set the argument number to DTRACE_ARGNONE.
*/
desc.dtargd_ndx = DTRACE_ARGNONE;
} else {
desc.dtargd_native[0] = '\0';
desc.dtargd_xlate[0] = '\0';
desc.dtargd_mapping = desc.dtargd_ndx;
prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
probe->dtpr_id, probe->dtpr_arg, &desc);
}
mutex_exit(&mod_lock);
mutex_exit(&dtrace_provider_lock);
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_GO: {
processorid_t cpuid;
rval = dtrace_state_go(state, &cpuid);
if (rval != 0)
return (rval);
if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_STOP: {
processorid_t cpuid;
mutex_enter(&dtrace_lock);
rval = dtrace_state_stop(state, &cpuid);
mutex_exit(&dtrace_lock);
if (rval != 0)
return (rval);
if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_DOFGET: {
dof_hdr_t hdr, *dof;
uint64_t len;
if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
dof = dtrace_dof_create(state);
mutex_exit(&dtrace_lock);
len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
rval = copyout(dof, (void *)arg, len);
dtrace_dof_destroy(dof);
return (rval == 0 ? 0 : EFAULT);
}
case DTRACEIOC_AGGSNAP:
case DTRACEIOC_BUFSNAP: {
dtrace_bufdesc_t desc;
caddr_t cached;
dtrace_buffer_t *buf;
if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
return (EFAULT);
if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
return (EINVAL);
mutex_enter(&dtrace_lock);
if (cmd == DTRACEIOC_BUFSNAP) {
buf = &state->dts_buffer[desc.dtbd_cpu];
} else {
buf = &state->dts_aggbuffer[desc.dtbd_cpu];
}
if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
size_t sz = buf->dtb_offset;
if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
mutex_exit(&dtrace_lock);
return (EBUSY);
}
/*
* If this buffer has already been consumed, we're
* going to indicate that there's nothing left here
* to consume.
*/
if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
mutex_exit(&dtrace_lock);
desc.dtbd_size = 0;
desc.dtbd_drops = 0;
desc.dtbd_errors = 0;
desc.dtbd_oldest = 0;
sz = sizeof (desc);
if (copyout(&desc, (void *)arg, sz) != 0)
return (EFAULT);
return (0);
}
/*
* If this is a ring buffer that has wrapped, we want
* to copy the whole thing out.
*/
if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
dtrace_buffer_polish(buf);
sz = buf->dtb_size;
}
if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
mutex_exit(&dtrace_lock);
return (EFAULT);
}
desc.dtbd_size = sz;
desc.dtbd_drops = buf->dtb_drops;
desc.dtbd_errors = buf->dtb_errors;
desc.dtbd_oldest = buf->dtb_xamot_offset;
mutex_exit(&dtrace_lock);
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
buf->dtb_flags |= DTRACEBUF_CONSUMED;
return (0);
}
if (buf->dtb_tomax == NULL) {
ASSERT(buf->dtb_xamot == NULL);
mutex_exit(&dtrace_lock);
return (ENOENT);
}
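/*
 * Remember which buffer is currently active; if the cross call below
 * really runs dtrace_buffer_switch() on the target CPU, dtb_tomax will
 * change and the snapshot to copy out will then be in dtb_xamot.
 */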
cached = buf->dtb_tomax;
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
dtrace_xcall(desc.dtbd_cpu,
(dtrace_xcall_t)dtrace_buffer_switch, buf);
state->dts_errors += buf->dtb_xamot_errors;
/*
* If the buffers did not actually switch, then the cross call
* did not take place -- presumably because the given CPU is
* not in the ready set. If this is the case, we'll return
* ENOENT.
*/
if (buf->dtb_tomax == cached) {
ASSERT(buf->dtb_xamot != cached);
mutex_exit(&dtrace_lock);
return (ENOENT);
}
ASSERT(cached == buf->dtb_xamot);
/*
* We have our snapshot; now copy it out.
*/
if (copyout(buf->dtb_xamot, desc.dtbd_data,
buf->dtb_xamot_offset) != 0) {
mutex_exit(&dtrace_lock);
return (EFAULT);
}
desc.dtbd_size = buf->dtb_xamot_offset;
desc.dtbd_drops = buf->dtb_xamot_drops;
desc.dtbd_errors = buf->dtb_xamot_errors;
desc.dtbd_oldest = 0;
mutex_exit(&dtrace_lock);
/*
* Finally, copy out the buffer description.
*/
if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_CONF: {
dtrace_conf_t conf;
bzero(&conf, sizeof (conf));
conf.dtc_difversion = DIF_VERSION;
conf.dtc_difintregs = DIF_DIR_NREGS;
conf.dtc_diftupregs = DIF_DTR_NREGS;
conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_STATUS: {
dtrace_status_t stat;
dtrace_dstate_t *dstate;
int i, j;
uint64_t nerrs;
/*
* See the comment in dtrace_state_deadman() for the reason
* for setting dts_laststatus to INT64_MAX before setting
* it to the correct value.
*/
state->dts_laststatus = INT64_MAX;
dtrace_membar_producer();
state->dts_laststatus = dtrace_gethrtime();
bzero(&stat, sizeof (stat));
mutex_enter(&dtrace_lock);
if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
mutex_exit(&dtrace_lock);
return (ENOENT);
}
if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
stat.dtst_exiting = 1;
nerrs = state->dts_errors;
dstate = &state->dts_vstate.dtvs_dynvars;
for (i = 0; i < NCPU; i++) {
dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
stat.dtst_dyndrops += dcpu->dtdsc_drops;
stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
stat.dtst_filled++;
nerrs += state->dts_buffer[i].dtb_errors;
for (j = 0; j < state->dts_nspeculations; j++) {
dtrace_speculation_t *spec;
dtrace_buffer_t *buf;
spec = &state->dts_speculations[j];
buf = &spec->dtsp_buffer[i];
stat.dtst_specdrops += buf->dtb_xamot_drops;
}
}
stat.dtst_specdrops_busy = state->dts_speculations_busy;
stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
stat.dtst_stkstroverflows = state->dts_stkstroverflows;
stat.dtst_dblerrors = state->dts_dblerrors;
stat.dtst_killed =
(state->dts_activity == DTRACE_ACTIVITY_KILLED);
stat.dtst_errors = nerrs;
mutex_exit(&dtrace_lock);
if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
return (EFAULT);
return (0);
}
case DTRACEIOC_FORMAT: {
dtrace_fmtdesc_t fmt;
char *str;
int len;
if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
return (EFAULT);
mutex_enter(&dtrace_lock);
if (fmt.dtfd_format == 0 ||
fmt.dtfd_format > state->dts_nformats) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
/*
* Format strings are allocated contiguously and they are
* never freed; if a format index is less than the number
* of formats, we can assert that the format map is non-NULL
* and that the format for the specified index is non-NULL.
*/
ASSERT(state->dts_formats != NULL);
str = state->dts_formats[fmt.dtfd_format - 1];
ASSERT(str != NULL);
len = strlen(str) + 1;
if (len > fmt.dtfd_length) {
fmt.dtfd_length = len;
if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
} else {
if (copyout(str, fmt.dtfd_string, len) != 0) {
mutex_exit(&dtrace_lock);
return (EINVAL);
}
}
mutex_exit(&dtrace_lock);
return (0);
}
default:
break;
}
return (ENOTTY);
}
/*ARGSUSED*/
static int
dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
dtrace_state_t *state;
switch (cmd) {
case DDI_DETACH:
break;
case DDI_SUSPEND:
return (DDI_SUCCESS);
default:
return (DDI_FAILURE);
}
mutex_enter(&cpu_lock);
mutex_enter(&dtrace_provider_lock);
mutex_enter(&dtrace_lock);
ASSERT(dtrace_opens == 0);
if (dtrace_helpers > 0) {
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
return (DDI_FAILURE);
}
if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
mutex_exit(&dtrace_provider_lock);
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
return (DDI_FAILURE);
}
dtrace_provider = NULL;
if ((state = dtrace_anon_grab()) != NULL) {
/*
* If there were ECBs on this state, the provider should not
* have been allowed to detach; assert that there are none.
*/
ASSERT(state->dts_necbs == 0);
dtrace_state_destroy(state);
/*
* If we're being detached with anonymous state, we need to
* indicate to the kernel debugger that DTrace is now inactive.
*/
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
}
bzero(&dtrace_anon, sizeof (dtrace_anon_t));
unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
dtrace_cpu_init = NULL;
dtrace_helpers_cleanup = NULL;
dtrace_helpers_fork = NULL;
dtrace_cpustart_init = NULL;
dtrace_cpustart_fini = NULL;
dtrace_debugger_init = NULL;
dtrace_debugger_fini = NULL;
dtrace_modload = NULL;
dtrace_modunload = NULL;
mutex_exit(&cpu_lock);
if (dtrace_helptrace_enabled) {
kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
dtrace_helptrace_buffer = NULL;
}
kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
dtrace_probes = NULL;
dtrace_nprobes = 0;
dtrace_hash_destroy(dtrace_bymod);
dtrace_hash_destroy(dtrace_byfunc);
dtrace_hash_destroy(dtrace_byname);
dtrace_bymod = NULL;
dtrace_byfunc = NULL;
dtrace_byname = NULL;
kmem_cache_destroy(dtrace_state_cache);
vmem_destroy(dtrace_minor);
vmem_destroy(dtrace_arena);
if (dtrace_toxrange != NULL) {
kmem_free(dtrace_toxrange,
dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
dtrace_toxrange = NULL;
dtrace_toxranges = 0;
dtrace_toxranges_max = 0;
}
ddi_remove_minor_node(dtrace_devi, NULL);
dtrace_devi = NULL;
ddi_soft_state_fini(&dtrace_softstate);
ASSERT(dtrace_vtime_references == 0);
ASSERT(dtrace_opens == 0);
ASSERT(dtrace_retained == NULL);
mutex_exit(&dtrace_lock);
mutex_exit(&dtrace_provider_lock);
/*
* We don't destroy the task queue until after we have dropped our
* locks (taskq_destroy() may block on running tasks). To prevent
* attempting to do work after we have effectively detached but before
* the task queue has been destroyed, all tasks dispatched via the
* task queue must check that DTrace is still attached before
* performing any operation.
*/
taskq_destroy(dtrace_taskq);
dtrace_taskq = NULL;
return (DDI_SUCCESS);
}
#endif
#if defined(sun)
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
int error;
switch (infocmd) {
case DDI_INFO_DEVT2DEVINFO:
*result = (void *)dtrace_devi;
error = DDI_SUCCESS;
break;
case DDI_INFO_DEVT2INSTANCE:
*result = (void *)0;
error = DDI_SUCCESS;
break;
default:
error = DDI_FAILURE;
}
return (error);
}
#endif
#if defined(sun)
static struct cb_ops dtrace_cb_ops = {
dtrace_open, /* open */
dtrace_close, /* close */
nulldev, /* strategy */
nulldev, /* print */
nodev, /* dump */
nodev, /* read */
nodev, /* write */
dtrace_ioctl, /* ioctl */
nodev, /* devmap */
nodev, /* mmap */
nodev, /* segmap */
nochpoll, /* poll */
ddi_prop_op, /* cb_prop_op */
0, /* streamtab */
D_NEW | D_MP /* Driver compatibility flag */
};
static struct dev_ops dtrace_ops = {
DEVO_REV, /* devo_rev */
0, /* refcnt */
dtrace_info, /* get_dev_info */
nulldev, /* identify */
nulldev, /* probe */
dtrace_attach, /* attach */
dtrace_detach, /* detach */
nodev, /* reset */
&dtrace_cb_ops, /* driver operations */
NULL, /* bus operations */
nodev /* dev power */
};
static struct modldrv modldrv = {
&mod_driverops, /* module type (this is a pseudo driver) */
"Dynamic Tracing", /* name of module */
&dtrace_ops, /* driver ops */
};
static struct modlinkage modlinkage = {
MODREV_1,
(void *)&modldrv,
NULL
};
int
_init(void)
{
return (mod_install(&modlinkage));
}
int
_info(struct modinfo *modinfop)
{
return (mod_info(&modlinkage, modinfop));
}
int
_fini(void)
{
return (mod_remove(&modlinkage));
}
#else
static d_ioctl_t dtrace_ioctl;
static d_ioctl_t dtrace_ioctl_helper;
static void dtrace_load(void *);
static int dtrace_unload(void);
#if __FreeBSD_version < 800039
static void dtrace_clone(void *, struct ucred *, char *, int , struct cdev **);
static struct clonedevs *dtrace_clones; /* Ptr to the array of cloned devices. */
static eventhandler_tag eh_tag; /* Event handler tag. */
#else
static struct cdev *dtrace_dev;
static struct cdev *helper_dev;
#endif
void dtrace_invop_init(void);
void dtrace_invop_uninit(void);
static struct cdevsw dtrace_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE | D_NEEDMINOR,
.d_close = dtrace_close,
.d_ioctl = dtrace_ioctl,
.d_open = dtrace_open,
.d_name = "dtrace",
};
static struct cdevsw helper_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_TRACKCLOSE | D_NEEDMINOR,
.d_ioctl = dtrace_ioctl_helper,
.d_name = "helper",
};
#include <dtrace_anon.c>
#if __FreeBSD_version < 800039
#include <dtrace_clone.c>
#endif
#include <dtrace_ioctl.c>
#include <dtrace_load.c>
#include <dtrace_modevent.c>
#include <dtrace_sysctl.c>
#include <dtrace_unload.c>
#include <dtrace_vtime.c>
#include <dtrace_hacks.c>
#include <dtrace_isa.c>
SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
#endif
Index: head/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c (revision 225616)
+++ head/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c (revision 225617)
@@ -1,1597 +1,1597 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/fasttrap_isa.h>
#include <sys/fasttrap_impl.h>
#include <sys/dtrace.h>
#include <sys/dtrace_impl.h>
#include <sys/cmn_err.h>
#include <sys/frame.h>
#include <sys/stack.h>
#include <sys/sysmacros.h>
#include <sys/trap.h>
#include <v9/sys/machpcb.h>
#include <v9/sys/privregs.h>
/*
* Lossless User-Land Tracing on SPARC
* -----------------------------------
*
* The Basic Idea
*
* The most important design constraint is, of course, correct execution of
* the user thread above all else. The next most important goal is rapid
* execution. We combine execution of instructions in user-land with
* emulation of certain instructions in the kernel to aim for complete
* correctness and maximal performance.
*
* We take advantage of the split PC/NPC architecture to speed up logical
* single-stepping; when we copy an instruction out to the scratch space in
* the ulwp_t structure (held in the %g7 register on SPARC), we can
* effectively single step by setting the PC to our scratch space and leaving
* the NPC alone. This executes the replaced instruction and then continues
* on without having to reenter the kernel as with single-stepping. The
* obvious caveat is for instructions whose execution is PC dependent --
* branches, call and link instructions (call and jmpl), and the rdpc
* instruction. These instructions cannot be executed in the manner described
* so they must be emulated in the kernel.
*
* Emulation for this small set of instructions is fairly simple; the most
* difficult part being emulating branch conditions.
*
*
* A Cache Heavy Portfolio
*
* It's important to note at this time that copying an instruction out to the
* ulwp_t scratch space in user-land is rather complicated. SPARC has
* separate data and instruction caches so any writes to the D$ (using a
* store instruction for example) aren't necessarily reflected in the I$.
* The flush instruction can be used to synchronize the two and must be used
* for any self-modifying code, but the flush instruction only applies to the
* primary address space (the absence of a flusha analogue to the flush
* instruction that accepts an ASI argument is an obvious omission from SPARC
* v9 where the notion of the alternate address space was introduced on
* SPARC). To correctly copy out the instruction we must use a block store
* that doesn't allocate in the D$ and ensures synchronization with the I$;
* see dtrace_blksuword32() for the implementation (this function uses
* ASI_BLK_COMMIT_S to write a block through the secondary ASI in the manner
* described). Refer to the UltraSPARC I/II manual for details on the
* ASI_BLK_COMMIT_S ASI.
*
*
* Return Subtleties
*
* When we're firing a return probe we need to expose the value returned by
* the function being traced. Since the function can set the return value
* in its last instruction, we need to fire the return probe only _after_
* the effects of the instruction are apparent. For instructions that we
* emulate, we can call dtrace_probe() after we've performed the emulation;
* for instructions that we execute after we return to user-land, we set
* %pc to the instruction we copied out (as described above) and set %npc
* to a trap instruction stashed in the ulwp_t structure. After the traced
* instruction is executed, the trap instruction returns control to the
* kernel where we can fire the return probe.
*
* This need for a second trap in cases where we execute the traced
* instruction makes it all the more important to emulate the most common
* instructions to avoid the second trip in and out of the kernel.
*
*
* Making it Fast
*
* Since copying out an instruction is neither simple nor inexpensive for the
* CPU, we should attempt to avoid doing it in as many cases as possible.
* Since function entry and return are usually the most interesting probe
* sites, we attempt to tune the performance of the fasttrap provider around
* instructions typically in those places.
*
* Looking at a bunch of functions in libraries and executables reveals that
* most functions begin with either a save or a sethi (to set up a larger
* argument to the save) and end with a restore or an or (in the case of leaf
* functions). To try to improve performance, we emulate all of these
* instructions in the kernel.
*
* The save and restore instructions are a little tricky since they perform
* register window manipulation. Rather than trying to tinker with the
* register windows from the kernel, we emulate the implicit add that takes
* place as part of those instructions and set the %pc to point to a simple
* save or restore we've hidden in the ulwp_t structure. If we're in a return
* probe and want to make it seem as though the tracepoint has been completely
* executed, we need to remember that we've pulled this trick with restore and
* pull registers from the previous window (the one that we'll switch to once
* the simple store instruction is executed) rather than the current one. This
* is why in the case of emulating a restore we set the DTrace CPU flag
* CPU_DTRACE_FAKERESTORE before calling dtrace_probe() for the return probes
* (see fasttrap_return_common()).
*/
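/*
 * A minimal sketch of the logical single-step described above, assuming
 * "scratch" is the per-thread scratch space reached through %g7 (see the
 * FASTTRAP_T_COMMON case in fasttrap_pid_probe() below):
 *
 *	scratch[0] = <traced instruction>;	copied out with
 *						dtrace_blksuword32()
 *	rp->r_pc = (uintptr_t)scratch;		execute the copy first...
 *	(rp->r_npc is left untouched)		...then flow on to the
 *						original successor
 */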
#define OP(x) ((x) >> 30)
#define OP2(x) (((x) >> 22) & 0x07)
#define OP3(x) (((x) >> 19) & 0x3f)
#define RCOND(x) (((x) >> 25) & 0x07)
#define COND(x) (((x) >> 25) & 0x0f)
#define A(x) (((x) >> 29) & 0x01)
#define I(x) (((x) >> 13) & 0x01)
#define RD(x) (((x) >> 25) & 0x1f)
#define RS1(x) (((x) >> 14) & 0x1f)
#define RS2(x) (((x) >> 0) & 0x1f)
#define CC(x) (((x) >> 20) & 0x03)
#define DISP16(x) ((((x) >> 6) & 0xc000) | ((x) & 0x3fff))
#define DISP22(x) ((x) & 0x3fffff)
#define DISP19(x) ((x) & 0x7ffff)
#define DISP30(x) ((x) & 0x3fffffff)
#define SW_TRAP(x) ((x) & 0x7f)
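/*
 * The macros above pull fields out of a raw 32-bit SPARC instruction
 * word. DISP16() is the only non-obvious one: the 16-bit displacement of
 * the branch-on-register format is split across bits 21:20 (high part)
 * and bits 13:0 (low part), and the macro stitches the two back together.
 */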
#define OP3_OR 0x02
#define OP3_RD 0x28
#define OP3_JMPL 0x38
#define OP3_RETURN 0x39
#define OP3_TCC 0x3a
#define OP3_SAVE 0x3c
#define OP3_RESTORE 0x3d
#define OP3_PREFETCH 0x2d
#define OP3_CASA 0x3c
#define OP3_PREFETCHA 0x3d
#define OP3_CASXA 0x3e
#define OP2_ILLTRAP 0x0
#define OP2_BPcc 0x1
#define OP2_Bicc 0x2
#define OP2_BPr 0x3
#define OP2_SETHI 0x4
#define OP2_FBPfcc 0x5
#define OP2_FBfcc 0x6
#define R_G0 0
#define R_O0 8
#define R_SP 14
#define R_I0 24
#define R_I1 25
#define R_I2 26
#define R_I3 27
#define R_I4 28
/*
* Check the comment in fasttrap.h when changing these offsets or adding
* new instructions.
*/
#define FASTTRAP_OFF_SAVE 64
#define FASTTRAP_OFF_RESTORE 68
#define FASTTRAP_OFF_FTRET 72
#define FASTTRAP_OFF_RETURN 76
#define BREAKPOINT_INSTR 0x91d02001 /* ta 1 */
/*
* Tunable to let users turn off the fancy save instruction optimization.
* If a program is non-ABI compliant, there's a possibility that the save
* instruction optimization could cause an error.
*/
int fasttrap_optimize_save = 1;
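/*
 * Fetch the argno'th argument at a probe site. The first six arguments
 * live in %o0..%o5, which are contiguous in struct regs -- hence the
 * (&rp->r_o0)[argno] indexing below. Later arguments are read from the
 * argument dump area of the caller's frame under NOFAULT protection,
 * since the user stack may not be mapped.
 */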
static uint64_t
fasttrap_anarg(struct regs *rp, int argno)
{
uint64_t value;
if (argno < 6)
return ((&rp->r_o0)[argno]);
if (curproc->p_model == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
value = dtrace_fulword(&fr->fr_argd[argno]);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
CPU_DTRACE_BADALIGN);
} else {
struct frame32 *fr = (struct frame32 *)rp->r_sp;
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
value = dtrace_fuword32(&fr->fr_argd[argno]);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
CPU_DTRACE_BADALIGN);
}
return (value);
}
static ulong_t fasttrap_getreg(struct regs *, uint_t);
static void fasttrap_putreg(struct regs *, uint_t, ulong_t);
static void
fasttrap_usdt_args(fasttrap_probe_t *probe, struct regs *rp,
uint_t fake_restore, int argc, uintptr_t *argv)
{
int i, x, cap = MIN(argc, probe->ftp_nargs);
int inc = (fake_restore ? 16 : 0);
/*
* The only way we'll hit the fake_restore case is if a USDT probe is
* invoked as a tail-call. While it wouldn't be incorrect, we can
* avoid a call to fasttrap_getreg(), and safely use rp->r_sp
* directly since a tail-call can't be made if the invoked function
* would use the argument dump space (i.e. if there were more than
* 6 arguments). We take this shortcut because unconditionally rooting
* around for R_FP (R_SP + 16) would be unnecessarily painful.
*/
if (curproc->p_model == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
uintptr_t v;
for (i = 0; i < cap; i++) {
x = probe->ftp_argmap[i];
if (x < 6)
argv[i] = fasttrap_getreg(rp, R_O0 + x + inc);
else if (fasttrap_fulword(&fr->fr_argd[x], &v) != 0)
argv[i] = 0;
}
} else {
struct frame32 *fr = (struct frame32 *)rp->r_sp;
uint32_t v;
for (i = 0; i < cap; i++) {
x = probe->ftp_argmap[i];
if (x < 6)
argv[i] = fasttrap_getreg(rp, R_O0 + x + inc);
else if (fasttrap_fuword32(&fr->fr_argd[x], &v) != 0)
argv[i] = 0;
}
}
for (; i < argc; i++) {
argv[i] = 0;
}
}
static void
fasttrap_return_common(struct regs *rp, uintptr_t pc, pid_t pid,
uint_t fake_restore)
{
fasttrap_tracepoint_t *tp;
fasttrap_bucket_t *bucket;
fasttrap_id_t *id;
kmutex_t *pid_mtx;
dtrace_icookie_t cookie;
pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
mutex_enter(pid_mtx);
bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
tp->ftt_proc->ftpc_acount != 0)
break;
}
/*
* Don't sweat it if we can't find the tracepoint again; unlike
* when we're in fasttrap_pid_probe(), finding the tracepoint here
* is not essential to the correct execution of the process.
*/
if (tp == NULL || tp->ftt_retids == NULL) {
mutex_exit(pid_mtx);
return;
}
for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
fasttrap_probe_t *probe = id->fti_probe;
if (id->fti_ptype == DTFTP_POST_OFFSETS) {
if (probe->ftp_argmap != NULL && fake_restore) {
uintptr_t t[5];
fasttrap_usdt_args(probe, rp, fake_restore,
sizeof (t) / sizeof (t[0]), t);
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
dtrace_probe(probe->ftp_id, t[0], t[1],
t[2], t[3], t[4]);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
dtrace_interrupt_enable(cookie);
} else if (probe->ftp_argmap != NULL) {
uintptr_t t[5];
fasttrap_usdt_args(probe, rp, fake_restore,
sizeof (t) / sizeof (t[0]), t);
dtrace_probe(probe->ftp_id, t[0], t[1],
t[2], t[3], t[4]);
} else if (fake_restore) {
uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
uintptr_t arg4 = fasttrap_getreg(rp, R_I4);
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
dtrace_probe(probe->ftp_id, arg0, arg1,
arg2, arg3, arg4);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
dtrace_interrupt_enable(cookie);
} else {
dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1,
rp->r_o2, rp->r_o3, rp->r_o4);
}
continue;
}
/*
* If this is only a possible return point, we must
* be looking at a potential tail call in leaf context.
* If the %npc is still within this function, then we
* must have misidentified a jmpl as a tail-call when it
* is, in fact, part of a jump table. It would be nice to
* remove this tracepoint, but this is neither the time
* nor the place.
*/
if ((tp->ftt_flags & FASTTRAP_F_RETMAYBE) &&
rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
continue;
/*
* It's possible for a function to branch to the delay slot
* of an instruction that we've identified as a return site.
* We can detect this spurious return probe activation by
* observing that in this case %npc will be %pc + 4 and %npc
* will be inside the current function (unless the user is
* doing _crazy_ instruction picking in which case there's
* very little we can do). The second check is important
* in case the last instructions of a function make a tail-
* call to the function located immediately after it.
*/
if (rp->r_npc == rp->r_pc + 4 &&
rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
continue;
/*
* The first argument is the offset of return tracepoint
* in the function; the remaining arguments are the return
* values.
*
* If fake_restore is set, we need to pull the return values
* out of the %i's rather than the %o's -- a little trickier.
*/
if (!fake_restore) {
dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
rp->r_o0, rp->r_o1, rp->r_o2, rp->r_o3);
} else {
uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
arg0, arg1, arg2, arg3);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
dtrace_interrupt_enable(cookie);
}
}
mutex_exit(pid_mtx);
}
int
fasttrap_pid_probe(struct regs *rp)
{
proc_t *p = curproc;
fasttrap_tracepoint_t *tp, tp_local;
fasttrap_id_t *id;
pid_t pid;
uintptr_t pc = rp->r_pc;
uintptr_t npc = rp->r_npc;
uintptr_t orig_pc = pc;
fasttrap_bucket_t *bucket;
kmutex_t *pid_mtx;
uint_t fake_restore = 0, is_enabled = 0;
dtrace_icookie_t cookie;
/*
* It's possible that a user (in a veritable orgy of bad planning)
* could redirect this thread's flow of control before it reached the
* return probe fasttrap. In this case we need to kill the process
* since it's in an unrecoverable state.
*/
if (curthread->t_dtrace_step) {
ASSERT(curthread->t_dtrace_on);
fasttrap_sigtrap(p, curthread, pc);
return (0);
}
/*
* Clear all user tracing flags.
*/
curthread->t_dtrace_ft = 0;
curthread->t_dtrace_pc = 0;
curthread->t_dtrace_npc = 0;
curthread->t_dtrace_scrpc = 0;
curthread->t_dtrace_astpc = 0;
/*
* Treat a child created by a call to vfork(2) as if it were its
* parent. We know that there's only one thread of control in such a
* process: this one.
*/
while (p->p_flag & SVFORK) {
p = p->p_parent;
}
pid = p->p_pid;
pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
mutex_enter(pid_mtx);
bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
/*
* Lookup the tracepoint that the process just hit.
*/
for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
tp->ftt_proc->ftpc_acount != 0)
break;
}
/*
* If we couldn't find a matching tracepoint, either a tracepoint has
* been inserted without using the pid<pid> ioctl interface (see
* fasttrap_ioctl), or somehow we have mislaid this tracepoint.
*/
if (tp == NULL) {
mutex_exit(pid_mtx);
return (-1);
}
for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
fasttrap_probe_t *probe = id->fti_probe;
int isentry = (id->fti_ptype == DTFTP_ENTRY);
if (id->fti_ptype == DTFTP_IS_ENABLED) {
is_enabled = 1;
continue;
}
/*
* We note that this was an entry probe to help ustack() find
* the first caller.
*/
if (isentry) {
cookie = dtrace_interrupt_disable();
DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
}
dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1, rp->r_o2,
rp->r_o3, rp->r_o4);
if (isentry) {
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
dtrace_interrupt_enable(cookie);
}
}
/*
* We're about to do a bunch of work so we cache a local copy of
* the tracepoint to emulate the instruction, and then find the
* tracepoint again later if we need to light up any return probes.
*/
tp_local = *tp;
mutex_exit(pid_mtx);
tp = &tp_local;
/*
* If there's an is-enabled probe connected to this tracepoint it
* means that there was a 'mov %g0, %o0' instruction that was placed
* there by DTrace when the binary was linked. As this probe is, in
* fact, enabled, we need to stuff 1 into %o0. Accordingly, we can
* bypass all the instruction emulation logic since we know the
* inevitable result. It's possible that a user could construct a
* scenario where the 'is-enabled' probe was on some other
* instruction, but that would be a rather exotic way to shoot oneself
* in the foot.
*/
if (is_enabled) {
rp->r_o0 = 1;
pc = rp->r_npc;
npc = pc + 4;
goto done;
}
/*
* We emulate certain types of instructions to ensure correctness
* (in the case of position dependent instructions) or optimize
* common cases. The rest we have the thread execute back in user-
* land.
*/
switch (tp->ftt_type) {
case FASTTRAP_T_SAVE:
{
int32_t imm;
/*
* This is an optimization to let us handle function entry
* probes more efficiently. Many functions begin with a save
* instruction that follows the pattern:
* save %sp, <imm>, %sp
*
* Meanwhile, we've stashed the instruction:
* save %g1, %g0, %sp
*
* off of %g7, so all we have to do is stick the right value
* into %g1 and reset %pc to point to the instruction we've
* cleverly hidden (%npc should not be touched).
*/
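/*
 * The shift pair below sign-extends the 13-bit immediate (simm13)
 * field held in bits 12:0 of the instruction word: shifting left by
 * 19 moves the immediate's sign bit into bit 31, and the arithmetic
 * shift right brings the value back down with the sign propagated.
 * The same idiom appears in the other emulation cases below.
 */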
imm = tp->ftt_instr << 19;
imm >>= 19;
rp->r_g1 = rp->r_sp + imm;
pc = rp->r_g7 + FASTTRAP_OFF_SAVE;
break;
}
case FASTTRAP_T_RESTORE:
{
ulong_t value;
uint_t rd;
/*
* This is an optimization to let us handle function
* return probes more efficiently. Most non-leaf functions
* end with the sequence:
* ret
* restore <reg>, <reg_or_imm>, %oX
*
* We've stashed the instruction:
* restore %g0, %g0, %g0
*
* off of %g7 so we just need to place the correct value
* in the right %i register (since after our fake-o
* restore, the %i's will become the %o's) and set the %pc
* to point to our hidden restore. We also set fake_restore to
* let fasttrap_return_common() know that it will find the
* return values in the %i's rather than the %o's.
*/
if (I(tp->ftt_instr)) {
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
} else {
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
fasttrap_getreg(rp, RS2(tp->ftt_instr));
}
/*
* Convert %o's to %i's; leave %g's as they are.
*/
rd = RD(tp->ftt_instr);
fasttrap_putreg(rp, ((rd & 0x18) == 0x8) ? rd + 16 : rd, value);
pc = rp->r_g7 + FASTTRAP_OFF_RESTORE;
fake_restore = 1;
break;
}
case FASTTRAP_T_RETURN:
{
uintptr_t target;
/*
* A return instruction is like a jmpl (without the link
* part) that executes an implicit restore. We've stashed
* the instruction:
* return %o0
*
* off of %g7 so we just need to place the target in %o0
* and set the %pc to point to the stashed return instruction.
* We use %o0 since that register disappears after the return
* executes, erasing any evidence of this tampering.
*/
if (I(tp->ftt_instr)) {
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
} else {
target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
fasttrap_getreg(rp, RS2(tp->ftt_instr));
}
fasttrap_putreg(rp, R_O0, target);
pc = rp->r_g7 + FASTTRAP_OFF_RETURN;
fake_restore = 1;
break;
}
case FASTTRAP_T_OR:
{
ulong_t value;
if (I(tp->ftt_instr)) {
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) | imm;
} else {
value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) |
fasttrap_getreg(rp, RS2(tp->ftt_instr));
}
fasttrap_putreg(rp, RD(tp->ftt_instr), value);
pc = rp->r_npc;
npc = pc + 4;
break;
}
case FASTTRAP_T_SETHI:
if (RD(tp->ftt_instr) != R_G0) {
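/*
 * sethi keeps its 22-bit immediate in the low bits of the
 * instruction word, so shifting the word left by 10 discards the
 * opcode fields and yields exactly imm22 << 10 -- the value the
 * original instruction would have deposited in its destination
 * register (with the low 10 bits clear).
 */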
uint32_t imm32 = tp->ftt_instr << 10;
fasttrap_putreg(rp, RD(tp->ftt_instr), (ulong_t)imm32);
}
pc = rp->r_npc;
npc = pc + 4;
break;
case FASTTRAP_T_CCR:
{
uint_t c, v, z, n, taken;
uint_t ccr = rp->r_tstate >> TSTATE_CCR_SHIFT;
if (tp->ftt_cc != 0)
ccr >>= 4;
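/*
 * The low four bits of ccr now hold the condition codes the branch
 * tests (icc, or xcc after the shift above), laid out n:z:v:c from
 * bit 3 down to bit 0.
 */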
c = (ccr >> 0) & 1;
v = (ccr >> 1) & 1;
z = (ccr >> 2) & 1;
n = (ccr >> 3) & 1;
switch (tp->ftt_code) {
case 0x0: /* BN */
taken = 0; break;
case 0x1: /* BE */
taken = z; break;
case 0x2: /* BLE */
taken = z | (n ^ v); break;
case 0x3: /* BL */
taken = n ^ v; break;
case 0x4: /* BLEU */
taken = c | z; break;
case 0x5: /* BCS (BLU) */
taken = c; break;
case 0x6: /* BNEG */
taken = n; break;
case 0x7: /* BVS */
taken = v; break;
case 0x8: /* BA */
/*
* We handle the BA case differently since the annul
* bit means something slightly different.
*/
panic("fasttrap: mishandled a branch");
taken = 1; break;
case 0x9: /* BNE */
taken = ~z; break;
case 0xa: /* BG */
taken = ~(z | (n ^ v)); break;
case 0xb: /* BGE */
taken = ~(n ^ v); break;
case 0xc: /* BGU */
taken = ~(c | z); break;
case 0xd: /* BCC (BGEU) */
taken = ~c; break;
case 0xe: /* BPOS */
taken = ~n; break;
case 0xf: /* BVC */
taken = ~v; break;
}
if (taken & 1) {
pc = rp->r_npc;
npc = tp->ftt_dest;
} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Untaken annulled branches don't execute the
* instruction in the delay slot.
*/
pc = rp->r_npc + 4;
npc = pc + 4;
} else {
pc = rp->r_npc;
npc = pc + 4;
}
break;
}
case FASTTRAP_T_FCC:
{
uint_t fcc;
uint_t taken;
uint64_t fsr;
dtrace_getfsr(&fsr);
if (tp->ftt_cc == 0) {
fcc = (fsr >> 10) & 0x3;
} else {
uint_t shift;
ASSERT(tp->ftt_cc <= 3);
shift = 30 + tp->ftt_cc * 2;
fcc = (fsr >> shift) & 0x3;
}
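/*
 * fcc encodes the result of the most recent floating-point compare:
 * 0 = equal, 1 = less, 2 = greater, 3 = unordered. Each mask below
 * has one bit per fcc value, so (1 << fcc) & mask is non-zero exactly
 * when the branch should be taken for that result.
 */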
switch (tp->ftt_code) {
case 0x0: /* FBN */
taken = (1 << fcc) & (0|0|0|0); break;
case 0x1: /* FBNE */
taken = (1 << fcc) & (8|4|2|0); break;
case 0x2: /* FBLG */
taken = (1 << fcc) & (0|4|2|0); break;
case 0x3: /* FBUL */
taken = (1 << fcc) & (8|0|2|0); break;
case 0x4: /* FBL */
taken = (1 << fcc) & (0|0|2|0); break;
case 0x5: /* FBUG */
taken = (1 << fcc) & (8|4|0|0); break;
case 0x6: /* FBG */
taken = (1 << fcc) & (0|4|0|0); break;
case 0x7: /* FBU */
taken = (1 << fcc) & (8|0|0|0); break;
case 0x8: /* FBA */
/*
* We handle the FBA case differently since the annul
* bit means something slightly different.
*/
panic("fasttrap: mishandled a branch");
taken = (1 << fcc) & (8|4|2|1); break;
case 0x9: /* FBE */
taken = (1 << fcc) & (0|0|0|1); break;
case 0xa: /* FBUE */
taken = (1 << fcc) & (8|0|0|1); break;
case 0xb: /* FBGE */
taken = (1 << fcc) & (0|4|0|1); break;
case 0xc: /* FBUGE */
taken = (1 << fcc) & (8|4|0|1); break;
case 0xd: /* FBLE */
taken = (1 << fcc) & (0|0|2|1); break;
case 0xe: /* FBULE */
taken = (1 << fcc) & (8|0|2|1); break;
case 0xf: /* FBO */
taken = (1 << fcc) & (0|4|2|1); break;
}
if (taken) {
pc = rp->r_npc;
npc = tp->ftt_dest;
} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Untaken annulled branches don't execute the
* instruction in the delay slot.
*/
pc = rp->r_npc + 4;
npc = pc + 4;
} else {
pc = rp->r_npc;
npc = pc + 4;
}
break;
}
case FASTTRAP_T_REG:
{
int64_t value;
uint_t taken;
uint_t reg = RS1(tp->ftt_instr);
/*
* An ILP32 process shouldn't be using a branch predicated on
* an %i or an %l since it would violate the ABI. It's a
* violation of the ABI because we can't ensure deterministic
* behavior. We should have identified this case when we
* enabled the probe.
*/
ASSERT(p->p_model == DATAMODEL_LP64 || reg < 16);
value = (int64_t)fasttrap_getreg(rp, reg);
switch (tp->ftt_code) {
case 0x1: /* BRZ */
taken = (value == 0); break;
case 0x2: /* BRLEZ */
taken = (value <= 0); break;
case 0x3: /* BRLZ */
taken = (value < 0); break;
case 0x5: /* BRNZ */
taken = (value != 0); break;
case 0x6: /* BRGZ */
taken = (value > 0); break;
case 0x7: /* BRGEZ */
taken = (value >= 0); break;
default:
case 0x0:
case 0x4:
panic("fasttrap: mishandled a branch");
}
if (taken) {
pc = rp->r_npc;
npc = tp->ftt_dest;
} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Untaken annulled branches don't execute the
* instruction in the delay slot.
*/
pc = rp->r_npc + 4;
npc = pc + 4;
} else {
pc = rp->r_npc;
npc = pc + 4;
}
break;
}
case FASTTRAP_T_ALWAYS:
/*
* BAs, BA,As...
*/
if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
/*
* Annulled branch always instructions never execute
* the instruction in the delay slot.
*/
pc = tp->ftt_dest;
npc = tp->ftt_dest + 4;
} else {
pc = rp->r_npc;
npc = tp->ftt_dest;
}
break;
case FASTTRAP_T_RDPC:
fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
pc = rp->r_npc;
npc = pc + 4;
break;
case FASTTRAP_T_CALL:
/*
* It's a call _and_ link, remember...
*/
rp->r_o7 = rp->r_pc;
pc = rp->r_npc;
npc = tp->ftt_dest;
break;
case FASTTRAP_T_JMPL:
pc = rp->r_npc;
if (I(tp->ftt_instr)) {
uint_t rs1 = RS1(tp->ftt_instr);
int32_t imm;
imm = tp->ftt_instr << 19;
imm >>= 19;
npc = fasttrap_getreg(rp, rs1) + imm;
} else {
uint_t rs1 = RS1(tp->ftt_instr);
uint_t rs2 = RS2(tp->ftt_instr);
npc = fasttrap_getreg(rp, rs1) +
fasttrap_getreg(rp, rs2);
}
/*
* Do the link part of the jump-and-link instruction.
*/
fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
break;
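The JMPL case sign-extends the 13-bit simm13 field by shifting it up to bit 31 and arithmetically shifting it back down. A small standalone sketch of the same idiom (it assumes arithmetic right shift of signed values, as the kernel code does):

#include <stdint.h>
#include <stdio.h>

/* Sign-extend the low 13 bits (simm13) of a SPARC instruction word. */
static int32_t
simm13(uint32_t instr)
{
	int32_t imm = (int32_t)(instr << 19);	/* bit 12 -> bit 31 */

	return (imm >> 19);			/* arithmetic shift back */
}

int
main(void)
{
	printf("%d\n", simm13(0x00000fff));	/* 4095 */
	printf("%d\n", simm13(0x00001fff));	/* -1 */
	return (0);
}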
case FASTTRAP_T_COMMON:
{
curthread->t_dtrace_scrpc = rp->r_g7;
curthread->t_dtrace_astpc = rp->r_g7 + FASTTRAP_OFF_FTRET;
/*
* Copy the instruction to a reserved location in the
* user-land thread structure, then set the PC to that
* location and leave the NPC alone. We take pains to ensure
* consistency in the instruction stream (See SPARC
* Architecture Manual Version 9, sections 8.4.7, A.20, and
* H.1.6; UltraSPARC I/II User's Manual, sections 3.1.1.1,
* and 13.6.4) by using the ASI ASI_BLK_COMMIT_S to copy the
* instruction into the user's address space without
* bypassing the I$. There's no AS_USER version of this ASI
* (as exists for other ASIs), so we use the lofault
* mechanism to catch faults.
*/
if (dtrace_blksuword32(rp->r_g7, &tp->ftt_instr, 1) == -1) {
/*
* If the copyout fails, then the process's state
* is not consistent (the effects of the traced
* instruction will never be seen). This process
* cannot be allowed to continue execution.
*/
fasttrap_sigtrap(curproc, curthread, pc);
return (0);
}
curthread->t_dtrace_pc = pc;
curthread->t_dtrace_npc = npc;
curthread->t_dtrace_on = 1;
pc = curthread->t_dtrace_scrpc;
if (tp->ftt_retids != NULL) {
curthread->t_dtrace_step = 1;
curthread->t_dtrace_ret = 1;
npc = curthread->t_dtrace_astpc;
}
break;
}
default:
panic("fasttrap: mishandled an instruction");
}
/*
* This bit me in the ass a couple of times, so let's toss this
* in as a cursory sanity check.
*/
ASSERT(pc != rp->r_g7 + 4);
ASSERT(pc != rp->r_g7 + 8);
done:
/*
* If there were no return probes when we first found the tracepoint,
* we should feel no obligation to honor any return probes that were
* subsequently enabled -- they'll just have to wait until the next
* time around.
*/
if (tp->ftt_retids != NULL) {
/*
* We need to wait until the results of the instruction are
* apparent before invoking any return probes. If this
* instruction was emulated we can just call
* fasttrap_return_common(); if it needs to be executed, we
* need to wait until we return to the kernel.
*/
if (tp->ftt_type != FASTTRAP_T_COMMON) {
fasttrap_return_common(rp, orig_pc, pid, fake_restore);
} else {
ASSERT(curthread->t_dtrace_ret != 0);
ASSERT(curthread->t_dtrace_pc == orig_pc);
ASSERT(curthread->t_dtrace_scrpc == rp->r_g7);
ASSERT(npc == curthread->t_dtrace_astpc);
}
}
ASSERT(pc != 0);
rp->r_pc = pc;
rp->r_npc = npc;
return (0);
}
int
fasttrap_return_probe(struct regs *rp)
{
proc_t *p = ttoproc(curthread);
pid_t pid;
uintptr_t pc = curthread->t_dtrace_pc;
uintptr_t npc = curthread->t_dtrace_npc;
curthread->t_dtrace_pc = 0;
curthread->t_dtrace_npc = 0;
curthread->t_dtrace_scrpc = 0;
curthread->t_dtrace_astpc = 0;
/*
* Treat a child created by a call to vfork(2) as if it were its
* parent. We know there's only one thread of control in such a
* process: this one.
*/
while (p->p_flag & SVFORK) {
p = p->p_parent;
}
/*
* We set the %pc and %npc to their values when the traced
* instruction was initially executed so that it appears to
* dtrace_probe() that we're on the original instruction, and so that
* the user can't easily detect our complex web of lies.
* dtrace_return_probe() (our caller) will correctly set %pc and %npc
* after we return.
*/
rp->r_pc = pc;
rp->r_npc = npc;
pid = p->p_pid;
fasttrap_return_common(rp, pc, pid, 0);
return (0);
}
int
fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
{
fasttrap_instr_t instr = FASTTRAP_INSTR;
if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
return (-1);
return (0);
}
int
fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
{
fasttrap_instr_t instr;
/*
* Distinguish between read or write failures and a changed
* instruction.
*/
if (uread(p, &instr, 4, tp->ftt_pc) != 0)
return (0);
if (instr != FASTTRAP_INSTR && instr != BREAKPOINT_INSTR)
return (0);
if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
return (-1);
return (0);
}
int
fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
fasttrap_probe_type_t type)
{
uint32_t instr;
int32_t disp;
/*
* Read the instruction at the given address out of the process's
* address space. We don't have to worry about a debugger
* changing this instruction before we overwrite it with our trap
* instruction since P_PR_LOCK is set.
*/
if (uread(p, &instr, 4, pc) != 0)
return (-1);
/*
* Decode the instruction to fill in the probe flags. We can have
* the process execute most instructions on its own using a pc/npc
* trick, but pc-relative control transfers present a problem since
* we're relocating the instruction. We emulate these instructions
* in the kernel. We assume a default type and overwrite that as
* needed.
*
* pc-relative instructions must be emulated for correctness;
* other instructions (which represent a large set of commonly traced
* instructions) are emulated or otherwise optimized for performance.
*/
tp->ftt_type = FASTTRAP_T_COMMON;
if (OP(instr) == 1) {
/*
* Call instructions.
*/
tp->ftt_type = FASTTRAP_T_CALL;
disp = DISP30(instr) << 2;
tp->ftt_dest = pc + (intptr_t)disp;
} else if (OP(instr) == 0) {
/*
* Branch instructions.
*
* Unconditional branches need careful attention when they're
* annulled: annulled unconditional branches never execute
* the instruction in the delay slot.
*/
switch (OP2(instr)) {
case OP2_ILLTRAP:
case 0x7:
/*
* The compiler may place an illtrap after a call to
* a function that returns a structure. In the case of
* a returned structure, the compiler places an illtrap
* whose const22 field is the size of the returned
* structure immediately following the delay slot of
* the call. To stay out of the way, we refuse to
* place tracepoints on top of illtrap instructions.
*
* This is one of the dumbest architectural decisions
* I've ever had to work around.
*
* We also identify the only illegal op2 value (See
* SPARC Architecture Manual Version 9, E.2 table 31).
*/
return (-1);
case OP2_BPcc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
/*
* Check for an illegal instruction.
*/
if (CC(instr) & 1)
return (-1);
tp->ftt_type = FASTTRAP_T_CCR;
tp->ftt_cc = CC(instr);
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP19(instr);
disp <<= 13;
disp >>= 11;
tp->ftt_dest = pc + (intptr_t)disp;
break;
case OP2_Bicc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
tp->ftt_type = FASTTRAP_T_CCR;
tp->ftt_cc = 0;
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP22(instr);
disp <<= 10;
disp >>= 8;
tp->ftt_dest = pc + (intptr_t)disp;
break;
case OP2_BPr:
/*
* Check for an illegal instruction.
*/
if ((RCOND(instr) & 3) == 0)
return (-1);
/*
* It's a violation of the v8plus ABI to use a
* register-predicated branch in a 32-bit app if
* the register used is an %l or an %i (%gs and %os
* are legit because they're not saved to the stack
* in 32-bit words when we take a trap).
*/
if (p->p_model == DATAMODEL_ILP32 && RS1(instr) >= 16)
return (-1);
tp->ftt_type = FASTTRAP_T_REG;
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP16(instr);
disp <<= 16;
disp >>= 14;
tp->ftt_dest = pc + (intptr_t)disp;
tp->ftt_code = RCOND(instr);
break;
case OP2_SETHI:
tp->ftt_type = FASTTRAP_T_SETHI;
break;
case OP2_FBPfcc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
tp->ftt_type = FASTTRAP_T_FCC;
tp->ftt_cc = CC(instr);
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP19(instr);
disp <<= 13;
disp >>= 11;
tp->ftt_dest = pc + (intptr_t)disp;
break;
case OP2_FBfcc:
if (COND(instr) == 8) {
tp->ftt_type = FASTTRAP_T_ALWAYS;
} else {
tp->ftt_type = FASTTRAP_T_FCC;
tp->ftt_cc = 0;
tp->ftt_code = COND(instr);
}
if (A(instr) != 0)
tp->ftt_flags |= FASTTRAP_F_ANNUL;
disp = DISP22(instr);
disp <<= 10;
disp >>= 8;
tp->ftt_dest = pc + (intptr_t)disp;
break;
}
} else if (OP(instr) == 2) {
switch (OP3(instr)) {
case OP3_RETURN:
tp->ftt_type = FASTTRAP_T_RETURN;
break;
case OP3_JMPL:
tp->ftt_type = FASTTRAP_T_JMPL;
break;
case OP3_RD:
if (RS1(instr) == 5)
tp->ftt_type = FASTTRAP_T_RDPC;
break;
case OP3_SAVE:
/*
* We optimize for save instructions at function
* entry; see the comment in fasttrap_pid_probe()
* (near FASTTRAP_T_SAVE) for details.
*/
if (fasttrap_optimize_save != 0 &&
type == DTFTP_ENTRY &&
I(instr) == 1 && RD(instr) == R_SP)
tp->ftt_type = FASTTRAP_T_SAVE;
break;
case OP3_RESTORE:
/*
* We optimize restore instructions at function
* return; see the comment in fasttrap_pid_probe()
* (near FASTTRAP_T_RESTORE) for details.
*
* rd must be an %o or %g register.
*/
if ((RD(instr) & 0x10) == 0)
tp->ftt_type = FASTTRAP_T_RESTORE;
break;
case OP3_OR:
/*
* A large proportion of instructions in the delay
* slot of retl instructions are or's, so we emulate
* these downstairs as an optimization.
*/
tp->ftt_type = FASTTRAP_T_OR;
break;
case OP3_TCC:
/*
* Breakpoint instructions are effectively position-
* dependent since the debugger uses the %pc value
* to look up which breakpoint was executed. As a
* result, we can't actually instrument breakpoints.
*/
if (SW_TRAP(instr) == ST_BREAKPOINT)
return (-1);
break;
case 0x19:
case 0x1d:
case 0x29:
case 0x33:
case 0x3f:
/*
* Identify illegal instructions (See SPARC
* Architecture Manual Version 9, E.2 table 32).
*/
return (-1);
}
} else if (OP(instr) == 3) {
uint32_t op3 = OP3(instr);
/*
* Identify illegal instructions (See SPARC Architecture
* Manual Version 9, E.2 table 33).
*/
if ((op3 & 0x28) == 0x28) {
if (op3 != OP3_PREFETCH && op3 != OP3_CASA &&
op3 != OP3_PREFETCHA && op3 != OP3_CASXA)
return (-1);
} else {
if ((op3 & 0x0f) == 0x0c || (op3 & 0x3b) == 0x31)
return (-1);
}
}
tp->ftt_instr = instr;
/*
* We don't know how this tracepoint is going to be used, but in case
* it's used as part of a function return probe, we need to indicate
* whether it's always a return site or only potentially a return
* site. If it's part of a return probe, it's always going to be a
* return from that function if it's a restore instruction or if
* the previous instruction was a return. If we could reliably
* distinguish jump tables from return sites, this wouldn't be
* necessary.
*/
if (tp->ftt_type != FASTTRAP_T_RESTORE &&
(uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
!(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
return (0);
}
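The Bicc/BPcc/BPr decoders above use a similar shift pair (for example "disp <<= 13; disp >>= 11;") that sign-extends the word displacement and multiplies it by 4 in one step, producing a byte offset from the instruction word offset. A hedged standalone sketch of that arithmetic:

#include <stdint.h>
#include <stdio.h>

/*
 * Sign-extend an N-bit word displacement and convert it to a byte
 * offset (x4), the net effect of the "disp <<= 13; disp >>= 11;" pairs.
 */
static int32_t
branch_offset(uint32_t disp, int bits)
{
	int32_t v = (int32_t)(disp << (32 - bits));	/* sign bit -> bit 31 */

	return (v >> (32 - bits - 2));			/* sign-extend, keep x4 */
}

int
main(void)
{
	printf("%d\n", branch_offset(0x7ffff, 19));	/* -1 word -> -4 bytes */
	printf("%d\n", branch_offset(3, 22));		/* +3 words -> +12 bytes */
	return (0);
}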
/*ARGSUSED*/
uint64_t
fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
int aframes)
{
return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
}
/*ARGSUSED*/
uint64_t
fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
int aframes)
{
return (fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno));
}
static uint64_t fasttrap_getreg_fast_cnt;
static uint64_t fasttrap_getreg_mpcb_cnt;
static uint64_t fasttrap_getreg_slow_cnt;
static ulong_t
fasttrap_getreg(struct regs *rp, uint_t reg)
{
ulong_t value;
dtrace_icookie_t cookie;
struct machpcb *mpcb;
extern ulong_t dtrace_getreg_win(uint_t, uint_t);
/*
* We have the %os and %gs in our struct regs, but if we need to
* snag a %l or %i we need to go scrounging around in the process's
* address space.
*/
if (reg == 0)
return (0);
if (reg < 16)
return ((&rp->r_g1)[reg - 1]);
/*
* Before we look at the user's stack, we'll check the register
* windows to see if the information we want is in there.
*/
cookie = dtrace_interrupt_disable();
if (dtrace_getotherwin() > 0) {
value = dtrace_getreg_win(reg, 1);
dtrace_interrupt_enable(cookie);
atomic_add_64(&fasttrap_getreg_fast_cnt, 1);
return (value);
}
dtrace_interrupt_enable(cookie);
/*
* First check the machpcb structure to see if we've already read
* in the register window we're looking for; if we haven't (and
* we probably haven't), try to copy in the value of the register.
*/
/* LINTED - alignment */
mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
if (get_udatamodel() == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
if (mpcb->mpcb_wbcnt > 0) {
struct rwindow *rwin = (void *)mpcb->mpcb_wbuf;
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
return (rwin[i].rw_local[reg - 16]);
} while (i > 0);
}
if (fasttrap_fulword(&fr->fr_local[reg - 16], &value) != 0)
goto err;
} else {
struct frame32 *fr =
(struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
uint32_t *v32 = (uint32_t *)&value;
if (mpcb->mpcb_wbcnt > 0) {
struct rwindow32 *rwin = (void *)mpcb->mpcb_wbuf;
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
return (rwin[i].rw_local[reg - 16]);
} while (i > 0);
}
if (fasttrap_fuword32(&fr->fr_local[reg - 16], &v32[1]) != 0)
goto err;
v32[0] = 0;
}
atomic_add_64(&fasttrap_getreg_slow_cnt, 1);
return (value);
err:
/*
* If the copy in failed, the process will be in an irrecoverable
* state, and we have no choice but to kill it.
*/
- psignal(ttoproc(curthread), SIGILL);
+ kern_psignal(ttoproc(curthread), SIGILL);
return (0);
}
static uint64_t fasttrap_putreg_fast_cnt;
static uint64_t fasttrap_putreg_mpcb_cnt;
static uint64_t fasttrap_putreg_slow_cnt;
static void
fasttrap_putreg(struct regs *rp, uint_t reg, ulong_t value)
{
dtrace_icookie_t cookie;
struct machpcb *mpcb;
extern void dtrace_putreg_win(uint_t, ulong_t);
if (reg == 0)
return;
if (reg < 16) {
(&rp->r_g1)[reg - 1] = value;
return;
}
/*
* If the user process is still using some register windows, we
* can just place the value in the correct window.
*/
cookie = dtrace_interrupt_disable();
if (dtrace_getotherwin() > 0) {
dtrace_putreg_win(reg, value);
dtrace_interrupt_enable(cookie);
atomic_add_64(&fasttrap_putreg_fast_cnt, 1);
return;
}
dtrace_interrupt_enable(cookie);
/*
* First see if there's a copy of the register window in the
* machpcb structure that we can modify; if there isn't, try to
* copy out the value. If that fails, we try to create a new
* register window in the machpcb structure. While this isn't
* _precisely_ the intended use of the machpcb structure, it
* can't cause any problems since we know at this point in the
* code that all of the user's data have been flushed out of the
* register file (since %otherwin is 0).
*/
/* LINTED - alignment */
mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
if (get_udatamodel() == DATAMODEL_NATIVE) {
struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
/* LINTED - alignment */
struct rwindow *rwin = (struct rwindow *)mpcb->mpcb_wbuf;
if (mpcb->mpcb_wbcnt > 0) {
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
rwin[i].rw_local[reg - 16] = value;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
} while (i > 0);
}
if (fasttrap_sulword(&fr->fr_local[reg - 16], value) != 0) {
if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
&rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
goto err;
rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = value;
mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
mpcb->mpcb_wbcnt++;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
}
} else {
struct frame32 *fr =
(struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
/* LINTED - alignment */
struct rwindow32 *rwin = (struct rwindow32 *)mpcb->mpcb_wbuf;
uint32_t v32 = (uint32_t)value;
if (mpcb->mpcb_wbcnt > 0) {
int i = mpcb->mpcb_wbcnt;
do {
i--;
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
rwin[i].rw_local[reg - 16] = v32;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
} while (i > 0);
}
if (fasttrap_suword32(&fr->fr_local[reg - 16], v32) != 0) {
if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
&rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
goto err;
rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = v32;
mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
mpcb->mpcb_wbcnt++;
atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
return;
}
}
atomic_add_64(&fasttrap_putreg_slow_cnt, 1);
return;
err:
/*
* If we couldn't record this register's value, the process is in an
* irrecoverable state and we have no choice but to euthanize it.
*/
- psignal(ttoproc(curthread), SIGILL);
+ kern_psignal(ttoproc(curthread), SIGILL);
}
Index: head/sys/compat/freebsd32/freebsd32_ioctl.c
===================================================================
--- head/sys/compat/freebsd32/freebsd32_ioctl.c (revision 225616)
+++ head/sys/compat/freebsd32/freebsd32_ioctl.c (revision 225617)
@@ -1,404 +1,404 @@
/*-
* Copyright (c) 2008 David E. O'Brien
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the author nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/cdio.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/file.h>
#include <sys/ioccom.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/memrange.h>
#include <sys/pciio.h>
#include <sys/proc.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ioctl.h>
#include <compat/freebsd32/freebsd32_proto.h>
/* Cannot get the exact size on 64-bit due to the alignment of the entire struct. */
CTASSERT((sizeof(struct md_ioctl32)+4) == 436);
CTASSERT(sizeof(struct ioc_read_toc_entry32) == 8);
CTASSERT(sizeof(struct ioc_toc_header32) == 4);
CTASSERT(sizeof(struct mem_range_op32) == 12);
CTASSERT(sizeof(struct pci_conf_io32) == 36);
CTASSERT(sizeof(struct pci_match_conf32) == 44);
CTASSERT(sizeof(struct pci_conf32) == 44);
static int
freebsd32_ioctl_md(struct thread *td, struct freebsd32_ioctl_args *uap,
struct file *fp)
{
struct md_ioctl mdv;
struct md_ioctl32 md32;
u_long com = 0;
int i, error;
if (uap->com & IOC_IN) {
if ((error = copyin(uap->data, &md32, sizeof(md32)))) {
return (error);
}
CP(md32, mdv, md_version);
CP(md32, mdv, md_unit);
CP(md32, mdv, md_type);
PTRIN_CP(md32, mdv, md_file);
CP(md32, mdv, md_mediasize);
CP(md32, mdv, md_sectorsize);
CP(md32, mdv, md_options);
CP(md32, mdv, md_base);
CP(md32, mdv, md_fwheads);
CP(md32, mdv, md_fwsectors);
} else if (uap->com & IOC_OUT) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
bzero(&mdv, sizeof mdv);
}
switch (uap->com) {
case MDIOCATTACH_32:
com = MDIOCATTACH;
break;
case MDIOCDETACH_32:
com = MDIOCDETACH;
break;
case MDIOCQUERY_32:
com = MDIOCQUERY;
break;
case MDIOCLIST_32:
com = MDIOCLIST;
break;
default:
panic("%s: unknown MDIOC %#x", __func__, uap->com);
}
error = fo_ioctl(fp, com, (caddr_t)&mdv, td->td_ucred, td);
if (error == 0 && (com & IOC_OUT)) {
CP(mdv, md32, md_version);
CP(mdv, md32, md_unit);
CP(mdv, md32, md_type);
PTROUT_CP(mdv, md32, md_file);
CP(mdv, md32, md_mediasize);
CP(mdv, md32, md_sectorsize);
CP(mdv, md32, md_options);
CP(mdv, md32, md_base);
CP(mdv, md32, md_fwheads);
CP(mdv, md32, md_fwsectors);
if (com == MDIOCLIST) {
/*
* Use MDNPAD, and not MDNPAD32. Padding is
* allocated and used by the compat32 ABI.
*/
for (i = 0; i < MDNPAD; i++)
CP(mdv, md32, md_pad[i]);
}
error = copyout(&md32, uap->data, sizeof(md32));
}
return error;
}
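freebsd32_ioctl_md() is the usual compat32 thunk: copy in the packed 32-bit layout, widen it field by field (pointers via PTRIN) into the native structure, run the native handler, and narrow the result back out. A self-contained sketch of the widening step, using illustrative structures and a FIELD_CP macro rather than the kernel's CP/PTRIN definitions:

#include <stdint.h>
#include <stdio.h>

/* Illustrative 32-bit and native layouts; not the kernel's md_ioctl. */
struct md_req32 {
	uint32_t	unit;
	uint32_t	file;		/* 32-bit user pointer */
	uint32_t	mediasize;
};

struct md_req {
	int		unit;
	char		*file;
	uint64_t	mediasize;
};

#define	FIELD_CP(src, dst, fld)	do { (dst).fld = (src).fld; } while (0)
#define	PTR_IN(v)		((void *)(uintptr_t)(v))

int
main(void)
{
	struct md_req32 r32 = { 3, 0x1000, 512 };
	struct md_req r;

	FIELD_CP(r32, r, unit);
	r.file = PTR_IN(r32.file);	/* widen the 32-bit user pointer */
	FIELD_CP(r32, r, mediasize);

	printf("unit=%d file=%p mediasize=%llu\n",
	    r.unit, (void *)r.file, (unsigned long long)r.mediasize);
	return (0);
}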
static int
freebsd32_ioctl_ioc_toc_header(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct ioc_toc_header toch;
struct ioc_toc_header32 toch32;
int error;
if ((error = copyin(uap->data, &toch32, sizeof(toch32))))
return (error);
CP(toch32, toch, len);
CP(toch32, toch, starting_track);
CP(toch32, toch, ending_track);
error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&toch,
td->td_ucred, td);
return (error);
}
static int
freebsd32_ioctl_ioc_read_toc(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct ioc_read_toc_entry toce;
struct ioc_read_toc_entry32 toce32;
int error;
if ((error = copyin(uap->data, &toce32, sizeof(toce32))))
return (error);
CP(toce32, toce, address_format);
CP(toce32, toce, starting_track);
CP(toce32, toce, data_len);
PTRIN_CP(toce32, toce, data);
if ((error = fo_ioctl(fp, CDIOREADTOCENTRYS, (caddr_t)&toce,
td->td_ucred, td))) {
CP(toce, toce32, address_format);
CP(toce, toce32, starting_track);
CP(toce, toce32, data_len);
PTROUT_CP(toce, toce32, data);
error = copyout(&toce32, uap->data, sizeof(toce32));
}
return error;
}
static int
freebsd32_ioctl_fiodgname(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct fiodgname_arg fgn;
struct fiodgname_arg32 fgn32;
int error;
if ((error = copyin(uap->data, &fgn32, sizeof fgn32)) != 0)
return (error);
CP(fgn32, fgn, len);
PTRIN_CP(fgn32, fgn, buf);
error = fo_ioctl(fp, FIODGNAME, (caddr_t)&fgn, td->td_ucred, td);
return (error);
}
static int
freebsd32_ioctl_memrange(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct mem_range_op mro;
struct mem_range_op32 mro32;
int error;
u_long com;
if ((error = copyin(uap->data, &mro32, sizeof(mro32))) != 0)
return (error);
PTRIN_CP(mro32, mro, mo_desc);
CP(mro32, mro, mo_arg[0]);
CP(mro32, mro, mo_arg[1]);
com = 0;
switch (uap->com) {
case MEMRANGE_GET32:
com = MEMRANGE_GET;
break;
case MEMRANGE_SET32:
com = MEMRANGE_SET;
break;
default:
panic("%s: unknown MEMRANGE %#x", __func__, uap->com);
}
if ((error = fo_ioctl(fp, com, (caddr_t)&mro, td->td_ucred, td)) != 0)
return (error);
if ((com & IOC_OUT) != 0) {
CP(mro, mro32, mo_arg[0]);
CP(mro, mro32, mo_arg[1]);
error = copyout(&mro32, uap->data, sizeof(mro32));
}
return (error);
}
static int
freebsd32_ioctl_pciocgetconf(struct thread *td,
struct freebsd32_ioctl_args *uap, struct file *fp)
{
struct pci_conf_io pci;
struct pci_conf_io32 pci32;
struct pci_match_conf32 pmc32;
struct pci_match_conf32 *pmc32p;
struct pci_match_conf pmc;
struct pci_match_conf *pmcp;
struct pci_conf32 pc32;
struct pci_conf32 *pc32p;
struct pci_conf pc;
struct pci_conf *pcp;
u_int32_t i;
u_int32_t npat_to_convert;
u_int32_t nmatch_to_convert;
vm_offset_t addr;
int error;
if ((error = copyin(uap->data, &pci32, sizeof(pci32))) != 0)
return (error);
CP(pci32, pci, num_patterns);
CP(pci32, pci, offset);
CP(pci32, pci, generation);
npat_to_convert = pci32.pat_buf_len / sizeof(struct pci_match_conf32);
pci.pat_buf_len = npat_to_convert * sizeof(struct pci_match_conf);
pci.patterns = NULL;
nmatch_to_convert = pci32.match_buf_len / sizeof(struct pci_conf32);
pci.match_buf_len = nmatch_to_convert * sizeof(struct pci_conf);
pci.matches = NULL;
if ((error = copyout_map(td, &addr, pci.pat_buf_len)) != 0)
goto cleanup;
pci.patterns = (struct pci_match_conf *)addr;
if ((error = copyout_map(td, &addr, pci.match_buf_len)) != 0)
goto cleanup;
pci.matches = (struct pci_conf *)addr;
npat_to_convert = min(npat_to_convert, pci.num_patterns);
for (i = 0, pmc32p = (struct pci_match_conf32 *)PTRIN(pci32.patterns),
pmcp = pci.patterns;
i < npat_to_convert; i++, pmc32p++, pmcp++) {
if ((error = copyin(pmc32p, &pmc32, sizeof(pmc32))) != 0)
goto cleanup;
CP(pmc32,pmc,pc_sel);
strlcpy(pmc.pd_name, pmc32.pd_name, sizeof(pmc.pd_name));
CP(pmc32,pmc,pd_unit);
CP(pmc32,pmc,pc_vendor);
CP(pmc32,pmc,pc_device);
CP(pmc32,pmc,pc_class);
CP(pmc32,pmc,flags);
if ((error = copyout(&pmc, pmcp, sizeof(pmc))) != 0)
goto cleanup;
}
if ((error = fo_ioctl(fp, PCIOCGETCONF, (caddr_t)&pci,
td->td_ucred, td)) != 0)
goto cleanup;
nmatch_to_convert = min(nmatch_to_convert, pci.num_matches);
for (i = 0, pcp = pci.matches,
pc32p = (struct pci_conf32 *)PTRIN(pci32.matches);
i < nmatch_to_convert; i++, pcp++, pc32p++) {
if ((error = copyin(pcp, &pc, sizeof(pc))) != 0)
goto cleanup;
CP(pc,pc32,pc_sel);
CP(pc,pc32,pc_hdr);
CP(pc,pc32,pc_subvendor);
CP(pc,pc32,pc_subdevice);
CP(pc,pc32,pc_vendor);
CP(pc,pc32,pc_device);
CP(pc,pc32,pc_class);
CP(pc,pc32,pc_subclass);
CP(pc,pc32,pc_progif);
CP(pc,pc32,pc_revid);
strlcpy(pc32.pd_name, pc.pd_name, sizeof(pc32.pd_name));
CP(pc,pc32,pd_unit);
if ((error = copyout(&pc32, pc32p, sizeof(pc32))) != 0)
goto cleanup;
}
CP(pci, pci32, num_matches);
CP(pci, pci32, offset);
CP(pci, pci32, generation);
CP(pci, pci32, status);
error = copyout(&pci32, uap->data, sizeof(pci32));
cleanup:
if (pci.patterns)
copyout_unmap(td, (vm_offset_t)pci.patterns, pci.pat_buf_len);
if (pci.matches)
copyout_unmap(td, (vm_offset_t)pci.matches, pci.match_buf_len);
return (error);
}
int
freebsd32_ioctl(struct thread *td, struct freebsd32_ioctl_args *uap)
{
struct ioctl_args ap /*{
int fd;
u_long com;
caddr_t data;
}*/ ;
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_IOCTL, &fp)) != 0)
return (error);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
switch (uap->com) {
case MDIOCATTACH_32: /* FALLTHROUGH */
case MDIOCDETACH_32: /* FALLTHROUGH */
case MDIOCQUERY_32: /* FALLTHROUGH */
case MDIOCLIST_32:
error = freebsd32_ioctl_md(td, uap, fp);
break;
case CDIOREADTOCENTRYS_32:
error = freebsd32_ioctl_ioc_read_toc(td, uap, fp);
break;
case CDIOREADTOCHEADER_32:
error = freebsd32_ioctl_ioc_toc_header(td, uap, fp);
break;
case FIODGNAME_32:
error = freebsd32_ioctl_fiodgname(td, uap, fp);
break;
case MEMRANGE_GET32: /* FALLTHROUGH */
case MEMRANGE_SET32:
error = freebsd32_ioctl_memrange(td, uap, fp);
break;
case PCIOCGETCONF_32:
error = freebsd32_ioctl_pciocgetconf(td, uap, fp);
break;
default:
fdrop(fp, td);
ap.fd = uap->fd;
ap.com = uap->com;
PTRIN_CP(*uap, ap, data);
- return ioctl(td, &ap);
+ return sys_ioctl(td, &ap);
}
fdrop(fp, td);
return error;
}
Index: head/sys/compat/freebsd32/freebsd32_misc.c
===================================================================
--- head/sys/compat/freebsd32/freebsd32_misc.c (revision 225616)
+++ head/sys/compat/freebsd32/freebsd32_misc.c (revision 225617)
@@ -1,2817 +1,2817 @@
/*-
* Copyright (c) 2002 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#define __ELF_WORD_SIZE 32
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/clock.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/imgact.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/imgact.h>
#include <sys/mbuf.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/eventvar.h> /* Must come after sys/selinfo.h */
#include <sys/pipe.h> /* Must come after sys/selinfo.h */
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/thr.h>
#include <sys/unistd.h>
#include <sys/ucontext.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/sem.h>
#include <sys/shm.h>
#ifdef INET
#include <netinet/in.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <machine/cpu.h>
#include <machine/elf.h>
#include <security/audit/audit.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_proto.h>
CTASSERT(sizeof(struct timeval32) == 8);
CTASSERT(sizeof(struct timespec32) == 8);
CTASSERT(sizeof(struct itimerval32) == 16);
CTASSERT(sizeof(struct statfs32) == 256);
CTASSERT(sizeof(struct rusage32) == 72);
CTASSERT(sizeof(struct sigaltstack32) == 12);
CTASSERT(sizeof(struct kevent32) == 20);
CTASSERT(sizeof(struct iovec32) == 8);
CTASSERT(sizeof(struct msghdr32) == 28);
CTASSERT(sizeof(struct stat32) == 96);
CTASSERT(sizeof(struct sigaction32) == 24);
static int freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count);
static int freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count);
#if BYTE_ORDER == BIG_ENDIAN
#define PAIR32TO64(type, name) ((name ## 2) | ((type)(name ## 1) << 32))
#define RETVAL_HI 0
#define RETVAL_LO 1
#else
#define PAIR32TO64(type, name) ((name ## 1) | ((type)(name ## 2) << 32))
#define RETVAL_HI 1
#define RETVAL_LO 0
#endif
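A 32-bit process passes a 64-bit off_t as two 32-bit syscall arguments, and PAIR32TO64 above reassembles them, with the operand order chosen by byte order. A small sketch of the little-endian case (name1 carries the low half, as in the #else branch):

#include <stdint.h>
#include <stdio.h>

/* Little-endian reassembly: low word first, as in the #else branch. */
static int64_t
pair32to64_le(uint32_t lo, uint32_t hi)
{
	return ((int64_t)((uint64_t)lo | ((uint64_t)hi << 32)));
}

int
main(void)
{
	int64_t off = pair32to64_le(0x00000004u, 0x00000001u);

	printf("0x%llx\n", (unsigned long long)off);	/* 0x100000004 */
	return (0);
}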
void
freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32)
{
TV_CP(*s, *s32, ru_utime);
TV_CP(*s, *s32, ru_stime);
CP(*s, *s32, ru_maxrss);
CP(*s, *s32, ru_ixrss);
CP(*s, *s32, ru_idrss);
CP(*s, *s32, ru_isrss);
CP(*s, *s32, ru_minflt);
CP(*s, *s32, ru_majflt);
CP(*s, *s32, ru_nswap);
CP(*s, *s32, ru_inblock);
CP(*s, *s32, ru_oublock);
CP(*s, *s32, ru_msgsnd);
CP(*s, *s32, ru_msgrcv);
CP(*s, *s32, ru_nsignals);
CP(*s, *s32, ru_nvcsw);
CP(*s, *s32, ru_nivcsw);
}
int
freebsd32_wait4(struct thread *td, struct freebsd32_wait4_args *uap)
{
int error, status;
struct rusage32 ru32;
struct rusage ru, *rup;
if (uap->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = kern_wait(td, uap->pid, &status, uap->options, rup);
if (error)
return (error);
if (uap->status != NULL)
error = copyout(&status, uap->status, sizeof(status));
if (uap->rusage != NULL && error == 0) {
freebsd32_rusage_out(&ru, &ru32);
error = copyout(&ru32, uap->rusage, sizeof(ru32));
}
return (error);
}
#ifdef COMPAT_FREEBSD4
static void
copy_statfs(struct statfs *in, struct statfs32 *out)
{
statfs_scale_blocks(in, INT32_MAX);
bzero(out, sizeof(*out));
CP(*in, *out, f_bsize);
out->f_iosize = MIN(in->f_iosize, INT32_MAX);
CP(*in, *out, f_blocks);
CP(*in, *out, f_bfree);
CP(*in, *out, f_bavail);
out->f_files = MIN(in->f_files, INT32_MAX);
out->f_ffree = MIN(in->f_ffree, INT32_MAX);
CP(*in, *out, f_fsid);
CP(*in, *out, f_owner);
CP(*in, *out, f_type);
CP(*in, *out, f_flags);
out->f_syncwrites = MIN(in->f_syncwrites, INT32_MAX);
out->f_asyncwrites = MIN(in->f_asyncwrites, INT32_MAX);
strlcpy(out->f_fstypename,
in->f_fstypename, MFSNAMELEN);
strlcpy(out->f_mntonname,
in->f_mntonname, min(MNAMELEN, FREEBSD4_MNAMELEN));
out->f_syncreads = MIN(in->f_syncreads, INT32_MAX);
out->f_asyncreads = MIN(in->f_asyncreads, INT32_MAX);
strlcpy(out->f_mntfromname,
in->f_mntfromname, min(MNAMELEN, FREEBSD4_MNAMELEN));
}
#endif
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_getfsstat(struct thread *td, struct freebsd4_freebsd32_getfsstat_args *uap)
{
struct statfs *buf, *sp;
struct statfs32 stat32;
size_t count, size;
int error;
count = uap->bufsize / sizeof(struct statfs32);
size = count * sizeof(struct statfs);
error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
if (size > 0) {
count = td->td_retval[0];
sp = buf;
while (count > 0 && error == 0) {
copy_statfs(sp, &stat32);
error = copyout(&stat32, uap->buf, sizeof(stat32));
sp++;
uap->buf++;
count--;
}
free(buf, M_TEMP);
}
return (error);
}
#endif
int
freebsd32_sigaltstack(struct thread *td,
struct freebsd32_sigaltstack_args *uap)
{
struct sigaltstack32 s32;
struct sigaltstack ss, oss, *ssp;
int error;
if (uap->ss != NULL) {
error = copyin(uap->ss, &s32, sizeof(s32));
if (error)
return (error);
PTRIN_CP(s32, ss, ss_sp);
CP(s32, ss, ss_size);
CP(s32, ss, ss_flags);
ssp = &ss;
} else
ssp = NULL;
error = kern_sigaltstack(td, ssp, &oss);
if (error == 0 && uap->oss != NULL) {
PTROUT_CP(oss, s32, ss_sp);
CP(oss, s32, ss_size);
CP(oss, s32, ss_flags);
error = copyout(&s32, uap->oss, sizeof(s32));
}
return (error);
}
/*
* Custom version of exec_copyin_args() so that we can translate
* the pointers.
*/
int
freebsd32_exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, u_int32_t *argv, u_int32_t *envv)
{
char *argp, *envp;
u_int32_t *p32, arg;
size_t length;
int error;
bzero(args, sizeof(*args));
if (argv == NULL)
return (EFAULT);
/*
* Allocate demand-paged memory for the file name, argument, and
* environment strings.
*/
error = exec_alloc_args(args);
if (error != 0)
return (error);
/*
* Copy the file name.
*/
if (fname != NULL) {
args->fname = args->buf;
error = (segflg == UIO_SYSSPACE) ?
copystr(fname, args->fname, PATH_MAX, &length) :
copyinstr(fname, args->fname, PATH_MAX, &length);
if (error != 0)
goto err_exit;
} else
length = 0;
args->begin_argv = args->buf + length;
args->endp = args->begin_argv;
args->stringspace = ARG_MAX;
/*
* extract arguments first
*/
p32 = argv;
for (;;) {
error = copyin(p32++, &arg, sizeof(arg));
if (error)
goto err_exit;
if (arg == 0)
break;
argp = PTRIN(arg);
error = copyinstr(argp, args->endp, args->stringspace, &length);
if (error) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->argc++;
}
args->begin_envv = args->endp;
/*
* extract environment strings
*/
if (envv) {
p32 = envv;
for (;;) {
error = copyin(p32++, &arg, sizeof(arg));
if (error)
goto err_exit;
if (arg == 0)
break;
envp = PTRIN(arg);
error = copyinstr(envp, args->endp, args->stringspace,
&length);
if (error) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->envc++;
}
}
return (0);
err_exit:
exec_free_args(args);
return (error);
}
int
freebsd32_execve(struct thread *td, struct freebsd32_execve_args *uap)
{
struct image_args eargs;
int error;
error = freebsd32_exec_copyin_args(&eargs, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
freebsd32_fexecve(struct thread *td, struct freebsd32_fexecve_args *uap)
{
struct image_args eargs;
int error;
error = freebsd32_exec_copyin_args(&eargs, NULL, UIO_SYSSPACE,
uap->argv, uap->envv);
if (error == 0) {
eargs.fd = uap->fd;
error = kern_execve(td, &eargs, NULL);
}
return (error);
}
#ifdef __ia64__
static int
freebsd32_mmap_partial(struct thread *td, vm_offset_t start, vm_offset_t end,
int prot, int fd, off_t pos)
{
vm_map_t map;
vm_map_entry_t entry;
int rv;
map = &td->td_proc->p_vmspace->vm_map;
if (fd != -1)
prot |= VM_PROT_WRITE;
if (vm_map_lookup_entry(map, start, &entry)) {
if ((entry->protection & prot) != prot) {
rv = vm_map_protect(map,
trunc_page(start),
round_page(end),
entry->protection | prot,
FALSE);
if (rv != KERN_SUCCESS)
return (EINVAL);
}
} else {
vm_offset_t addr = trunc_page(start);
rv = vm_map_find(map, 0, 0,
&addr, PAGE_SIZE, FALSE, prot,
VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS)
return (EINVAL);
}
if (fd != -1) {
struct pread_args r;
r.fd = fd;
r.buf = (void *) start;
r.nbyte = end - start;
r.offset = pos;
- return (pread(td, &r));
+ return (sys_pread(td, &r));
} else {
while (start < end) {
subyte((void *) start, 0);
start++;
}
return (0);
}
}
#endif
int
freebsd32_mmap(struct thread *td, struct freebsd32_mmap_args *uap)
{
struct mmap_args ap;
vm_offset_t addr = (vm_offset_t) uap->addr;
vm_size_t len = uap->len;
int prot = uap->prot;
int flags = uap->flags;
int fd = uap->fd;
off_t pos = PAIR32TO64(off_t,uap->pos);
#ifdef __ia64__
vm_size_t pageoff;
int error;
/*
* Attempt to handle page size hassles.
*/
pageoff = (pos & PAGE_MASK);
if (flags & MAP_FIXED) {
vm_offset_t start, end;
start = addr;
end = addr + len;
if (start != trunc_page(start)) {
error = freebsd32_mmap_partial(td, start,
round_page(start), prot,
fd, pos);
if (fd != -1)
pos += round_page(start) - start;
start = round_page(start);
}
if (end != round_page(end)) {
vm_offset_t t = trunc_page(end);
error = freebsd32_mmap_partial(td, t, end,
prot, fd,
pos + t - start);
end = trunc_page(end);
}
if (end > start && fd != -1 && (pos & PAGE_MASK)) {
/*
* We can't map this region at all. The specified
* address doesn't have the same alignment as the file
* position. Fake the mapping by simply reading the
* entire region into memory. First we need to make
* sure the region exists.
*/
vm_map_t map;
struct pread_args r;
int rv;
prot |= VM_PROT_WRITE;
map = &td->td_proc->p_vmspace->vm_map;
rv = vm_map_remove(map, start, end);
if (rv != KERN_SUCCESS)
return (EINVAL);
rv = vm_map_find(map, 0, 0,
&start, end - start, FALSE,
prot, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS)
return (EINVAL);
r.fd = fd;
r.buf = (void *) start;
r.nbyte = end - start;
r.offset = pos;
- error = pread(td, &r);
+ error = sys_pread(td, &r);
if (error)
return (error);
td->td_retval[0] = addr;
return (0);
}
if (end == start) {
/*
* After dealing with the ragged ends, there
* might be none left.
*/
td->td_retval[0] = addr;
return (0);
}
addr = start;
len = end - start;
}
#endif
ap.addr = (void *) addr;
ap.len = len;
ap.prot = prot;
ap.flags = flags;
ap.fd = fd;
ap.pos = pos;
- return (mmap(td, &ap));
+ return (sys_mmap(td, &ap));
}
#ifdef COMPAT_FREEBSD6
int
freebsd6_freebsd32_mmap(struct thread *td, struct freebsd6_freebsd32_mmap_args *uap)
{
struct freebsd32_mmap_args ap;
ap.addr = uap->addr;
ap.len = uap->len;
ap.prot = uap->prot;
ap.flags = uap->flags;
ap.fd = uap->fd;
ap.pos1 = uap->pos1;
ap.pos2 = uap->pos2;
return (freebsd32_mmap(td, &ap));
}
#endif
int
freebsd32_setitimer(struct thread *td, struct freebsd32_setitimer_args *uap)
{
struct itimerval itv, oitv, *itvp;
struct itimerval32 i32;
int error;
if (uap->itv != NULL) {
error = copyin(uap->itv, &i32, sizeof(i32));
if (error)
return (error);
TV_CP(i32, itv, it_interval);
TV_CP(i32, itv, it_value);
itvp = &itv;
} else
itvp = NULL;
error = kern_setitimer(td, uap->which, itvp, &oitv);
if (error || uap->oitv == NULL)
return (error);
TV_CP(oitv, i32, it_interval);
TV_CP(oitv, i32, it_value);
return (copyout(&i32, uap->oitv, sizeof(i32)));
}
int
freebsd32_getitimer(struct thread *td, struct freebsd32_getitimer_args *uap)
{
struct itimerval itv;
struct itimerval32 i32;
int error;
error = kern_getitimer(td, uap->which, &itv);
if (error || uap->itv == NULL)
return (error);
TV_CP(itv, i32, it_interval);
TV_CP(itv, i32, it_value);
return (copyout(&i32, uap->itv, sizeof(i32)));
}
int
freebsd32_select(struct thread *td, struct freebsd32_select_args *uap)
{
struct timeval32 tv32;
struct timeval tv, *tvp;
int error;
if (uap->tv != NULL) {
error = copyin(uap->tv, &tv32, sizeof(tv32));
if (error)
return (error);
CP(tv32, tv, tv_sec);
CP(tv32, tv, tv_usec);
tvp = &tv;
} else
tvp = NULL;
/*
* XXX Do pointers need PTRIN()?
*/
return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
sizeof(int32_t) * 8));
}
int
freebsd32_pselect(struct thread *td, struct freebsd32_pselect_args *uap)
{
struct timespec32 ts32;
struct timespec ts;
struct timeval tv, *tvp;
sigset_t set, *uset;
int error;
if (uap->ts != NULL) {
error = copyin(uap->ts, &ts32, sizeof(ts32));
if (error != 0)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
tvp = &tv;
} else
tvp = NULL;
if (uap->sm != NULL) {
error = copyin(uap->sm, &set, sizeof(set));
if (error != 0)
return (error);
uset = &set;
} else
uset = NULL;
/*
* XXX Do pointers need PTRIN()?
*/
error = kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
uset, sizeof(int32_t) * 8);
return (error);
}
/*
* Copy 'count' items into the destination list pointed to by uap->eventlist.
*/
static int
freebsd32_kevent_copyout(void *arg, struct kevent *kevp, int count)
{
struct freebsd32_kevent_args *uap;
struct kevent32 ks32[KQ_NEVENTS];
int i, error = 0;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct freebsd32_kevent_args *)arg;
for (i = 0; i < count; i++) {
CP(kevp[i], ks32[i], ident);
CP(kevp[i], ks32[i], filter);
CP(kevp[i], ks32[i], flags);
CP(kevp[i], ks32[i], fflags);
CP(kevp[i], ks32[i], data);
PTROUT_CP(kevp[i], ks32[i], udata);
}
error = copyout(ks32, uap->eventlist, count * sizeof *ks32);
if (error == 0)
uap->eventlist += count;
return (error);
}
/*
* Copy 'count' items from the list pointed to by uap->changelist.
*/
static int
freebsd32_kevent_copyin(void *arg, struct kevent *kevp, int count)
{
struct freebsd32_kevent_args *uap;
struct kevent32 ks32[KQ_NEVENTS];
int i, error = 0;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct freebsd32_kevent_args *)arg;
error = copyin(uap->changelist, ks32, count * sizeof *ks32);
if (error)
goto done;
uap->changelist += count;
for (i = 0; i < count; i++) {
CP(ks32[i], kevp[i], ident);
CP(ks32[i], kevp[i], filter);
CP(ks32[i], kevp[i], flags);
CP(ks32[i], kevp[i], fflags);
CP(ks32[i], kevp[i], data);
PTRIN_CP(ks32[i], kevp[i], udata);
}
done:
return (error);
}
int
freebsd32_kevent(struct thread *td, struct freebsd32_kevent_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
struct kevent_copyops k_ops = { uap,
freebsd32_kevent_copyout,
freebsd32_kevent_copyin};
int error;
if (uap->timeout) {
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
&k_ops, tsp);
return (error);
}
int
freebsd32_gettimeofday(struct thread *td,
struct freebsd32_gettimeofday_args *uap)
{
struct timeval atv;
struct timeval32 atv32;
struct timezone rtz;
int error = 0;
if (uap->tp) {
microtime(&atv);
CP(atv, atv32, tv_sec);
CP(atv, atv32, tv_usec);
error = copyout(&atv32, uap->tp, sizeof (atv32));
}
if (error == 0 && uap->tzp != NULL) {
rtz.tz_minuteswest = tz_minuteswest;
rtz.tz_dsttime = tz_dsttime;
error = copyout(&rtz, uap->tzp, sizeof (rtz));
}
return (error);
}
int
freebsd32_getrusage(struct thread *td, struct freebsd32_getrusage_args *uap)
{
struct rusage32 s32;
struct rusage s;
int error;
error = kern_getrusage(td, uap->who, &s);
if (error)
return (error);
if (uap->rusage != NULL) {
freebsd32_rusage_out(&s, &s32);
error = copyout(&s32, uap->rusage, sizeof(s32));
}
return (error);
}
static int
freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
{
struct iovec32 iov32;
struct iovec *iov;
struct uio *uio;
u_int iovlen;
int error, i;
*uiop = NULL;
if (iovcnt > UIO_MAXIOV)
return (EINVAL);
iovlen = iovcnt * sizeof(struct iovec);
uio = malloc(iovlen + sizeof *uio, M_IOV, M_WAITOK);
iov = (struct iovec *)(uio + 1);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp[i], &iov32, sizeof(struct iovec32));
if (error) {
free(uio, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
uio->uio_iov = iov;
uio->uio_iovcnt = iovcnt;
uio->uio_segflg = UIO_USERSPACE;
uio->uio_offset = -1;
uio->uio_resid = 0;
for (i = 0; i < iovcnt; i++) {
if (iov->iov_len > INT_MAX - uio->uio_resid) {
free(uio, M_IOV);
return (EINVAL);
}
uio->uio_resid += iov->iov_len;
iov++;
}
*uiop = uio;
return (0);
}
int
freebsd32_readv(struct thread *td, struct freebsd32_readv_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_readv(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
freebsd32_writev(struct thread *td, struct freebsd32_writev_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_writev(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
freebsd32_preadv(struct thread *td, struct freebsd32_preadv_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_preadv(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
free(auio, M_IOV);
return (error);
}
int
freebsd32_pwritev(struct thread *td, struct freebsd32_pwritev_args *uap)
{
struct uio *auio;
int error;
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_pwritev(td, uap->fd, auio, PAIR32TO64(off_t,uap->offset));
free(auio, M_IOV);
return (error);
}
int
freebsd32_copyiniov(struct iovec32 *iovp32, u_int iovcnt, struct iovec **iovp,
int error)
{
struct iovec32 iov32;
struct iovec *iov;
u_int iovlen;
int i;
*iovp = NULL;
if (iovcnt > UIO_MAXIOV)
return (error);
iovlen = iovcnt * sizeof(struct iovec);
iov = malloc(iovlen, M_IOV, M_WAITOK);
for (i = 0; i < iovcnt; i++) {
error = copyin(&iovp32[i], &iov32, sizeof(struct iovec32));
if (error) {
free(iov, M_IOV);
return (error);
}
iov[i].iov_base = PTRIN(iov32.iov_base);
iov[i].iov_len = iov32.iov_len;
}
*iovp = iov;
return (0);
}
static int
freebsd32_copyinmsghdr(struct msghdr32 *msg32, struct msghdr *msg)
{
struct msghdr32 m32;
int error;
error = copyin(msg32, &m32, sizeof(m32));
if (error)
return (error);
msg->msg_name = PTRIN(m32.msg_name);
msg->msg_namelen = m32.msg_namelen;
msg->msg_iov = PTRIN(m32.msg_iov);
msg->msg_iovlen = m32.msg_iovlen;
msg->msg_control = PTRIN(m32.msg_control);
msg->msg_controllen = m32.msg_controllen;
msg->msg_flags = m32.msg_flags;
return (0);
}
static int
freebsd32_copyoutmsghdr(struct msghdr *msg, struct msghdr32 *msg32)
{
struct msghdr32 m32;
int error;
m32.msg_name = PTROUT(msg->msg_name);
m32.msg_namelen = msg->msg_namelen;
m32.msg_iov = PTROUT(msg->msg_iov);
m32.msg_iovlen = msg->msg_iovlen;
m32.msg_control = PTROUT(msg->msg_control);
m32.msg_controllen = msg->msg_controllen;
m32.msg_flags = msg->msg_flags;
error = copyout(&m32, msg32, sizeof(m32));
return (error);
}
#define FREEBSD32_ALIGNBYTES (sizeof(int) - 1)
#define FREEBSD32_ALIGN(p) \
(((u_long)(p) + FREEBSD32_ALIGNBYTES) & ~FREEBSD32_ALIGNBYTES)
#define FREEBSD32_CMSG_SPACE(l) \
(FREEBSD32_ALIGN(sizeof(struct cmsghdr)) + FREEBSD32_ALIGN(l))
#define FREEBSD32_CMSG_DATA(cmsg) ((unsigned char *)(cmsg) + \
FREEBSD32_ALIGN(sizeof(struct cmsghdr)))
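These macros exist because the 32-bit ABI aligns control-message data to sizeof(int), so cmsg lengths differ from the native layout and incoming and outgoing messages have to be repacked. A standalone copy of the arithmetic, assuming a 12-byte cmsghdr purely for illustration:

#include <stdio.h>

#define	ALIGN32BYTES	(sizeof(int) - 1)
#define	ALIGN32(l)	(((unsigned long)(l) + ALIGN32BYTES) & ~ALIGN32BYTES)
#define	CMSG32_SPACE(l)	(ALIGN32(12) + ALIGN32(l))	/* 12: assumed cmsghdr size */

int
main(void)
{
	/* Space needed for a single int of control data (e.g. a passed fd). */
	printf("%lu\n", (unsigned long)CMSG32_SPACE(sizeof(int)));	/* 16 */
	return (0);
}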
static int
freebsd32_copy_msg_out(struct msghdr *msg, struct mbuf *control)
{
struct cmsghdr *cm;
void *data;
socklen_t clen, datalen;
int error;
caddr_t ctlbuf;
int len, maxlen, copylen;
struct mbuf *m;
error = 0;
len = msg->msg_controllen;
maxlen = msg->msg_controllen;
msg->msg_controllen = 0;
m = control;
ctlbuf = msg->msg_control;
while (m && len > 0) {
cm = mtod(m, struct cmsghdr *);
clen = m->m_len;
while (cm != NULL) {
if (sizeof(struct cmsghdr) > clen ||
cm->cmsg_len > clen) {
error = EINVAL;
break;
}
data = CMSG_DATA(cm);
datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
/* Adjust message length */
cm->cmsg_len = FREEBSD32_ALIGN(sizeof(struct cmsghdr)) +
datalen;
/* Copy cmsghdr */
copylen = sizeof(struct cmsghdr);
if (len < copylen) {
msg->msg_flags |= MSG_CTRUNC;
copylen = len;
}
error = copyout(cm,ctlbuf,copylen);
if (error)
goto exit;
ctlbuf += FREEBSD32_ALIGN(copylen);
len -= FREEBSD32_ALIGN(copylen);
if (len <= 0)
break;
/* Copy data */
copylen = datalen;
if (len < copylen) {
msg->msg_flags |= MSG_CTRUNC;
copylen = len;
}
error = copyout(data,ctlbuf,copylen);
if (error)
goto exit;
ctlbuf += FREEBSD32_ALIGN(copylen);
len -= FREEBSD32_ALIGN(copylen);
if (CMSG_SPACE(datalen) < clen) {
clen -= CMSG_SPACE(datalen);
cm = (struct cmsghdr *)
((caddr_t)cm + CMSG_SPACE(datalen));
} else {
clen = 0;
cm = NULL;
}
}
m = m->m_next;
}
msg->msg_controllen = (len <= 0) ? maxlen : ctlbuf - (caddr_t)msg->msg_control;
exit:
return (error);
}
int
freebsd32_recvmsg(td, uap)
struct thread *td;
struct freebsd32_recvmsg_args /* {
int s;
struct msghdr32 *msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct msghdr32 m32;
struct iovec *uiov, *iov;
struct mbuf *control = NULL;
struct mbuf **controlp;
int error;
error = copyin(uap->msg, &m32, sizeof(m32));
if (error)
return (error);
error = freebsd32_copyinmsghdr(uap->msg, &msg);
if (error)
return (error);
error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
EMSGSIZE);
if (error)
return (error);
msg.msg_flags = uap->flags;
uiov = msg.msg_iov;
msg.msg_iov = iov;
controlp = (msg.msg_control != NULL) ? &control : NULL;
error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, controlp);
if (error == 0) {
msg.msg_iov = uiov;
if (control != NULL)
error = freebsd32_copy_msg_out(&msg, control);
else
msg.msg_controllen = 0;
if (error == 0)
error = freebsd32_copyoutmsghdr(&msg, uap->msg);
}
free(iov, M_IOV);
if (control != NULL)
m_freem(control);
return (error);
}
static int
freebsd32_convert_msg_in(struct mbuf **controlp)
{
struct mbuf *control = *controlp;
struct cmsghdr *cm = mtod(control, struct cmsghdr *);
void *data;
socklen_t clen = control->m_len, datalen;
int error;
error = 0;
*controlp = NULL;
while (cm != NULL) {
if (sizeof(struct cmsghdr) > clen || cm->cmsg_len > clen) {
error = EINVAL;
break;
}
data = FREEBSD32_CMSG_DATA(cm);
datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
*controlp = sbcreatecontrol(data, datalen, cm->cmsg_type,
cm->cmsg_level);
controlp = &(*controlp)->m_next;
if (FREEBSD32_CMSG_SPACE(datalen) < clen) {
clen -= FREEBSD32_CMSG_SPACE(datalen);
cm = (struct cmsghdr *)
((caddr_t)cm + FREEBSD32_CMSG_SPACE(datalen));
} else {
clen = 0;
cm = NULL;
}
}
m_freem(control);
return (error);
}
int
freebsd32_sendmsg(struct thread *td,
struct freebsd32_sendmsg_args *uap)
{
struct msghdr msg;
struct msghdr32 m32;
struct iovec *iov;
struct mbuf *control = NULL;
struct sockaddr *to = NULL;
int error;
error = copyin(uap->msg, &m32, sizeof(m32));
if (error)
return (error);
error = freebsd32_copyinmsghdr(uap->msg, &msg);
if (error)
return (error);
error = freebsd32_copyiniov(PTRIN(m32.msg_iov), m32.msg_iovlen, &iov,
EMSGSIZE);
if (error)
return (error);
msg.msg_iov = iov;
if (msg.msg_name != NULL) {
error = getsockaddr(&to, msg.msg_name, msg.msg_namelen);
if (error) {
to = NULL;
goto out;
}
msg.msg_name = to;
}
if (msg.msg_control) {
if (msg.msg_controllen < sizeof(struct cmsghdr)) {
error = EINVAL;
goto out;
}
error = sockargs(&control, msg.msg_control,
msg.msg_controllen, MT_CONTROL);
if (error)
goto out;
error = freebsd32_convert_msg_in(&control);
if (error)
goto out;
}
error = kern_sendit(td, uap->s, &msg, uap->flags, control,
UIO_USERSPACE);
out:
free(iov, M_IOV);
if (to)
free(to, M_SONAME);
return (error);
}
int
freebsd32_recvfrom(struct thread *td,
struct freebsd32_recvfrom_args *uap)
{
struct msghdr msg;
struct iovec aiov;
int error;
if (uap->fromlenaddr) {
error = copyin(PTRIN(uap->fromlenaddr), &msg.msg_namelen,
sizeof(msg.msg_namelen));
if (error)
return (error);
} else {
msg.msg_namelen = 0;
}
msg.msg_name = PTRIN(uap->from);
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = PTRIN(uap->buf);
aiov.iov_len = uap->len;
msg.msg_control = NULL;
msg.msg_flags = uap->flags;
error = kern_recvit(td, uap->s, &msg, UIO_USERSPACE, NULL);
if (error == 0 && uap->fromlenaddr)
error = copyout(&msg.msg_namelen, PTRIN(uap->fromlenaddr),
sizeof (msg.msg_namelen));
return (error);
}
int
freebsd32_settimeofday(struct thread *td,
struct freebsd32_settimeofday_args *uap)
{
struct timeval32 tv32;
struct timeval tv, *tvp;
struct timezone tz, *tzp;
int error;
if (uap->tv) {
error = copyin(uap->tv, &tv32, sizeof(tv32));
if (error)
return (error);
CP(tv32, tv, tv_sec);
CP(tv32, tv, tv_usec);
tvp = &tv;
} else
tvp = NULL;
if (uap->tzp) {
error = copyin(uap->tzp, &tz, sizeof(tz));
if (error)
return (error);
tzp = &tz;
} else
tzp = NULL;
return (kern_settimeofday(td, tvp, tzp));
}
int
freebsd32_utimes(struct thread *td, struct freebsd32_utimes_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->tptr != NULL) {
error = copyin(uap->tptr, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_utimes(td, uap->path, UIO_USERSPACE, sp, UIO_SYSSPACE));
}
int
freebsd32_lutimes(struct thread *td, struct freebsd32_lutimes_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->tptr != NULL) {
error = copyin(uap->tptr, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_lutimes(td, uap->path, UIO_USERSPACE, sp, UIO_SYSSPACE));
}
int
freebsd32_futimes(struct thread *td, struct freebsd32_futimes_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->tptr != NULL) {
error = copyin(uap->tptr, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_futimes(td, uap->fd, sp, UIO_SYSSPACE));
}
int
freebsd32_futimesat(struct thread *td, struct freebsd32_futimesat_args *uap)
{
struct timeval32 s32[2];
struct timeval s[2], *sp;
int error;
if (uap->times != NULL) {
error = copyin(uap->times, s32, sizeof(s32));
if (error)
return (error);
CP(s32[0], s[0], tv_sec);
CP(s32[0], s[0], tv_usec);
CP(s32[1], s[1], tv_sec);
CP(s32[1], s[1], tv_usec);
sp = s;
} else
sp = NULL;
return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
sp, UIO_SYSSPACE));
}
int
freebsd32_adjtime(struct thread *td, struct freebsd32_adjtime_args *uap)
{
struct timeval32 tv32;
struct timeval delta, olddelta, *deltap;
int error;
if (uap->delta) {
error = copyin(uap->delta, &tv32, sizeof(tv32));
if (error)
return (error);
CP(tv32, delta, tv_sec);
CP(tv32, delta, tv_usec);
deltap = &delta;
} else
deltap = NULL;
error = kern_adjtime(td, deltap, &olddelta);
if (uap->olddelta && error == 0) {
CP(olddelta, tv32, tv_sec);
CP(olddelta, tv32, tv_usec);
error = copyout(&tv32, uap->olddelta, sizeof(tv32));
}
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_statfs(struct thread *td, struct freebsd4_freebsd32_statfs_args *uap)
{
struct statfs32 s32;
struct statfs s;
int error;
error = kern_statfs(td, uap->path, UIO_USERSPACE, &s);
if (error)
return (error);
copy_statfs(&s, &s32);
return (copyout(&s32, uap->buf, sizeof(s32)));
}
#endif
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_fstatfs(struct thread *td, struct freebsd4_freebsd32_fstatfs_args *uap)
{
struct statfs32 s32;
struct statfs s;
int error;
error = kern_fstatfs(td, uap->fd, &s);
if (error)
return (error);
copy_statfs(&s, &s32);
return (copyout(&s32, uap->buf, sizeof(s32)));
}
#endif
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_fhstatfs(struct thread *td, struct freebsd4_freebsd32_fhstatfs_args *uap)
{
struct statfs32 s32;
struct statfs s;
fhandle_t fh;
int error;
if ((error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t))) != 0)
return (error);
error = kern_fhstatfs(td, fh, &s);
if (error)
return (error);
copy_statfs(&s, &s32);
return (copyout(&s32, uap->buf, sizeof(s32)));
}
#endif
int
freebsd32_pread(struct thread *td, struct freebsd32_pread_args *uap)
{
struct pread_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pread(td, &ap));
+ return (sys_pread(td, &ap));
}
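/*
 * Illustrative sketch, not part of this change: PAIR32TO64() is assumed
 * to reassemble a 64-bit argument from the two 32-bit halves that the
 * 32-bit syscall ABI passes; on a little-endian target that is roughly
 * the made-up helper below.
 */
#if 0
static __inline off_t
example_pair32to64(uint32_t lo, uint32_t hi)
{

	return ((off_t)lo | ((off_t)hi << 32));
}
#endif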
int
freebsd32_pwrite(struct thread *td, struct freebsd32_pwrite_args *uap)
{
struct pwrite_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pwrite(td, &ap));
+ return (sys_pwrite(td, &ap));
}
#ifdef COMPAT_43
int
ofreebsd32_lseek(struct thread *td, struct ofreebsd32_lseek_args *uap)
{
struct lseek_args nuap;
nuap.fd = uap->fd;
nuap.offset = uap->offset;
nuap.whence = uap->whence;
- return (lseek(td, &nuap));
+ return (sys_lseek(td, &nuap));
}
#endif
int
freebsd32_lseek(struct thread *td, struct freebsd32_lseek_args *uap)
{
int error;
struct lseek_args ap;
off_t pos;
ap.fd = uap->fd;
ap.offset = PAIR32TO64(off_t,uap->offset);
ap.whence = uap->whence;
- error = lseek(td, &ap);
+ error = sys_lseek(td, &ap);
/* Expand the quad return into two parts for eax and edx */
pos = *(off_t *)(td->td_retval);
td->td_retval[RETVAL_LO] = pos & 0xffffffff; /* %eax */
td->td_retval[RETVAL_HI] = pos >> 32; /* %edx */
return error;
}
int
freebsd32_truncate(struct thread *td, struct freebsd32_truncate_args *uap)
{
struct truncate_args ap;
ap.path = uap->path;
ap.length = PAIR32TO64(off_t,uap->length);
- return (truncate(td, &ap));
+ return (sys_truncate(td, &ap));
}
int
freebsd32_ftruncate(struct thread *td, struct freebsd32_ftruncate_args *uap)
{
struct ftruncate_args ap;
ap.fd = uap->fd;
ap.length = PAIR32TO64(off_t,uap->length);
- return (ftruncate(td, &ap));
+ return (sys_ftruncate(td, &ap));
}
#ifdef COMPAT_43
int
ofreebsd32_getdirentries(struct thread *td,
struct ofreebsd32_getdirentries_args *uap)
{
struct ogetdirentries_args ap;
int error;
long loff;
int32_t loff_cut;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.count = uap->count;
ap.basep = NULL;
error = kern_ogetdirentries(td, &ap, &loff);
if (error == 0) {
loff_cut = loff;
error = copyout(&loff_cut, uap->basep, sizeof(int32_t));
}
return (error);
}
#endif
int
freebsd32_getdirentries(struct thread *td,
struct freebsd32_getdirentries_args *uap)
{
long base;
int32_t base32;
int error;
error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
if (error)
return (error);
if (uap->basep != NULL) {
base32 = base;
error = copyout(&base32, uap->basep, sizeof(int32_t));
}
return (error);
}
#ifdef COMPAT_FREEBSD6
/* versions with the 'int pad' argument */
int
freebsd6_freebsd32_pread(struct thread *td, struct freebsd6_freebsd32_pread_args *uap)
{
struct pread_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pread(td, &ap));
+ return (sys_pread(td, &ap));
}
int
freebsd6_freebsd32_pwrite(struct thread *td, struct freebsd6_freebsd32_pwrite_args *uap)
{
struct pwrite_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.nbyte = uap->nbyte;
ap.offset = PAIR32TO64(off_t,uap->offset);
- return (pwrite(td, &ap));
+ return (sys_pwrite(td, &ap));
}
int
freebsd6_freebsd32_lseek(struct thread *td, struct freebsd6_freebsd32_lseek_args *uap)
{
int error;
struct lseek_args ap;
off_t pos;
ap.fd = uap->fd;
ap.offset = PAIR32TO64(off_t,uap->offset);
ap.whence = uap->whence;
- error = lseek(td, &ap);
+ error = sys_lseek(td, &ap);
/* Expand the quad return into two parts for eax and edx */
pos = *(off_t *)(td->td_retval);
td->td_retval[RETVAL_LO] = pos & 0xffffffff; /* %eax */
td->td_retval[RETVAL_HI] = pos >> 32; /* %edx */
return error;
}
int
freebsd6_freebsd32_truncate(struct thread *td, struct freebsd6_freebsd32_truncate_args *uap)
{
struct truncate_args ap;
ap.path = uap->path;
ap.length = PAIR32TO64(off_t,uap->length);
- return (truncate(td, &ap));
+ return (sys_truncate(td, &ap));
}
int
freebsd6_freebsd32_ftruncate(struct thread *td, struct freebsd6_freebsd32_ftruncate_args *uap)
{
struct ftruncate_args ap;
ap.fd = uap->fd;
ap.length = PAIR32TO64(off_t,uap->length);
- return (ftruncate(td, &ap));
+ return (sys_ftruncate(td, &ap));
}
#endif /* COMPAT_FREEBSD6 */
struct sf_hdtr32 {
uint32_t headers;
int hdr_cnt;
uint32_t trailers;
int trl_cnt;
};
static int
freebsd32_do_sendfile(struct thread *td,
struct freebsd32_sendfile_args *uap, int compat)
{
struct sendfile_args ap;
struct sf_hdtr32 hdtr32;
struct sf_hdtr hdtr;
struct uio *hdr_uio, *trl_uio;
struct iovec32 *iov32;
int error;
hdr_uio = trl_uio = NULL;
ap.fd = uap->fd;
ap.s = uap->s;
ap.offset = PAIR32TO64(off_t,uap->offset);
ap.nbytes = uap->nbytes;
ap.hdtr = (struct sf_hdtr *)uap->hdtr; /* XXX not used */
ap.sbytes = uap->sbytes;
ap.flags = uap->flags;
if (uap->hdtr != NULL) {
error = copyin(uap->hdtr, &hdtr32, sizeof(hdtr32));
if (error)
goto out;
PTRIN_CP(hdtr32, hdtr, headers);
CP(hdtr32, hdtr, hdr_cnt);
PTRIN_CP(hdtr32, hdtr, trailers);
CP(hdtr32, hdtr, trl_cnt);
if (hdtr.headers != NULL) {
iov32 = PTRIN(hdtr32.headers);
error = freebsd32_copyinuio(iov32,
hdtr32.hdr_cnt, &hdr_uio);
if (error)
goto out;
}
if (hdtr.trailers != NULL) {
iov32 = PTRIN(hdtr32.trailers);
error = freebsd32_copyinuio(iov32,
hdtr32.trl_cnt, &trl_uio);
if (error)
goto out;
}
}
error = kern_sendfile(td, &ap, hdr_uio, trl_uio, compat);
out:
if (hdr_uio)
free(hdr_uio, M_IOV);
if (trl_uio)
free(trl_uio, M_IOV);
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_sendfile(struct thread *td,
struct freebsd4_freebsd32_sendfile_args *uap)
{
return (freebsd32_do_sendfile(td,
(struct freebsd32_sendfile_args *)uap, 1));
}
#endif
int
freebsd32_sendfile(struct thread *td, struct freebsd32_sendfile_args *uap)
{
return (freebsd32_do_sendfile(td, uap, 0));
}
static void
copy_stat(struct stat *in, struct stat32 *out)
{
CP(*in, *out, st_dev);
CP(*in, *out, st_ino);
CP(*in, *out, st_mode);
CP(*in, *out, st_nlink);
CP(*in, *out, st_uid);
CP(*in, *out, st_gid);
CP(*in, *out, st_rdev);
TS_CP(*in, *out, st_atim);
TS_CP(*in, *out, st_mtim);
TS_CP(*in, *out, st_ctim);
CP(*in, *out, st_size);
CP(*in, *out, st_blocks);
CP(*in, *out, st_blksize);
CP(*in, *out, st_flags);
CP(*in, *out, st_gen);
TS_CP(*in, *out, st_birthtim);
}
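/*
 * Illustrative sketch, not part of this change: TS_CP() used above is
 * assumed to copy a struct timespec member field by field so that the
 * native tv_sec narrows into the 32-bit layout, roughly as the made-up
 * macro below.
 */
#if 0
#define	EXAMPLE_TS_CP(src, dst, fld)	do {				\
	CP((src).fld, (dst).fld, tv_sec);				\
	CP((src).fld, (dst).fld, tv_nsec);				\
} while (0)
#endif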
#ifdef COMPAT_43
static void
copy_ostat(struct stat *in, struct ostat32 *out)
{
CP(*in, *out, st_dev);
CP(*in, *out, st_ino);
CP(*in, *out, st_mode);
CP(*in, *out, st_nlink);
CP(*in, *out, st_uid);
CP(*in, *out, st_gid);
CP(*in, *out, st_rdev);
CP(*in, *out, st_size);
TS_CP(*in, *out, st_atim);
TS_CP(*in, *out, st_mtim);
TS_CP(*in, *out, st_ctim);
CP(*in, *out, st_blksize);
CP(*in, *out, st_blocks);
CP(*in, *out, st_flags);
CP(*in, *out, st_gen);
}
#endif
int
freebsd32_stat(struct thread *td, struct freebsd32_stat_args *uap)
{
struct stat sb;
struct stat32 sb32;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_stat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#ifdef COMPAT_43
int
ofreebsd32_stat(struct thread *td, struct ofreebsd32_stat_args *uap)
{
struct stat sb;
struct ostat32 sb32;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_ostat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#endif
int
freebsd32_fstat(struct thread *td, struct freebsd32_fstat_args *uap)
{
struct stat ub;
struct stat32 ub32;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error)
return (error);
copy_stat(&ub, &ub32);
error = copyout(&ub32, uap->ub, sizeof(ub32));
return (error);
}
#ifdef COMPAT_43
int
ofreebsd32_fstat(struct thread *td, struct ofreebsd32_fstat_args *uap)
{
struct stat ub;
struct ostat32 ub32;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error)
return (error);
copy_ostat(&ub, &ub32);
error = copyout(&ub32, uap->ub, sizeof(ub32));
return (error);
}
#endif
int
freebsd32_fstatat(struct thread *td, struct freebsd32_fstatat_args *uap)
{
struct stat ub;
struct stat32 ub32;
int error;
error = kern_statat(td, uap->flag, uap->fd, uap->path, UIO_USERSPACE, &ub);
if (error)
return (error);
copy_stat(&ub, &ub32);
error = copyout(&ub32, uap->buf, sizeof(ub32));
return (error);
}
int
freebsd32_lstat(struct thread *td, struct freebsd32_lstat_args *uap)
{
struct stat sb;
struct stat32 sb32;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_stat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#ifdef COMPAT_43
int
ofreebsd32_lstat(struct thread *td, struct ofreebsd32_lstat_args *uap)
{
struct stat sb;
struct ostat32 sb32;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
copy_ostat(&sb, &sb32);
error = copyout(&sb32, uap->ub, sizeof (sb32));
return (error);
}
#endif
int
freebsd32_sysctl(struct thread *td, struct freebsd32_sysctl_args *uap)
{
int error, name[CTL_MAXNAME];
size_t j, oldlen;
if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
return (EINVAL);
error = copyin(uap->name, name, uap->namelen * sizeof(int));
if (error)
return (error);
if (uap->oldlenp)
oldlen = fuword32(uap->oldlenp);
else
oldlen = 0;
error = userland_sysctl(td, name, uap->namelen,
uap->old, &oldlen, 1,
uap->new, uap->newlen, &j, SCTL_MASK32);
if (error && error != ENOMEM)
return (error);
if (uap->oldlenp)
suword32(uap->oldlenp, j);
return (0);
}
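/*
 * Illustrative note, not part of this change: fuword32()/suword32() are
 * assumed to fetch and store a single 32-bit word at a user-space
 * address (failing with -1 on a fault), which lets the wrapper above
 * round-trip the 32-bit *oldlenp without a full copyin()/copyout().
 */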
int
freebsd32_jail(struct thread *td, struct freebsd32_jail_args *uap)
{
uint32_t version;
int error;
struct jail j;
error = copyin(uap->jail, &version, sizeof(uint32_t));
if (error)
return (error);
switch (version) {
case 0:
{
/* FreeBSD single IPv4 jails. */
struct jail32_v0 j32_v0;
bzero(&j, sizeof(struct jail));
error = copyin(uap->jail, &j32_v0, sizeof(struct jail32_v0));
if (error)
return (error);
CP(j32_v0, j, version);
PTRIN_CP(j32_v0, j, path);
PTRIN_CP(j32_v0, j, hostname);
j.ip4s = j32_v0.ip_number;
break;
}
case 1:
/*
* Version 1 was used by multi-IPv4 jail implementations
* that never made it into the official kernel.
*/
return (EINVAL);
case 2: /* JAIL_API_VERSION */
{
/* FreeBSD multi-IPv4/IPv6,noIP jails. */
struct jail32 j32;
error = copyin(uap->jail, &j32, sizeof(struct jail32));
if (error)
return (error);
CP(j32, j, version);
PTRIN_CP(j32, j, path);
PTRIN_CP(j32, j, hostname);
PTRIN_CP(j32, j, jailname);
CP(j32, j, ip4s);
CP(j32, j, ip6s);
PTRIN_CP(j32, j, ip4);
PTRIN_CP(j32, j, ip6);
break;
}
default:
/* Sci-Fi jails are not supported, sorry. */
return (EINVAL);
}
return (kern_jail(td, &j));
}
int
freebsd32_jail_set(struct thread *td, struct freebsd32_jail_set_args *uap)
{
struct uio *auio;
int error;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_set(td, auio, uap->flags);
free(auio, M_IOV);
return (error);
}
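/*
 * Illustrative sketch, not part of this change: freebsd32_copyinuio()
 * is assumed to build a native struct uio from an array of 32-bit
 * iovecs, widening each entry much as the made-up helper below does.
 */
#if 0
static int
example_iovec32_to_iovec(const struct iovec32 *iovp32, struct iovec *iov)
{
	struct iovec32 iov32;
	int error;

	error = copyin(iovp32, &iov32, sizeof(iov32));
	if (error != 0)
		return (error);
	iov->iov_base = PTRIN(iov32.iov_base);
	iov->iov_len = iov32.iov_len;
	return (0);
}
#endif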
int
freebsd32_jail_get(struct thread *td, struct freebsd32_jail_get_args *uap)
{
struct iovec32 iov32;
struct uio *auio;
int error, i;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_get(td, auio, uap->flags);
if (error == 0)
for (i = 0; i < uap->iovcnt; i++) {
PTROUT_CP(auio->uio_iov[i], iov32, iov_base);
CP(auio->uio_iov[i], iov32, iov_len);
error = copyout(&iov32, uap->iovp + i, sizeof(iov32));
if (error != 0)
break;
}
free(auio, M_IOV);
return (error);
}
int
freebsd32_sigaction(struct thread *td, struct freebsd32_sigaction_args *uap)
{
struct sigaction32 s32;
struct sigaction sa, osa, *sap;
int error;
if (uap->act) {
error = copyin(uap->act, &s32, sizeof(s32));
if (error)
return (error);
sa.sa_handler = PTRIN(s32.sa_u);
CP(s32, sa, sa_flags);
CP(s32, sa, sa_mask);
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->sig, sap, &osa, 0);
if (error == 0 && uap->oact != NULL) {
s32.sa_u = PTROUT(osa.sa_handler);
CP(osa, s32, sa_flags);
CP(osa, s32, sa_mask);
error = copyout(&s32, uap->oact, sizeof(s32));
}
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_sigaction(struct thread *td,
struct freebsd4_freebsd32_sigaction_args *uap)
{
struct sigaction32 s32;
struct sigaction sa, osa, *sap;
int error;
if (uap->act) {
error = copyin(uap->act, &s32, sizeof(s32));
if (error)
return (error);
sa.sa_handler = PTRIN(s32.sa_u);
CP(s32, sa, sa_flags);
CP(s32, sa, sa_mask);
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->sig, sap, &osa, KSA_FREEBSD4);
if (error == 0 && uap->oact != NULL) {
s32.sa_u = PTROUT(osa.sa_handler);
CP(osa, s32, sa_flags);
CP(osa, s32, sa_mask);
error = copyout(&s32, uap->oact, sizeof(s32));
}
return (error);
}
#endif
#ifdef COMPAT_43
struct osigaction32 {
u_int32_t sa_u;
osigset_t sa_mask;
int sa_flags;
};
#define ONSIG 32
int
ofreebsd32_sigaction(struct thread *td,
struct ofreebsd32_sigaction_args *uap)
{
struct osigaction32 s32;
struct sigaction sa, osa, *sap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
if (uap->nsa) {
error = copyin(uap->nsa, &s32, sizeof(s32));
if (error)
return (error);
sa.sa_handler = PTRIN(s32.sa_u);
CP(s32, sa, sa_flags);
OSIG2SIG(s32.sa_mask, sa.sa_mask);
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
if (error == 0 && uap->osa != NULL) {
s32.sa_u = PTROUT(osa.sa_handler);
CP(osa, s32, sa_flags);
SIG2OSIG(osa.sa_mask, s32.sa_mask);
error = copyout(&s32, uap->osa, sizeof(s32));
}
return (error);
}
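/*
 * Illustrative sketch, not part of this change: OSIG2SIG()/SIG2OSIG()
 * used by the old-style signal calls are assumed to convert between the
 * single-word 4.3BSD signal mask and the current sigset_t, roughly as
 * the made-up macros below.
 */
#if 0
#define	EXAMPLE_OSIG2SIG(osig, sig)	do {				\
	SIGEMPTYSET(sig);						\
	(sig).__bits[0] = (osig);					\
} while (0)
#define	EXAMPLE_SIG2OSIG(sig, osig)	((osig) = (sig).__bits[0])
#endif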
int
ofreebsd32_sigprocmask(struct thread *td,
struct ofreebsd32_sigprocmask_args *uap)
{
sigset_t set, oset;
int error;
OSIG2SIG(uap->mask, set);
error = kern_sigprocmask(td, uap->how, &set, &oset, SIGPROCMASK_OLD);
SIG2OSIG(oset, td->td_retval[0]);
return (error);
}
int
ofreebsd32_sigpending(struct thread *td,
struct ofreebsd32_sigpending_args *uap)
{
struct proc *p = td->td_proc;
sigset_t siglist;
PROC_LOCK(p);
siglist = p->p_siglist;
SIGSETOR(siglist, td->td_siglist);
PROC_UNLOCK(p);
SIG2OSIG(siglist, td->td_retval[0]);
return (0);
}
struct sigvec32 {
u_int32_t sv_handler;
int sv_mask;
int sv_flags;
};
int
ofreebsd32_sigvec(struct thread *td,
struct ofreebsd32_sigvec_args *uap)
{
struct sigvec32 vec;
struct sigaction sa, osa, *sap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
if (uap->nsv) {
error = copyin(uap->nsv, &vec, sizeof(vec));
if (error)
return (error);
sa.sa_handler = PTRIN(vec.sv_handler);
OSIG2SIG(vec.sv_mask, sa.sa_mask);
sa.sa_flags = vec.sv_flags;
sa.sa_flags ^= SA_RESTART;
sap = &sa;
} else
sap = NULL;
error = kern_sigaction(td, uap->signum, sap, &osa, KSA_OSIGSET);
if (error == 0 && uap->osv != NULL) {
vec.sv_handler = PTROUT(osa.sa_handler);
SIG2OSIG(osa.sa_mask, vec.sv_mask);
vec.sv_flags = osa.sa_flags;
vec.sv_flags &= ~SA_NOCLDWAIT;
vec.sv_flags ^= SA_RESTART;
error = copyout(&vec, uap->osv, sizeof(vec));
}
return (error);
}
int
ofreebsd32_sigblock(struct thread *td,
struct ofreebsd32_sigblock_args *uap)
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
int
ofreebsd32_sigsetmask(struct thread *td,
struct ofreebsd32_sigsetmask_args *uap)
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
int
ofreebsd32_sigsuspend(struct thread *td,
struct ofreebsd32_sigsuspend_args *uap)
{
sigset_t mask;
OSIG2SIG(uap->mask, mask);
return (kern_sigsuspend(td, mask));
}
struct sigstack32 {
u_int32_t ss_sp;
int ss_onstack;
};
int
ofreebsd32_sigstack(struct thread *td,
struct ofreebsd32_sigstack_args *uap)
{
struct sigstack32 s32;
struct sigstack nss, oss;
int error = 0, unss;
if (uap->nss != NULL) {
error = copyin(uap->nss, &s32, sizeof(s32));
if (error)
return (error);
nss.ss_sp = PTRIN(s32.ss_sp);
CP(s32, nss, ss_onstack);
unss = 1;
} else {
unss = 0;
}
oss.ss_sp = td->td_sigstk.ss_sp;
oss.ss_onstack = sigonstack(cpu_getstack(td));
if (unss) {
td->td_sigstk.ss_sp = nss.ss_sp;
td->td_sigstk.ss_size = 0;
td->td_sigstk.ss_flags |= (nss.ss_onstack & SS_ONSTACK);
td->td_pflags |= TDP_ALTSTACK;
}
if (uap->oss != NULL) {
s32.ss_sp = PTROUT(oss.ss_sp);
CP(oss, s32, ss_onstack);
error = copyout(&s32, uap->oss, sizeof(s32));
}
return (error);
}
#endif
int
freebsd32_nanosleep(struct thread *td, struct freebsd32_nanosleep_args *uap)
{
struct timespec32 rmt32, rqt32;
struct timespec rmt, rqt;
int error;
error = copyin(uap->rqtp, &rqt32, sizeof(rqt32));
if (error)
return (error);
CP(rqt32, rqt, tv_sec);
CP(rqt32, rqt, tv_nsec);
if (uap->rmtp &&
!useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
return (EFAULT);
error = kern_nanosleep(td, &rqt, &rmt);
if (error && uap->rmtp) {
int error2;
CP(rmt, rmt32, tv_sec);
CP(rmt, rmt32, tv_nsec);
error2 = copyout(&rmt32, uap->rmtp, sizeof(rmt32));
if (error2)
error = error2;
}
return (error);
}
int
freebsd32_clock_gettime(struct thread *td,
struct freebsd32_clock_gettime_args *uap)
{
struct timespec ats;
struct timespec32 ats32;
int error;
error = kern_clock_gettime(td, uap->clock_id, &ats);
if (error == 0) {
CP(ats, ats32, tv_sec);
CP(ats, ats32, tv_nsec);
error = copyout(&ats32, uap->tp, sizeof(ats32));
}
return (error);
}
int
freebsd32_clock_settime(struct thread *td,
struct freebsd32_clock_settime_args *uap)
{
struct timespec ats;
struct timespec32 ats32;
int error;
error = copyin(uap->tp, &ats32, sizeof(ats32));
if (error)
return (error);
CP(ats32, ats, tv_sec);
CP(ats32, ats, tv_nsec);
return (kern_clock_settime(td, uap->clock_id, &ats));
}
int
freebsd32_clock_getres(struct thread *td,
struct freebsd32_clock_getres_args *uap)
{
struct timespec ts;
struct timespec32 ts32;
int error;
if (uap->tp == NULL)
return (0);
error = kern_clock_getres(td, uap->clock_id, &ts);
if (error == 0) {
CP(ts, ts32, tv_sec);
CP(ts, ts32, tv_nsec);
error = copyout(&ts32, uap->tp, sizeof(ts32));
}
return (error);
}
int
freebsd32_thr_new(struct thread *td,
struct freebsd32_thr_new_args *uap)
{
struct thr_param32 param32;
struct thr_param param;
int error;
if (uap->param_size < 0 ||
uap->param_size > sizeof(struct thr_param32))
return (EINVAL);
bzero(&param, sizeof(struct thr_param));
bzero(&param32, sizeof(struct thr_param32));
error = copyin(uap->param, &param32, uap->param_size);
if (error != 0)
return (error);
param.start_func = PTRIN(param32.start_func);
param.arg = PTRIN(param32.arg);
param.stack_base = PTRIN(param32.stack_base);
param.stack_size = param32.stack_size;
param.tls_base = PTRIN(param32.tls_base);
param.tls_size = param32.tls_size;
param.child_tid = PTRIN(param32.child_tid);
param.parent_tid = PTRIN(param32.parent_tid);
param.flags = param32.flags;
param.rtp = PTRIN(param32.rtp);
param.spare[0] = PTRIN(param32.spare[0]);
param.spare[1] = PTRIN(param32.spare[1]);
param.spare[2] = PTRIN(param32.spare[2]);
return (kern_thr_new(td, &param));
}
int
freebsd32_thr_suspend(struct thread *td, struct freebsd32_thr_suspend_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
error = 0;
tsp = NULL;
if (uap->timeout != NULL) {
error = copyin((const void *)uap->timeout, (void *)&ts32,
sizeof(struct timespec32));
if (error != 0)
return (error);
ts.tv_sec = ts32.tv_sec;
ts.tv_nsec = ts32.tv_nsec;
tsp = &ts;
}
return (kern_thr_suspend(td, tsp));
}
void
siginfo_to_siginfo32(const siginfo_t *src, struct siginfo32 *dst)
{
bzero(dst, sizeof(*dst));
dst->si_signo = src->si_signo;
dst->si_errno = src->si_errno;
dst->si_code = src->si_code;
dst->si_pid = src->si_pid;
dst->si_uid = src->si_uid;
dst->si_status = src->si_status;
dst->si_addr = (uintptr_t)src->si_addr;
dst->si_value.sigval_int = src->si_value.sival_int;
dst->si_timerid = src->si_timerid;
dst->si_overrun = src->si_overrun;
}
int
freebsd32_sigtimedwait(struct thread *td, struct freebsd32_sigtimedwait_args *uap)
{
struct timespec32 ts32;
struct timespec ts;
struct timespec *timeout;
sigset_t set;
ksiginfo_t ksi;
struct siginfo32 si32;
int error;
if (uap->timeout) {
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
ts.tv_sec = ts32.tv_sec;
ts.tv_nsec = ts32.tv_nsec;
timeout = &ts;
} else
timeout = NULL;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, timeout);
if (error)
return (error);
if (uap->info) {
siginfo_to_siginfo32(&ksi.ksi_info, &si32);
error = copyout(&si32, uap->info, sizeof(struct siginfo32));
}
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
/*
* MPSAFE
*/
int
freebsd32_sigwaitinfo(struct thread *td, struct freebsd32_sigwaitinfo_args *uap)
{
ksiginfo_t ksi;
struct siginfo32 si32;
sigset_t set;
int error;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error)
return (error);
if (uap->info) {
siginfo_to_siginfo32(&ksi.ksi_info, &si32);
error = copyout(&si32, uap->info, sizeof(struct siginfo32));
}
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
int
freebsd32_cpuset_setid(struct thread *td,
struct freebsd32_cpuset_setid_args *uap)
{
struct cpuset_setid_args ap;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.setid = uap->setid;
- return (cpuset_setid(td, &ap));
+ return (sys_cpuset_setid(td, &ap));
}
int
freebsd32_cpuset_getid(struct thread *td,
struct freebsd32_cpuset_getid_args *uap)
{
struct cpuset_getid_args ap;
ap.level = uap->level;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.setid = uap->setid;
- return (cpuset_getid(td, &ap));
+ return (sys_cpuset_getid(td, &ap));
}
int
freebsd32_cpuset_getaffinity(struct thread *td,
struct freebsd32_cpuset_getaffinity_args *uap)
{
struct cpuset_getaffinity_args ap;
ap.level = uap->level;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.cpusetsize = uap->cpusetsize;
ap.mask = uap->mask;
- return (cpuset_getaffinity(td, &ap));
+ return (sys_cpuset_getaffinity(td, &ap));
}
int
freebsd32_cpuset_setaffinity(struct thread *td,
struct freebsd32_cpuset_setaffinity_args *uap)
{
struct cpuset_setaffinity_args ap;
ap.level = uap->level;
ap.which = uap->which;
ap.id = PAIR32TO64(id_t,uap->id);
ap.cpusetsize = uap->cpusetsize;
ap.mask = uap->mask;
- return (cpuset_setaffinity(td, &ap));
+ return (sys_cpuset_setaffinity(td, &ap));
}
int
freebsd32_nmount(struct thread *td,
struct freebsd32_nmount_args /* {
struct iovec *iovp;
unsigned int iovcnt;
int flags;
} */ *uap)
{
struct uio *auio;
int error;
AUDIT_ARG_FFLAGS(uap->flags);
/*
* Filter out MNT_ROOTFS. We do not want clients of nmount() in
* userspace to set this flag, but we must filter it out if we want
* MNT_UPDATE on the root file system to work.
* MNT_ROOTFS should only be set by the kernel when mounting its
* root file system.
*/
uap->flags &= ~MNT_ROOTFS;
/*
* Check that we have an even number of iovecs
* and that we have at least two options.
*/
if ((uap->iovcnt & 1) || (uap->iovcnt < 4))
return (EINVAL);
error = freebsd32_copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = vfs_donmount(td, uap->flags, auio);
free(auio, M_IOV);
return error;
}
#if 0
int
freebsd32_xxx(struct thread *td, struct freebsd32_xxx_args *uap)
{
struct yyy32 *p32, s32;
struct yyy *p = NULL, s;
struct xxx_arg ap;
int error;
if (uap->zzz) {
error = copyin(uap->zzz, &s32, sizeof(s32));
if (error)
return (error);
/* translate in */
p = &s;
}
error = kern_xxx(td, p);
if (error)
return (error);
if (uap->zzz) {
/* translate out */
error = copyout(&s32, p32, sizeof(s32));
}
return (error);
}
#endif
int
syscall32_register(int *offset, struct sysent *new_sysent,
struct sysent *old_sysent)
{
if (*offset == NO_SYSCALL) {
int i;
for (i = 1; i < SYS_MAXSYSCALL; ++i)
if (freebsd32_sysent[i].sy_call ==
(sy_call_t *)lkmnosys)
break;
if (i == SYS_MAXSYSCALL)
return (ENFILE);
*offset = i;
} else if (*offset < 0 || *offset >= SYS_MAXSYSCALL)
return (EINVAL);
else if (freebsd32_sysent[*offset].sy_call != (sy_call_t *)lkmnosys &&
freebsd32_sysent[*offset].sy_call != (sy_call_t *)lkmressys)
return (EEXIST);
*old_sysent = freebsd32_sysent[*offset];
freebsd32_sysent[*offset] = *new_sysent;
return 0;
}
int
syscall32_deregister(int *offset, struct sysent *old_sysent)
{
if (*offset)
freebsd32_sysent[*offset] = *old_sysent;
return 0;
}
int
syscall32_module_handler(struct module *mod, int what, void *arg)
{
struct syscall_module_data *data = (struct syscall_module_data*)arg;
modspecific_t ms;
int error;
switch (what) {
case MOD_LOAD:
error = syscall32_register(data->offset, data->new_sysent,
&data->old_sysent);
if (error) {
/* Leave a mark so we know to safely unload below. */
data->offset = NULL;
return error;
}
ms.intval = *data->offset;
MOD_XLOCK;
module_setspecific(mod, &ms);
MOD_XUNLOCK;
if (data->chainevh)
error = data->chainevh(mod, what, data->chainarg);
return (error);
case MOD_UNLOAD:
/*
* MOD_LOAD failed, so just return without calling the
* chained handler since we didn't pass along the MOD_LOAD
* event.
*/
if (data->offset == NULL)
return (0);
if (data->chainevh) {
error = data->chainevh(mod, what, data->chainarg);
if (error)
return (error);
}
error = syscall32_deregister(data->offset, &data->old_sysent);
return (error);
default:
error = EOPNOTSUPP;
if (data->chainevh)
error = data->chainevh(mod, what, data->chainarg);
return (error);
}
}
int
syscall32_helper_register(struct syscall_helper_data *sd)
{
struct syscall_helper_data *sd1;
int error;
for (sd1 = sd; sd1->syscall_no != NO_SYSCALL; sd1++) {
error = syscall32_register(&sd1->syscall_no, &sd1->new_sysent,
&sd1->old_sysent);
if (error != 0) {
syscall32_helper_unregister(sd);
return (error);
}
sd1->registered = 1;
}
return (0);
}
int
syscall32_helper_unregister(struct syscall_helper_data *sd)
{
struct syscall_helper_data *sd1;
for (sd1 = sd; sd1->registered != 0; sd1++) {
syscall32_deregister(&sd1->syscall_no, &sd1->old_sysent);
sd1->registered = 0;
}
return (0);
}
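/*
 * Illustrative sketch, not part of this change: a module would normally
 * describe its 32-bit entry points in a syscall_helper_data table
 * terminated by SYSCALL_INIT_LAST and hand it to
 * syscall32_helper_register() at load time.  The syscall name below is
 * made up for illustration.
 */
#if 0
static struct syscall_helper_data example32_syscalls[] = {
	SYSCALL32_INIT_HELPER(freebsd32_example),
	SYSCALL_INIT_LAST
};

static int
example32_modinit(void)
{

	return (syscall32_helper_register(example32_syscalls));
}
#endif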
register_t *
freebsd32_copyout_strings(struct image_params *imgp)
{
int argc, envc, i;
u_int32_t *vectp;
char *stringp, *destp;
u_int32_t *stack_base;
struct freebsd32_ps_strings *arginfo;
char canary[sizeof(long) * 8];
int32_t pagesizes32[MAXPAGESIZES];
size_t execpath_len;
int szsigcode;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
if (imgp->execpath != NULL && imgp->auxargs != NULL)
execpath_len = strlen(imgp->execpath) + 1;
else
execpath_len = 0;
arginfo = (struct freebsd32_ps_strings *)curproc->p_sysent->
sv_psstrings;
if (imgp->proc->p_sysent->sv_sigcode_base == 0)
szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
else
szsigcode = 0;
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup(execpath_len, sizeof(char *)) -
roundup(sizeof(canary), sizeof(char *)) -
roundup(sizeof(pagesizes32), sizeof(char *)) -
roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode != 0)
copyout(imgp->proc->p_sysent->sv_sigcode,
((caddr_t)arginfo - szsigcode), szsigcode);
/*
* Copy the image path for the rtld.
*/
if (execpath_len != 0) {
imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
copyout(imgp->execpath, (void *)imgp->execpathp,
execpath_len);
}
/*
* Prepare the canary for SSP.
*/
arc4rand(canary, sizeof(canary), 0);
imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
sizeof(canary);
copyout(canary, (void *)imgp->canary, sizeof(canary));
imgp->canarylen = sizeof(canary);
/*
* Prepare the pagesizes array.
*/
for (i = 0; i < MAXPAGESIZES; i++)
pagesizes32[i] = (uint32_t)pagesizes[i];
imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
roundup(sizeof(canary), sizeof(char *)) - sizeof(pagesizes32);
copyout(pagesizes32, (void *)imgp->pagesizes, sizeof(pagesizes32));
imgp->pagesizeslen = sizeof(pagesizes32);
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is the size of the ELF auxargs data. This is kept
* for backward compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size
: (AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets, and imgp->auxarg_size is room
* for the arguments of the runtime loader.
*/
vectp = (u_int32_t *) (destp - (imgp->args->argc +
imgp->args->envc + 2 + imgp->auxarg_size + execpath_len) *
sizeof(u_int32_t));
} else
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (u_int32_t *)
(destp - (imgp->args->argc + imgp->args->envc + 2) * sizeof(u_int32_t));
/*
* vectp also becomes our initial stack base
*/
stack_base = vectp;
stringp = imgp->args->begin_argv;
argc = imgp->args->argc;
envc = imgp->args->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword32(&arginfo->ps_argvstr, (u_int32_t)(intptr_t)vectp);
suword32(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
*/
for (; argc > 0; --argc) {
suword32(vectp++, (u_int32_t)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword32(vectp++, 0);
suword32(&arginfo->ps_envstr, (u_int32_t)(intptr_t)vectp);
suword32(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword32(vectp++, (u_int32_t)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword32(vectp, 0);
return ((register_t *)stack_base);
}
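/*
 * Illustrative layout note, derived from the code above: working down
 * from the 32-bit ps_strings area, freebsd32_copyout_strings() places
 * the signal trampoline (if any), the image path for rtld, the SSP
 * canary, the pagesizes32[] array, then the argument and environment
 * strings at destp, and finally the argv/envp vector table at vectp,
 * which becomes the new stack base returned to the caller.
 */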
int
freebsd32_kldstat(struct thread *td, struct freebsd32_kldstat_args *uap)
{
struct kld_file_stat stat;
struct kld32_file_stat stat32;
int error, version;
if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
!= 0)
return (error);
if (version != sizeof(struct kld32_file_stat_1) &&
version != sizeof(struct kld32_file_stat))
return (EINVAL);
error = kern_kldstat(td, uap->fileid, &stat);
if (error != 0)
return (error);
bcopy(&stat.name[0], &stat32.name[0], sizeof(stat.name));
CP(stat, stat32, refs);
CP(stat, stat32, id);
PTROUT_CP(stat, stat32, address);
CP(stat, stat32, size);
bcopy(&stat.pathname[0], &stat32.pathname[0], sizeof(stat.pathname));
return (copyout(&stat32, uap->stat, version));
}
int
freebsd32_posix_fallocate(struct thread *td,
struct freebsd32_posix_fallocate_args *uap)
{
struct posix_fallocate_args ap;
ap.fd = uap->fd;
ap.offset = (uap->offsetlo | ((off_t)uap->offsethi << 32));
ap.len = (uap->lenlo | ((off_t)uap->lenhi << 32));
- return (posix_fallocate(td, &ap));
+ return (sys_posix_fallocate(td, &ap));
}
Index: head/sys/compat/freebsd32/freebsd32_util.h
===================================================================
--- head/sys/compat/freebsd32/freebsd32_util.h (revision 225616)
+++ head/sys/compat/freebsd32/freebsd32_util.h (revision 225617)
@@ -1,109 +1,118 @@
/*-
* Copyright (c) 1998-1999 Andrew Gallatin
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _COMPAT_FREEBSD32_FREEBSD32_UTIL_H_
#define _COMPAT_FREEBSD32_FREEBSD32_UTIL_H_
#include <sys/cdefs.h>
#include <sys/exec.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
struct freebsd32_ps_strings {
u_int32_t ps_argvstr; /* first of 0 or more argument strings */
int ps_nargvstr; /* the number of argument strings */
u_int32_t ps_envstr; /* first of 0 or more environment strings */
int ps_nenvstr; /* the number of environment strings */
};
#if defined(__amd64__) || defined(__ia64__)
#include <compat/ia32/ia32_util.h>
#endif
#define FREEBSD32_PS_STRINGS \
(FREEBSD32_USRSTACK - sizeof(struct freebsd32_ps_strings))
extern struct sysent freebsd32_sysent[];
#define SYSCALL32_MODULE(name, offset, new_sysent, evh, arg) \
static struct syscall_module_data name##_syscall32_mod = { \
evh, arg, offset, new_sysent, { 0, NULL } \
}; \
\
static moduledata_t name##32_mod = { \
"sys32/" #name, \
syscall32_module_handler, \
&name##_syscall32_mod \
}; \
DECLARE_MODULE(name##32, name##32_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE)
#define SYSCALL32_MODULE_HELPER(syscallname) \
static int syscallname##_syscall32 = FREEBSD32_SYS_##syscallname; \
static struct sysent syscallname##_sysent32 = { \
(sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
(sy_call_t *)& syscallname \
}; \
SYSCALL32_MODULE(syscallname, \
& syscallname##_syscall32, & syscallname##_sysent32,\
NULL, NULL);
#define SYSCALL32_INIT_HELPER(syscallname) { \
.new_sysent = { \
.sy_narg = (sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
.sy_call = (sy_call_t *)& syscallname, \
}, \
.syscall_no = FREEBSD32_SYS_##syscallname \
}
+#define SYSCALL32_INIT_HELPER_COMPAT(syscallname) { \
+ .new_sysent = { \
+ .sy_narg = (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
+ .sy_call = (sy_call_t *)& sys_ ## syscallname, \
+ }, \
+ .syscall_no = FREEBSD32_SYS_##syscallname \
+}
+
int syscall32_register(int *offset, struct sysent *new_sysent,
struct sysent *old_sysent);
int syscall32_deregister(int *offset, struct sysent *old_sysent);
int syscall32_module_handler(struct module *mod, int what, void *arg);
int syscall32_helper_register(struct syscall_helper_data *sd);
int syscall32_helper_unregister(struct syscall_helper_data *sd);
struct iovec32;
struct rusage32;
register_t *freebsd32_copyout_strings(struct image_params *imgp);
int freebsd32_copyiniov(struct iovec32 *iovp, u_int iovcnt,
struct iovec **iov, int error);
void freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32);
struct image_args;
int freebsd32_exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, u_int32_t *argv, u_int32_t *envv);
#endif /* !_COMPAT_FREEBSD32_FREEBSD32_UTIL_H_ */
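/*
 * Illustrative sketch, not part of this change: SYSCALL32_INIT_HELPER()
 * points a table entry at the handler named by its argument (typically
 * a freebsd32_*() wrapper), while the new SYSCALL32_INIT_HELPER_COMPAT()
 * resolves to the sys_-prefixed native handler, matching the sys_*
 * renames elsewhere in this revision.  The syscall names below are made
 * up for illustration.
 */
#if 0
static struct syscall_helper_data example32_syscalls[] = {
	SYSCALL32_INIT_HELPER(freebsd32_example),	/* 32-bit wrapper */
	SYSCALL32_INIT_HELPER_COMPAT(example),		/* native sys_example() */
	SYSCALL_INIT_LAST
};
#endif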
Index: head/sys/compat/linux/linux_emul.c
===================================================================
--- head/sys/compat/linux/linux_emul.c (revision 225616)
+++ head/sys/compat/linux/linux_emul.c (revision 225617)
@@ -1,372 +1,372 @@
/*-
* Copyright (c) 2006 Roman Divacky
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_futex.h>
struct sx emul_shared_lock;
struct mtx emul_lock;
/* This returns a locked reference to the emuldata entry (if found). */
struct linux_emuldata *
em_find(struct proc *p, int locked)
{
struct linux_emuldata *em;
if (locked == EMUL_DOLOCK)
EMUL_LOCK(&emul_lock);
em = p->p_emuldata;
if (em == NULL && locked == EMUL_DOLOCK)
EMUL_UNLOCK(&emul_lock);
return (em);
}
int
linux_proc_init(struct thread *td, pid_t child, int flags)
{
struct linux_emuldata *em, *p_em;
struct proc *p;
if (child != 0) {
/* non-exec call */
em = malloc(sizeof *em, M_LINUX, M_WAITOK | M_ZERO);
em->pid = child;
em->pdeath_signal = 0;
em->flags = 0;
em->robust_futexes = NULL;
if (flags & LINUX_CLONE_THREAD) {
/* handled later in the code */
} else {
struct linux_emuldata_shared *s;
s = malloc(sizeof *s, M_LINUX, M_WAITOK | M_ZERO);
s->refs = 1;
s->group_pid = child;
LIST_INIT(&s->threads);
em->shared = s;
}
} else {
/* lookup the old one */
em = em_find(td->td_proc, EMUL_DOLOCK);
KASSERT(em != NULL, ("proc_init: emuldata not found in exec case.\n"));
}
em->child_clear_tid = NULL;
em->child_set_tid = NULL;
/*
* Allocate the shared struct only in the clone()/fork cases; in the
* clone() case td is the calling proc and child is the pid of the
* newly created proc.
*/
if (child != 0) {
if (flags & LINUX_CLONE_THREAD) {
/* lookup the parent */
/*
* We don't have to lock the p_em because
* it is waiting for us in linux_clone, so
* there is no chance of it changing the
* p_em->shared address.
*/
p_em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(p_em != NULL, ("proc_init: parent emuldata not found for CLONE_THREAD\n"));
em->shared = p_em->shared;
EMUL_SHARED_WLOCK(&emul_shared_lock);
em->shared->refs++;
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
} else {
/*
* handled earlier to avoid malloc(M_WAITOK) with
* rwlock held
*/
}
}
if (child != 0) {
EMUL_SHARED_WLOCK(&emul_shared_lock);
LIST_INSERT_HEAD(&em->shared->threads, em, threads);
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
p = pfind(child);
KASSERT(p != NULL, ("process not found in proc_init\n"));
p->p_emuldata = em;
PROC_UNLOCK(p);
} else
EMUL_UNLOCK(&emul_lock);
return (0);
}
void
linux_proc_exit(void *arg __unused, struct proc *p)
{
struct linux_emuldata *em;
int error, shared_flags, shared_xstat;
struct thread *td = FIRST_THREAD_IN_PROC(p);
int *child_clear_tid;
struct proc *q, *nq;
if (__predict_true(p->p_sysent != &elf_linux_sysvec))
return;
release_futexes(p);
/* find the emuldata */
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("proc_exit: emuldata not found.\n"));
/* reparent all procs that are not a thread leader to initproc */
if (em->shared->group_pid != p->p_pid) {
child_clear_tid = em->child_clear_tid;
EMUL_UNLOCK(&emul_lock);
sx_xlock(&proctree_lock);
wakeup(initproc);
PROC_LOCK(p);
proc_reparent(p, initproc);
p->p_sigparent = SIGCHLD;
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
} else {
child_clear_tid = em->child_clear_tid;
EMUL_UNLOCK(&emul_lock);
}
EMUL_SHARED_WLOCK(&emul_shared_lock);
shared_flags = em->shared->flags;
shared_xstat = em->shared->xstat;
LIST_REMOVE(em, threads);
em->shared->refs--;
if (em->shared->refs == 0) {
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
free(em->shared, M_LINUX);
} else
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
if ((shared_flags & EMUL_SHARED_HASXSTAT) != 0)
p->p_xstat = shared_xstat;
if (child_clear_tid != NULL) {
struct linux_sys_futex_args cup;
int null = 0;
error = copyout(&null, child_clear_tid, sizeof(null));
if (error) {
free(em, M_LINUX);
return;
}
/* futexes stuff */
cup.uaddr = child_clear_tid;
cup.op = LINUX_FUTEX_WAKE;
cup.val = 0x7fffffff; /* Awake everyone */
cup.timeout = NULL;
cup.uaddr2 = NULL;
cup.val3 = 0;
error = linux_sys_futex(FIRST_THREAD_IN_PROC(p), &cup);
/*
* This cannot happen at the moment, and if it does happen it
* probably means there is a user-space bug.
*/
if (error)
printf(LMSG("futex stuff in proc_exit failed.\n"));
}
/* clean the stuff up */
free(em, M_LINUX);
/* this is a little weird but rewritten from exit1() */
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
for (; q != NULL; q = nq) {
nq = LIST_NEXT(q, p_sibling);
if (q->p_flag & P_WEXIT)
continue;
if (__predict_false(q->p_sysent != &elf_linux_sysvec))
continue;
em = em_find(q, EMUL_DOLOCK);
KASSERT(em != NULL, ("linux_reparent: emuldata not found: %i\n", q->p_pid));
PROC_LOCK(q);
if ((q->p_flag & P_WEXIT) == 0 && em->pdeath_signal != 0) {
- psignal(q, em->pdeath_signal);
+ kern_psignal(q, em->pdeath_signal);
}
PROC_UNLOCK(q);
EMUL_UNLOCK(&emul_lock);
}
sx_xunlock(&proctree_lock);
}
/*
* This is used in the case of a transition from a FreeBSD binary exec'ing a Linux
* binary; in this case we create a Linux emuldata proc entry with the pid of the
* currently running process.
*/
void
linux_proc_exec(void *arg __unused, struct proc *p, struct image_params *imgp)
{
if (__predict_false(imgp->sysent == &elf_linux_sysvec
&& p->p_sysent != &elf_linux_sysvec))
linux_proc_init(FIRST_THREAD_IN_PROC(p), p->p_pid, 0);
if (__predict_false((p->p_sysent->sv_flags & SV_ABI_MASK) ==
SV_ABI_LINUX))
/* Kill threads regardless of imgp->sysent value */
linux_kill_threads(FIRST_THREAD_IN_PROC(p), SIGKILL);
if (__predict_false(imgp->sysent != &elf_linux_sysvec
&& p->p_sysent == &elf_linux_sysvec)) {
struct linux_emuldata *em;
/*
* XXX: There's a race here because we set p->p_emuldata to NULL,
* but the process is still counted as a Linux one for a short
* time, so some other process might reference it, try to access
* its p->p_emuldata, and panic on a NULL dereference.
*/
em = em_find(p, EMUL_DONTLOCK);
KASSERT(em != NULL, ("proc_exec: emuldata not found.\n"));
EMUL_SHARED_WLOCK(&emul_shared_lock);
LIST_REMOVE(em, threads);
PROC_LOCK(p);
p->p_emuldata = NULL;
PROC_UNLOCK(p);
em->shared->refs--;
if (em->shared->refs == 0) {
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
free(em->shared, M_LINUX);
} else
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
free(em, M_LINUX);
}
}
void
linux_schedtail(struct thread *td)
{
struct linux_emuldata *em;
struct proc *p;
int error = 0;
int *child_set_tid;
p = td->td_proc;
/* find the emuldata */
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("linux_schedtail: emuldata not found.\n"));
child_set_tid = em->child_set_tid;
EMUL_UNLOCK(&emul_lock);
if (child_set_tid != NULL)
error = copyout(&p->p_pid, (int *)child_set_tid,
sizeof(p->p_pid));
return;
}
int
linux_set_tid_address(struct thread *td, struct linux_set_tid_address_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(set_tid_address))
printf(ARGS(set_tid_address, "%p"), args->tidptr);
#endif
/* find the emuldata */
em = em_find(td->td_proc, EMUL_DOLOCK);
KASSERT(em != NULL, ("set_tid_address: emuldata not found.\n"));
em->child_clear_tid = args->tidptr;
td->td_retval[0] = td->td_proc->p_pid;
EMUL_UNLOCK(&emul_lock);
return 0;
}
void
linux_kill_threads(struct thread *td, int sig)
{
struct linux_emuldata *em, *td_em, *tmp_em;
struct proc *sp;
td_em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(td_em != NULL, ("linux_kill_threads: emuldata not found.\n"));
EMUL_SHARED_RLOCK(&emul_shared_lock);
LIST_FOREACH_SAFE(em, &td_em->shared->threads, threads, tmp_em) {
if (em->pid == td_em->pid)
continue;
sp = pfind(em->pid);
if ((sp->p_flag & P_WEXIT) == 0)
- psignal(sp, sig);
+ kern_psignal(sp, sig);
PROC_UNLOCK(sp);
#ifdef DEBUG
printf(LMSG("linux_kill_threads: kill PID %d\n"), em->pid);
#endif
}
EMUL_SHARED_RUNLOCK(&emul_shared_lock);
}
Index: head/sys/compat/linux/linux_file.c
===================================================================
--- head/sys/compat/linux/linux_file.c (revision 225616)
+++ head/sys/compat/linux/linux_file.c (revision 225617)
@@ -1,1532 +1,1532 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_file.h>
int
linux_creat(struct thread *td, struct linux_creat_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(creat))
printf(ARGS(creat, "%s, %d"), path, args->mode);
#endif
error = kern_open(td, path, UIO_SYSSPACE, O_WRONLY | O_CREAT | O_TRUNC,
args->mode);
LFREEPATH(path);
return (error);
}
static int
linux_common_open(struct thread *td, int dirfd, char *path, int l_flags, int mode)
{
struct proc *p = td->td_proc;
struct file *fp;
int fd;
int bsd_flags, error;
bsd_flags = 0;
switch (l_flags & LINUX_O_ACCMODE) {
case LINUX_O_WRONLY:
bsd_flags |= O_WRONLY;
break;
case LINUX_O_RDWR:
bsd_flags |= O_RDWR;
break;
default:
bsd_flags |= O_RDONLY;
}
if (l_flags & LINUX_O_NDELAY)
bsd_flags |= O_NONBLOCK;
if (l_flags & LINUX_O_APPEND)
bsd_flags |= O_APPEND;
if (l_flags & LINUX_O_SYNC)
bsd_flags |= O_FSYNC;
if (l_flags & LINUX_O_NONBLOCK)
bsd_flags |= O_NONBLOCK;
if (l_flags & LINUX_FASYNC)
bsd_flags |= O_ASYNC;
if (l_flags & LINUX_O_CREAT)
bsd_flags |= O_CREAT;
if (l_flags & LINUX_O_TRUNC)
bsd_flags |= O_TRUNC;
if (l_flags & LINUX_O_EXCL)
bsd_flags |= O_EXCL;
if (l_flags & LINUX_O_NOCTTY)
bsd_flags |= O_NOCTTY;
if (l_flags & LINUX_O_DIRECT)
bsd_flags |= O_DIRECT;
if (l_flags & LINUX_O_NOFOLLOW)
bsd_flags |= O_NOFOLLOW;
if (l_flags & LINUX_O_DIRECTORY)
bsd_flags |= O_DIRECTORY;
/* XXX LINUX_O_NOATIME: cannot easily be implemented. */
error = kern_openat(td, dirfd, path, UIO_SYSSPACE, bsd_flags, mode);
if (!error) {
fd = td->td_retval[0];
/*
* XXX In between kern_open() and fget(), another process
* sharing the same filedesc could use that fd, bypassing the
* checks below.
*/
error = fget(td, fd, CAP_IOCTL, &fp);
if (!error) {
sx_slock(&proctree_lock);
PROC_LOCK(p);
if (!(bsd_flags & O_NOCTTY) &&
SESS_LEADER(p) && !(p->p_flag & P_CONTROLT)) {
PROC_UNLOCK(p);
sx_unlock(&proctree_lock);
if (fp->f_type == DTYPE_VNODE)
(void) fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0,
td->td_ucred, td);
} else {
PROC_UNLOCK(p);
sx_sunlock(&proctree_lock);
}
fdrop(fp, td);
/*
* XXX as above, fdrop()/kern_close() pair is racy.
*/
if (error)
kern_close(td, fd);
}
}
#ifdef DEBUG
if (ldebug(open))
printf(LMSG("open returns error %d"), error);
#endif
LFREEPATH(path);
return (error);
}
int
linux_openat(struct thread *td, struct linux_openat_args *args)
{
char *path;
int dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
if (args->flags & LINUX_O_CREAT)
LCONVPATH_AT(td, args->filename, &path, 1, dfd);
else
LCONVPATH_AT(td, args->filename, &path, 0, dfd);
#ifdef DEBUG
if (ldebug(openat))
printf(ARGS(openat, "%i, %s, 0x%x, 0x%x"), args->dfd,
path, args->flags, args->mode);
#endif
return (linux_common_open(td, dfd, path, args->flags, args->mode));
}
int
linux_open(struct thread *td, struct linux_open_args *args)
{
char *path;
if (args->flags & LINUX_O_CREAT)
LCONVPATHCREAT(td, args->path, &path);
else
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(open))
printf(ARGS(open, "%s, 0x%x, 0x%x"),
path, args->flags, args->mode);
#endif
return (linux_common_open(td, AT_FDCWD, path, args->flags, args->mode));
}
int
linux_lseek(struct thread *td, struct linux_lseek_args *args)
{
struct lseek_args /* {
int fd;
int pad;
off_t offset;
int whence;
} */ tmp_args;
int error;
#ifdef DEBUG
if (ldebug(lseek))
printf(ARGS(lseek, "%d, %ld, %d"),
args->fdes, (long)args->off, args->whence);
#endif
tmp_args.fd = args->fdes;
tmp_args.offset = (off_t)args->off;
tmp_args.whence = args->whence;
- error = lseek(td, &tmp_args);
+ error = sys_lseek(td, &tmp_args);
return error;
}
int
linux_llseek(struct thread *td, struct linux_llseek_args *args)
{
struct lseek_args bsd_args;
int error;
off_t off;
#ifdef DEBUG
if (ldebug(llseek))
printf(ARGS(llseek, "%d, %d:%d, %d"),
args->fd, args->ohigh, args->olow, args->whence);
#endif
off = (args->olow) | (((off_t) args->ohigh) << 32);
bsd_args.fd = args->fd;
bsd_args.offset = off;
bsd_args.whence = args->whence;
- if ((error = lseek(td, &bsd_args)))
+ if ((error = sys_lseek(td, &bsd_args)))
return error;
if ((error = copyout(td->td_retval, args->res, sizeof (off_t))))
return error;
td->td_retval[0] = 0;
return 0;
}
int
linux_readdir(struct thread *td, struct linux_readdir_args *args)
{
struct linux_getdents_args lda;
lda.fd = args->fd;
lda.dent = args->dent;
lda.count = 1;
return linux_getdents(td, &lda);
}
/*
* Note that linux_getdents(2) and linux_getdents64(2) have the same
* arguments. They only differ in the definition of struct dirent they
* operate on. We use this to share the code, with the exception of
* accessing struct dirent. Note that linux_readdir(2) is implemented
* by means of linux_getdents(2). In this case we never operate on
* struct dirent64 and thus don't need to handle it...
*/
struct l_dirent {
l_ulong d_ino;
l_off_t d_off;
l_ushort d_reclen;
char d_name[LINUX_NAME_MAX + 1];
};
struct l_dirent64 {
uint64_t d_ino;
int64_t d_off;
l_ushort d_reclen;
u_char d_type;
char d_name[LINUX_NAME_MAX + 1];
};
/*
* Linux uses the last byte in the dirent buffer to store d_type,
* at least glibc-2.7 requires it. That is why l_dirent is padded with 2 bytes.
*/
#define LINUX_RECLEN(namlen) \
roundup((offsetof(struct l_dirent, d_name) + (namlen) + 2), \
sizeof(l_ulong))
#define LINUX_RECLEN64(namlen) \
roundup((offsetof(struct l_dirent64, d_name) + (namlen) + 1), \
sizeof(uint64_t))
#define LINUX_MAXRECLEN max(LINUX_RECLEN(LINUX_NAME_MAX), \
LINUX_RECLEN64(LINUX_NAME_MAX))
#define LINUX_DIRBLKSIZ 512
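/*
 * Worked example, illustrative only and assuming the 32-bit layout
 * (4-byte l_ulong and l_off_t, 2-byte l_ushort, so d_name starts at
 * offset 10): LINUX_RECLEN(5) = roundup(10 + 5 + 2, 4) = 20, leaving
 * room for the name, its terminating NUL, and the final byte of the
 * record that getdents_common() later fills with d_type.
 */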
static int
getdents_common(struct thread *td, struct linux_getdents64_args *args,
int is64bit)
{
struct dirent *bdp;
struct vnode *vp;
caddr_t inp, buf; /* BSD-format */
int len, reclen; /* BSD-format */
caddr_t outp; /* Linux-format */
int resid, linuxreclen=0; /* Linux-format */
caddr_t lbuf; /* Linux-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
off_t off;
struct l_dirent *linux_dirent;
struct l_dirent64 *linux_dirent64;
int buflen, error, eofflag, nbytes, justone;
u_long *cookies = NULL, *cookiep;
int ncookies, vfslocked;
nbytes = args->count;
if (nbytes == 1) {
/* readdir(2) case. Always struct dirent. */
if (is64bit)
return (EINVAL);
nbytes = sizeof(*linux_dirent);
justone = 1;
} else
justone = 0;
if ((error = getvnode(td->td_proc->p_fd, args->fd, CAP_READ, &fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
off = fp->f_offset;
buflen = max(LINUX_DIRBLKSIZ, nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
lbuf = malloc(LINUX_MAXRECLEN, M_TEMP, M_WAITOK | M_ZERO);
vn_lock(vp, LK_SHARED | LK_RETRY);
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
/*
* Do directory search MAC check using non-cached credentials.
*/
if ((error = mac_vnode_check_readdir(td->td_ucred, vp)))
goto out;
#endif /* MAC */
if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies,
&cookies)))
goto out;
inp = buf;
outp = (caddr_t)args->dirent;
resid = nbytes;
if ((len = buflen - auio.uio_resid) <= 0)
goto eof;
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
bdp = (struct dirent *) inp;
len -= bdp->d_reclen;
inp += bdp->d_reclen;
cookiep++;
ncookies--;
}
}
while (len > 0) {
if (cookiep && ncookies == 0)
break;
bdp = (struct dirent *) inp;
reclen = bdp->d_reclen;
if (reclen & 3) {
error = EFAULT;
goto out;
}
if (bdp->d_fileno == 0) {
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
len -= reclen;
continue;
}
linuxreclen = (is64bit)
? LINUX_RECLEN64(bdp->d_namlen)
: LINUX_RECLEN(bdp->d_namlen);
if (reclen > len || resid < linuxreclen) {
outp++;
break;
}
if (justone) {
/* readdir(2) case. */
linux_dirent = (struct l_dirent*)lbuf;
linux_dirent->d_ino = bdp->d_fileno;
linux_dirent->d_off = (l_off_t)linuxreclen;
linux_dirent->d_reclen = (l_ushort)bdp->d_namlen;
strlcpy(linux_dirent->d_name, bdp->d_name,
linuxreclen - offsetof(struct l_dirent, d_name));
error = copyout(linux_dirent, outp, linuxreclen);
}
if (is64bit) {
linux_dirent64 = (struct l_dirent64*)lbuf;
linux_dirent64->d_ino = bdp->d_fileno;
linux_dirent64->d_off = (cookiep)
? (l_off_t)*cookiep
: (l_off_t)(off + reclen);
linux_dirent64->d_reclen = (l_ushort)linuxreclen;
linux_dirent64->d_type = bdp->d_type;
strlcpy(linux_dirent64->d_name, bdp->d_name,
linuxreclen - offsetof(struct l_dirent64, d_name));
error = copyout(linux_dirent64, outp, linuxreclen);
} else if (!justone) {
linux_dirent = (struct l_dirent*)lbuf;
linux_dirent->d_ino = bdp->d_fileno;
linux_dirent->d_off = (cookiep)
? (l_off_t)*cookiep
: (l_off_t)(off + reclen);
linux_dirent->d_reclen = (l_ushort)linuxreclen;
/*
* Copy d_type to the last byte of the l_dirent buffer.
*/
lbuf[linuxreclen-1] = bdp->d_type;
strlcpy(linux_dirent->d_name, bdp->d_name,
linuxreclen - offsetof(struct l_dirent, d_name)-1);
error = copyout(linux_dirent, outp, linuxreclen);
}
if (error)
goto out;
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
outp += linuxreclen;
resid -= linuxreclen;
len -= reclen;
if (justone)
break;
}
if (outp == (caddr_t)args->dirent) {
nbytes = resid;
goto eof;
}
fp->f_offset = off;
if (justone)
nbytes = resid + linuxreclen;
eof:
td->td_retval[0] = nbytes - resid;
out:
if (cookies)
free(cookies, M_TEMP);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
free(buf, M_TEMP);
free(lbuf, M_TEMP);
return (error);
}
int
linux_getdents(struct thread *td, struct linux_getdents_args *args)
{
#ifdef DEBUG
if (ldebug(getdents))
printf(ARGS(getdents, "%d, *, %d"), args->fd, args->count);
#endif
return (getdents_common(td, (struct linux_getdents64_args*)args, 0));
}
int
linux_getdents64(struct thread *td, struct linux_getdents64_args *args)
{
#ifdef DEBUG
if (ldebug(getdents64))
printf(ARGS(getdents64, "%d, *, %d"), args->fd, args->count);
#endif
return (getdents_common(td, args, 1));
}
/*
* These exist mainly for hooks for doing /compat/linux translation.
*/
int
linux_access(struct thread *td, struct linux_access_args *args)
{
char *path;
int error;
/* linux convention */
if (args->flags & ~(F_OK | X_OK | W_OK | R_OK))
return (EINVAL);
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(access))
printf(ARGS(access, "%s, %d"), path, args->flags);
#endif
error = kern_access(td, path, UIO_SYSSPACE, args->flags);
LFREEPATH(path);
return (error);
}
int
linux_faccessat(struct thread *td, struct linux_faccessat_args *args)
{
char *path;
int error, dfd;
/* linux convention */
if (args->mode & ~(F_OK | X_OK | W_OK | R_OK))
return (EINVAL);
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(access))
printf(ARGS(access, "%s, %d"), path, args->mode);
#endif
error = kern_accessat(td, dfd, path, UIO_SYSSPACE, 0 /* XXX */,
args->mode);
LFREEPATH(path);
return (error);
}
int
linux_unlink(struct thread *td, struct linux_unlink_args *args)
{
char *path;
int error;
struct stat st;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(unlink))
printf(ARGS(unlink, "%s"), path);
#endif
error = kern_unlink(td, path, UIO_SYSSPACE);
if (error == EPERM)
/* Introduce POSIX noncompliant behaviour of Linux */
if (kern_stat(td, path, UIO_SYSSPACE, &st) == 0)
if (S_ISDIR(st.st_mode))
error = EISDIR;
LFREEPATH(path);
return (error);
}
int
linux_unlinkat(struct thread *td, struct linux_unlinkat_args *args)
{
char *path;
int error, dfd;
struct stat st;
if (args->flag & ~LINUX_AT_REMOVEDIR)
return (EINVAL);
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->pathname, &path, dfd);
#ifdef DEBUG
if (ldebug(unlinkat))
printf(ARGS(unlinkat, "%s"), path);
#endif
if (args->flag & LINUX_AT_REMOVEDIR)
error = kern_rmdirat(td, dfd, path, UIO_SYSSPACE);
else
error = kern_unlinkat(td, dfd, path, UIO_SYSSPACE, 0);
if (error == EPERM && !(args->flag & LINUX_AT_REMOVEDIR)) {
/* Introduce POSIX noncompliant behaviour of Linux */
if (kern_statat(td, AT_SYMLINK_NOFOLLOW, dfd, path,
UIO_SYSSPACE, &st) == 0 && S_ISDIR(st.st_mode))
error = EISDIR;
}
LFREEPATH(path);
return (error);
}
int
linux_chdir(struct thread *td, struct linux_chdir_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chdir))
printf(ARGS(chdir, "%s"), path);
#endif
error = kern_chdir(td, path, UIO_SYSSPACE);
LFREEPATH(path);
return (error);
}
int
linux_chmod(struct thread *td, struct linux_chmod_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chmod))
printf(ARGS(chmod, "%s, %d"), path, args->mode);
#endif
error = kern_chmod(td, path, UIO_SYSSPACE, args->mode);
LFREEPATH(path);
return (error);
}
int
linux_fchmodat(struct thread *td, struct linux_fchmodat_args *args)
{
char *path;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(fchmodat))
printf(ARGS(fchmodat, "%s, %d"), path, args->mode);
#endif
error = kern_fchmodat(td, dfd, path, UIO_SYSSPACE, args->mode, 0);
LFREEPATH(path);
return (error);
}
int
linux_mkdir(struct thread *td, struct linux_mkdir_args *args)
{
char *path;
int error;
LCONVPATHCREAT(td, args->path, &path);
#ifdef DEBUG
if (ldebug(mkdir))
printf(ARGS(mkdir, "%s, %d"), path, args->mode);
#endif
error = kern_mkdir(td, path, UIO_SYSSPACE, args->mode);
LFREEPATH(path);
return (error);
}
int
linux_mkdirat(struct thread *td, struct linux_mkdirat_args *args)
{
char *path;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHCREAT_AT(td, args->pathname, &path, dfd);
#ifdef DEBUG
if (ldebug(mkdirat))
printf(ARGS(mkdirat, "%s, %d"), path, args->mode);
#endif
error = kern_mkdirat(td, dfd, path, UIO_SYSSPACE, args->mode);
LFREEPATH(path);
return (error);
}
int
linux_rmdir(struct thread *td, struct linux_rmdir_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(rmdir))
printf(ARGS(rmdir, "%s"), path);
#endif
error = kern_rmdir(td, path, UIO_SYSSPACE);
LFREEPATH(path);
return (error);
}
int
linux_rename(struct thread *td, struct linux_rename_args *args)
{
char *from, *to;
int error;
LCONVPATHEXIST(td, args->from, &from);
/* Expand LCONVPATHCREATE so that `from' can be freed on errors */
error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD);
if (to == NULL) {
LFREEPATH(from);
return (error);
}
#ifdef DEBUG
if (ldebug(rename))
printf(ARGS(rename, "%s, %s"), from, to);
#endif
error = kern_rename(td, from, to, UIO_SYSSPACE);
LFREEPATH(from);
LFREEPATH(to);
return (error);
}
int
linux_renameat(struct thread *td, struct linux_renameat_args *args)
{
char *from, *to;
int error, olddfd, newdfd;
olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd;
newdfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd;
LCONVPATHEXIST_AT(td, args->oldname, &from, olddfd);
/* Expand LCONVPATHCREATE so that `from' can be freed on errors */
error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, newdfd);
if (to == NULL) {
LFREEPATH(from);
return (error);
}
#ifdef DEBUG
if (ldebug(renameat))
printf(ARGS(renameat, "%s, %s"), from, to);
#endif
error = kern_renameat(td, olddfd, from, newdfd, to, UIO_SYSSPACE);
LFREEPATH(from);
LFREEPATH(to);
return (error);
}
int
linux_symlink(struct thread *td, struct linux_symlink_args *args)
{
char *path, *to;
int error;
LCONVPATHEXIST(td, args->path, &path);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(symlink))
printf(ARGS(symlink, "%s, %s"), path, to);
#endif
error = kern_symlink(td, path, to, UIO_SYSSPACE);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_symlinkat(struct thread *td, struct linux_symlinkat_args *args)
{
char *path, *to;
int error, dfd;
dfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd;
LCONVPATHEXIST_AT(td, args->oldname, &path, dfd);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, dfd);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(symlinkat))
printf(ARGS(symlinkat, "%s, %s"), path, to);
#endif
error = kern_symlinkat(td, path, dfd, to, UIO_SYSSPACE);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_readlink(struct thread *td, struct linux_readlink_args *args)
{
char *name;
int error;
LCONVPATHEXIST(td, args->name, &name);
#ifdef DEBUG
if (ldebug(readlink))
printf(ARGS(readlink, "%s, %p, %d"), name, (void *)args->buf,
args->count);
#endif
error = kern_readlink(td, name, UIO_SYSSPACE, args->buf, UIO_USERSPACE,
args->count);
LFREEPATH(name);
return (error);
}
int
linux_readlinkat(struct thread *td, struct linux_readlinkat_args *args)
{
char *name;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->path, &name, dfd);
#ifdef DEBUG
if (ldebug(readlinkat))
printf(ARGS(readlinkat, "%s, %p, %d"), name, (void *)args->buf,
args->bufsiz);
#endif
error = kern_readlinkat(td, dfd, name, UIO_SYSSPACE, args->buf,
UIO_USERSPACE, args->bufsiz);
LFREEPATH(name);
return (error);
}
int
linux_truncate(struct thread *td, struct linux_truncate_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(truncate))
printf(ARGS(truncate, "%s, %ld"), path, (long)args->length);
#endif
error = kern_truncate(td, path, UIO_SYSSPACE, args->length);
LFREEPATH(path);
return (error);
}
int
linux_truncate64(struct thread *td, struct linux_truncate64_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(truncate64))
printf(ARGS(truncate64, "%s, %jd"), path, args->length);
#endif
error = kern_truncate(td, path, UIO_SYSSPACE, args->length);
LFREEPATH(path);
return (error);
}
int
linux_ftruncate(struct thread *td, struct linux_ftruncate_args *args)
{
struct ftruncate_args /* {
int fd;
int pad;
off_t length;
} */ nuap;
nuap.fd = args->fd;
nuap.length = args->length;
- return (ftruncate(td, &nuap));
+ return (sys_ftruncate(td, &nuap));
}
int
linux_link(struct thread *td, struct linux_link_args *args)
{
char *path, *to;
int error;
LCONVPATHEXIST(td, args->path, &path);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->to, UIO_USERSPACE, &to, 1, AT_FDCWD);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(link))
printf(ARGS(link, "%s, %s"), path, to);
#endif
error = kern_link(td, path, to, UIO_SYSSPACE);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_linkat(struct thread *td, struct linux_linkat_args *args)
{
char *path, *to;
int error, olddfd, newdfd;
/*
* Linux defines a flags argument for linkat(2), but no flag values
* are accepted here; anything non-zero is rejected.
*/
if (args->flags != 0)
return (EINVAL);
olddfd = (args->olddfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->olddfd;
newdfd = (args->newdfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->newdfd;
LCONVPATHEXIST_AT(td, args->oldname, &path, olddfd);
/* Expand LCONVPATHCREATE so that `path' can be freed on errors */
error = linux_emul_convpath(td, args->newname, UIO_USERSPACE, &to, 1, newdfd);
if (to == NULL) {
LFREEPATH(path);
return (error);
}
#ifdef DEBUG
if (ldebug(linkat))
printf(ARGS(linkat, "%i, %s, %i, %s, %i"), args->olddfd, path,
args->newdfd, to, args->flags);
#endif
error = kern_linkat(td, olddfd, newdfd, path, to, UIO_SYSSPACE, FOLLOW);
LFREEPATH(path);
LFREEPATH(to);
return (error);
}
int
linux_fdatasync(td, uap)
struct thread *td;
struct linux_fdatasync_args *uap;
{
struct fsync_args bsd;
bsd.fd = uap->fd;
- return fsync(td, &bsd);
+ return sys_fsync(td, &bsd);
}
int
linux_pread(td, uap)
struct thread *td;
struct linux_pread_args *uap;
{
struct pread_args bsd;
struct vnode *vp;
int error;
bsd.fd = uap->fd;
bsd.buf = uap->buf;
bsd.nbyte = uap->nbyte;
bsd.offset = uap->offset;
- error = pread(td, &bsd);
+ error = sys_pread(td, &bsd);
if (error == 0) {
/* This seems to violate POSIX but linux does it */
if ((error = fgetvp(td, uap->fd, CAP_READ, &vp)) != 0)
return (error);
if (vp->v_type == VDIR) {
vrele(vp);
return (EISDIR);
}
vrele(vp);
}
return (error);
}
int
linux_pwrite(td, uap)
struct thread *td;
struct linux_pwrite_args *uap;
{
struct pwrite_args bsd;
bsd.fd = uap->fd;
bsd.buf = uap->buf;
bsd.nbyte = uap->nbyte;
bsd.offset = uap->offset;
- return pwrite(td, &bsd);
+ return sys_pwrite(td, &bsd);
}
int
linux_mount(struct thread *td, struct linux_mount_args *args)
{
struct ufs_args ufs;
char fstypename[MFSNAMELEN];
char mntonname[MNAMELEN], mntfromname[MNAMELEN];
int error;
int fsflags;
void *fsdata;
error = copyinstr(args->filesystemtype, fstypename, MFSNAMELEN - 1,
NULL);
if (error)
return (error);
error = copyinstr(args->specialfile, mntfromname, MNAMELEN - 1, NULL);
if (error)
return (error);
error = copyinstr(args->dir, mntonname, MNAMELEN - 1, NULL);
if (error)
return (error);
#ifdef DEBUG
if (ldebug(mount))
printf(ARGS(mount, "%s, %s, %s"),
fstypename, mntfromname, mntonname);
#endif
if (strcmp(fstypename, "ext2") == 0) {
strcpy(fstypename, "ext2fs");
fsdata = &ufs;
ufs.fspec = mntfromname;
#define DEFAULT_ROOTID -2
ufs.export.ex_root = DEFAULT_ROOTID;
ufs.export.ex_flags =
args->rwflag & LINUX_MS_RDONLY ? MNT_EXRDONLY : 0;
} else if (strcmp(fstypename, "proc") == 0) {
strcpy(fstypename, "linprocfs");
fsdata = NULL;
} else if (strcmp(fstypename, "vfat") == 0) {
strcpy(fstypename, "msdosfs");
fsdata = NULL;
} else {
return (ENODEV);
}
fsflags = 0;
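/*
 * Linux mount(2) callers traditionally pass the MS_MGC_VAL magic
 * (0xc0ed0000) in the upper 16 bits of the flags word; only interpret
 * the flag bits when that magic is present.
 */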
if ((args->rwflag & 0xffff0000) == 0xc0ed0000) {
/*
* Linux SYNC flag is not included; the closest equivalent
* FreeBSD has is !ASYNC, which is our default.
*/
if (args->rwflag & LINUX_MS_RDONLY)
fsflags |= MNT_RDONLY;
if (args->rwflag & LINUX_MS_NOSUID)
fsflags |= MNT_NOSUID;
if (args->rwflag & LINUX_MS_NOEXEC)
fsflags |= MNT_NOEXEC;
if (args->rwflag & LINUX_MS_REMOUNT)
fsflags |= MNT_UPDATE;
}
if (strcmp(fstypename, "linprocfs") == 0) {
error = kernel_vmount(fsflags,
"fstype", fstypename,
"fspath", mntonname,
NULL);
} else if (strcmp(fstypename, "msdosfs") == 0) {
error = kernel_vmount(fsflags,
"fstype", fstypename,
"fspath", mntonname,
"from", mntfromname,
NULL);
} else
error = EOPNOTSUPP;
return (error);
}
int
linux_oldumount(struct thread *td, struct linux_oldumount_args *args)
{
struct linux_umount_args args2;
args2.path = args->path;
args2.flags = 0;
return (linux_umount(td, &args2));
}
int
linux_umount(struct thread *td, struct linux_umount_args *args)
{
struct unmount_args bsd;
bsd.path = args->path;
bsd.flags = args->flags; /* XXX correct? */
- return (unmount(td, &bsd));
+ return (sys_unmount(td, &bsd));
}
/*
* fcntl family of syscalls
*/
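/*
 * The Linux and FreeBSD flock structures differ in field sizes and
 * ordering, and under COMPAT_LINUX32 the Linux variant is packed to
 * match the 32-bit i386 layout; the helpers below translate between
 * the two representations.
 */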
struct l_flock {
l_short l_type;
l_short l_whence;
l_off_t l_start;
l_off_t l_len;
l_pid_t l_pid;
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
static void
linux_to_bsd_flock(struct l_flock *linux_flock, struct flock *bsd_flock)
{
switch (linux_flock->l_type) {
case LINUX_F_RDLCK:
bsd_flock->l_type = F_RDLCK;
break;
case LINUX_F_WRLCK:
bsd_flock->l_type = F_WRLCK;
break;
case LINUX_F_UNLCK:
bsd_flock->l_type = F_UNLCK;
break;
default:
bsd_flock->l_type = -1;
break;
}
bsd_flock->l_whence = linux_flock->l_whence;
bsd_flock->l_start = (off_t)linux_flock->l_start;
bsd_flock->l_len = (off_t)linux_flock->l_len;
bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
bsd_flock->l_sysid = 0;
}
static void
bsd_to_linux_flock(struct flock *bsd_flock, struct l_flock *linux_flock)
{
switch (bsd_flock->l_type) {
case F_RDLCK:
linux_flock->l_type = LINUX_F_RDLCK;
break;
case F_WRLCK:
linux_flock->l_type = LINUX_F_WRLCK;
break;
case F_UNLCK:
linux_flock->l_type = LINUX_F_UNLCK;
break;
}
linux_flock->l_whence = bsd_flock->l_whence;
linux_flock->l_start = (l_off_t)bsd_flock->l_start;
linux_flock->l_len = (l_off_t)bsd_flock->l_len;
linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
}
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
struct l_flock64 {
l_short l_type;
l_short l_whence;
l_loff_t l_start;
l_loff_t l_len;
l_pid_t l_pid;
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
static void
linux_to_bsd_flock64(struct l_flock64 *linux_flock, struct flock *bsd_flock)
{
switch (linux_flock->l_type) {
case LINUX_F_RDLCK:
bsd_flock->l_type = F_RDLCK;
break;
case LINUX_F_WRLCK:
bsd_flock->l_type = F_WRLCK;
break;
case LINUX_F_UNLCK:
bsd_flock->l_type = F_UNLCK;
break;
default:
bsd_flock->l_type = -1;
break;
}
bsd_flock->l_whence = linux_flock->l_whence;
bsd_flock->l_start = (off_t)linux_flock->l_start;
bsd_flock->l_len = (off_t)linux_flock->l_len;
bsd_flock->l_pid = (pid_t)linux_flock->l_pid;
bsd_flock->l_sysid = 0;
}
static void
bsd_to_linux_flock64(struct flock *bsd_flock, struct l_flock64 *linux_flock)
{
switch (bsd_flock->l_type) {
case F_RDLCK:
linux_flock->l_type = LINUX_F_RDLCK;
break;
case F_WRLCK:
linux_flock->l_type = LINUX_F_WRLCK;
break;
case F_UNLCK:
linux_flock->l_type = LINUX_F_UNLCK;
break;
}
linux_flock->l_whence = bsd_flock->l_whence;
linux_flock->l_start = (l_loff_t)bsd_flock->l_start;
linux_flock->l_len = (l_loff_t)bsd_flock->l_len;
linux_flock->l_pid = (l_pid_t)bsd_flock->l_pid;
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
static int
fcntl_common(struct thread *td, struct linux_fcntl64_args *args)
{
struct l_flock linux_flock;
struct flock bsd_flock;
struct file *fp;
long arg;
int error, result;
switch (args->cmd) {
case LINUX_F_DUPFD:
return (kern_fcntl(td, args->fd, F_DUPFD, args->arg));
case LINUX_F_GETFD:
return (kern_fcntl(td, args->fd, F_GETFD, 0));
case LINUX_F_SETFD:
return (kern_fcntl(td, args->fd, F_SETFD, args->arg));
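/*
 * The file status flags use different bit values on Linux, so F_GETFL
 * results and F_SETFL arguments are translated flag by flag rather
 * than passed through.
 */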
case LINUX_F_GETFL:
error = kern_fcntl(td, args->fd, F_GETFL, 0);
result = td->td_retval[0];
td->td_retval[0] = 0;
if (result & O_RDONLY)
td->td_retval[0] |= LINUX_O_RDONLY;
if (result & O_WRONLY)
td->td_retval[0] |= LINUX_O_WRONLY;
if (result & O_RDWR)
td->td_retval[0] |= LINUX_O_RDWR;
if (result & O_NDELAY)
td->td_retval[0] |= LINUX_O_NONBLOCK;
if (result & O_APPEND)
td->td_retval[0] |= LINUX_O_APPEND;
if (result & O_FSYNC)
td->td_retval[0] |= LINUX_O_SYNC;
if (result & O_ASYNC)
td->td_retval[0] |= LINUX_FASYNC;
#ifdef LINUX_O_NOFOLLOW
if (result & O_NOFOLLOW)
td->td_retval[0] |= LINUX_O_NOFOLLOW;
#endif
#ifdef LINUX_O_DIRECT
if (result & O_DIRECT)
td->td_retval[0] |= LINUX_O_DIRECT;
#endif
return (error);
case LINUX_F_SETFL:
arg = 0;
if (args->arg & LINUX_O_NDELAY)
arg |= O_NONBLOCK;
if (args->arg & LINUX_O_APPEND)
arg |= O_APPEND;
if (args->arg & LINUX_O_SYNC)
arg |= O_FSYNC;
if (args->arg & LINUX_FASYNC)
arg |= O_ASYNC;
#ifdef LINUX_O_NOFOLLOW
if (args->arg & LINUX_O_NOFOLLOW)
arg |= O_NOFOLLOW;
#endif
#ifdef LINUX_O_DIRECT
if (args->arg & LINUX_O_DIRECT)
arg |= O_DIRECT;
#endif
return (kern_fcntl(td, args->fd, F_SETFL, arg));
case LINUX_F_GETLK:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock(&linux_flock, &bsd_flock);
error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
if (error)
return (error);
bsd_to_linux_flock(&bsd_flock, &linux_flock);
return (copyout(&linux_flock, (void *)args->arg,
sizeof(linux_flock)));
case LINUX_F_SETLK:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLK,
(intptr_t)&bsd_flock));
case LINUX_F_SETLKW:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLKW,
(intptr_t)&bsd_flock));
case LINUX_F_GETOWN:
return (kern_fcntl(td, args->fd, F_GETOWN, 0));
case LINUX_F_SETOWN:
/*
* XXX some Linux applications depend on F_SETOWN having no
* significant effect for pipes (SIGIO is not delivered for
* pipes under Linux-2.2.35 at least).
*/
error = fget(td, args->fd, CAP_FCNTL, &fp);
if (error)
return (error);
if (fp->f_type == DTYPE_PIPE) {
fdrop(fp, td);
return (EINVAL);
}
fdrop(fp, td);
return (kern_fcntl(td, args->fd, F_SETOWN, args->arg));
}
return (EINVAL);
}
int
linux_fcntl(struct thread *td, struct linux_fcntl_args *args)
{
struct linux_fcntl64_args args64;
#ifdef DEBUG
if (ldebug(fcntl))
printf(ARGS(fcntl, "%d, %08x, *"), args->fd, args->cmd);
#endif
args64.fd = args->fd;
args64.cmd = args->cmd;
args64.arg = args->arg;
return (fcntl_common(td, &args64));
}
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
int
linux_fcntl64(struct thread *td, struct linux_fcntl64_args *args)
{
struct l_flock64 linux_flock;
struct flock bsd_flock;
int error;
#ifdef DEBUG
if (ldebug(fcntl64))
printf(ARGS(fcntl64, "%d, %08x, *"), args->fd, args->cmd);
#endif
switch (args->cmd) {
case LINUX_F_GETLK64:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock64(&linux_flock, &bsd_flock);
error = kern_fcntl(td, args->fd, F_GETLK, (intptr_t)&bsd_flock);
if (error)
return (error);
bsd_to_linux_flock64(&bsd_flock, &linux_flock);
return (copyout(&linux_flock, (void *)args->arg,
sizeof(linux_flock)));
case LINUX_F_SETLK64:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock64(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLK,
(intptr_t)&bsd_flock));
case LINUX_F_SETLKW64:
error = copyin((void *)args->arg, &linux_flock,
sizeof(linux_flock));
if (error)
return (error);
linux_to_bsd_flock64(&linux_flock, &bsd_flock);
return (kern_fcntl(td, args->fd, F_SETLKW,
(intptr_t)&bsd_flock));
}
return (fcntl_common(td, args));
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
int
linux_chown(struct thread *td, struct linux_chown_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chown))
printf(ARGS(chown, "%s, %d, %d"), path, args->uid, args->gid);
#endif
error = kern_chown(td, path, UIO_SYSSPACE, args->uid, args->gid);
LFREEPATH(path);
return (error);
}
int
linux_fchownat(struct thread *td, struct linux_fchownat_args *args)
{
char *path;
int error, dfd, follow;
if (args->flag & ~LINUX_AT_SYMLINK_NOFOLLOW)
return (EINVAL);
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(fchownat))
printf(ARGS(fchownat, "%s, %d, %d"), path, args->uid, args->gid);
#endif
follow = (args->flag & LINUX_AT_SYMLINK_NOFOLLOW) == 0 ? 0 :
AT_SYMLINK_NOFOLLOW;
error = kern_fchownat(td, dfd, path, UIO_SYSSPACE, args->uid, args->gid,
follow);
LFREEPATH(path);
return (error);
}
int
linux_lchown(struct thread *td, struct linux_lchown_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(lchown))
printf(ARGS(lchown, "%s, %d, %d"), path, args->uid, args->gid);
#endif
error = kern_lchown(td, path, UIO_SYSSPACE, args->uid, args->gid);
LFREEPATH(path);
return (error);
}
Index: head/sys/compat/linux/linux_ioctl.c
===================================================================
--- head/sys/compat/linux/linux_ioctl.c (revision 225616)
+++ head/sys/compat/linux/linux_ioctl.c (revision 225617)
@@ -1,3531 +1,3531 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "opt_compat.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/cdio.h>
#include <sys/dvdio.h>
#include <sys/conf.h>
#include <sys/disk.h>
#include <sys/consio.h>
#include <sys/ctype.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kbio.h>
#include <sys/kernel.h>
#include <sys/linker_set.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/soundcard.h>
#include <sys/stdint.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <sys/uio.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/resourcevar.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/vnet.h>
#include <dev/usb/usb_ioctl.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_ioctl.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_socket.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_videodev.h>
#include <compat/linux/linux_videodev_compat.h>
#include <compat/linux/linux_videodev2.h>
#include <compat/linux/linux_videodev2_compat.h>
CTASSERT(LINUX_IFNAMSIZ == IFNAMSIZ);
FEATURE(linuxulator_v4l, "V4L ioctl wrapper support in the linuxulator");
FEATURE(linuxulator_v4l2, "V4L2 ioctl wrapper support in the linuxulator");
static linux_ioctl_function_t linux_ioctl_cdrom;
static linux_ioctl_function_t linux_ioctl_vfat;
static linux_ioctl_function_t linux_ioctl_console;
static linux_ioctl_function_t linux_ioctl_hdio;
static linux_ioctl_function_t linux_ioctl_disk;
static linux_ioctl_function_t linux_ioctl_socket;
static linux_ioctl_function_t linux_ioctl_sound;
static linux_ioctl_function_t linux_ioctl_termio;
static linux_ioctl_function_t linux_ioctl_private;
static linux_ioctl_function_t linux_ioctl_drm;
static linux_ioctl_function_t linux_ioctl_sg;
static linux_ioctl_function_t linux_ioctl_v4l;
static linux_ioctl_function_t linux_ioctl_v4l2;
static linux_ioctl_function_t linux_ioctl_special;
static linux_ioctl_function_t linux_ioctl_fbsd_usb;
static struct linux_ioctl_handler cdrom_handler =
{ linux_ioctl_cdrom, LINUX_IOCTL_CDROM_MIN, LINUX_IOCTL_CDROM_MAX };
static struct linux_ioctl_handler vfat_handler =
{ linux_ioctl_vfat, LINUX_IOCTL_VFAT_MIN, LINUX_IOCTL_VFAT_MAX };
static struct linux_ioctl_handler console_handler =
{ linux_ioctl_console, LINUX_IOCTL_CONSOLE_MIN, LINUX_IOCTL_CONSOLE_MAX };
static struct linux_ioctl_handler hdio_handler =
{ linux_ioctl_hdio, LINUX_IOCTL_HDIO_MIN, LINUX_IOCTL_HDIO_MAX };
static struct linux_ioctl_handler disk_handler =
{ linux_ioctl_disk, LINUX_IOCTL_DISK_MIN, LINUX_IOCTL_DISK_MAX };
static struct linux_ioctl_handler socket_handler =
{ linux_ioctl_socket, LINUX_IOCTL_SOCKET_MIN, LINUX_IOCTL_SOCKET_MAX };
static struct linux_ioctl_handler sound_handler =
{ linux_ioctl_sound, LINUX_IOCTL_SOUND_MIN, LINUX_IOCTL_SOUND_MAX };
static struct linux_ioctl_handler termio_handler =
{ linux_ioctl_termio, LINUX_IOCTL_TERMIO_MIN, LINUX_IOCTL_TERMIO_MAX };
static struct linux_ioctl_handler private_handler =
{ linux_ioctl_private, LINUX_IOCTL_PRIVATE_MIN, LINUX_IOCTL_PRIVATE_MAX };
static struct linux_ioctl_handler drm_handler =
{ linux_ioctl_drm, LINUX_IOCTL_DRM_MIN, LINUX_IOCTL_DRM_MAX };
static struct linux_ioctl_handler sg_handler =
{ linux_ioctl_sg, LINUX_IOCTL_SG_MIN, LINUX_IOCTL_SG_MAX };
static struct linux_ioctl_handler video_handler =
{ linux_ioctl_v4l, LINUX_IOCTL_VIDEO_MIN, LINUX_IOCTL_VIDEO_MAX };
static struct linux_ioctl_handler video2_handler =
{ linux_ioctl_v4l2, LINUX_IOCTL_VIDEO2_MIN, LINUX_IOCTL_VIDEO2_MAX };
static struct linux_ioctl_handler fbsd_usb =
{ linux_ioctl_fbsd_usb, FBSD_LUSB_MIN, FBSD_LUSB_MAX };
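/*
 * Each handler is registered in the linux_ioctl_handler_set linker set
 * together with the [min, max] range of Linux ioctl commands it
 * serves; incoming ioctls are dispatched to a handler whose range
 * contains the command.
 */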
DATA_SET(linux_ioctl_handler_set, cdrom_handler);
DATA_SET(linux_ioctl_handler_set, vfat_handler);
DATA_SET(linux_ioctl_handler_set, console_handler);
DATA_SET(linux_ioctl_handler_set, hdio_handler);
DATA_SET(linux_ioctl_handler_set, disk_handler);
DATA_SET(linux_ioctl_handler_set, socket_handler);
DATA_SET(linux_ioctl_handler_set, sound_handler);
DATA_SET(linux_ioctl_handler_set, termio_handler);
DATA_SET(linux_ioctl_handler_set, private_handler);
DATA_SET(linux_ioctl_handler_set, drm_handler);
DATA_SET(linux_ioctl_handler_set, sg_handler);
DATA_SET(linux_ioctl_handler_set, video_handler);
DATA_SET(linux_ioctl_handler_set, video2_handler);
DATA_SET(linux_ioctl_handler_set, fbsd_usb);
struct handler_element
{
TAILQ_ENTRY(handler_element) list;
int (*func)(struct thread *, struct linux_ioctl_args *);
int low, high, span;
};
static TAILQ_HEAD(, handler_element) handlers =
TAILQ_HEAD_INITIALIZER(handlers);
static struct sx linux_ioctl_sx;
SX_SYSINIT(linux_ioctl, &linux_ioctl_sx, "linux ioctl handlers");
/*
* hdio related ioctls for VMWare support
*/
struct linux_hd_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int16_t cylinders;
u_int32_t start;
};
struct linux_hd_big_geometry {
u_int8_t heads;
u_int8_t sectors;
u_int32_t cylinders;
u_int32_t start;
};
static int
linux_ioctl_hdio(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize, fwcylinders, fwheads, fwsectors;
off_t mediasize, bytespercyl;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_HDIO_GET_GEO:
case LINUX_HDIO_GET_GEO_BIG:
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWHEADS,
(caddr_t)&fwheads, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGFWSECTORS,
(caddr_t)&fwsectors, td->td_ucred, td);
/*
* XXX: DIOCGFIRSTOFFSET is not yet implemented, so
* pretend that GEOM always says 0. This is NOT VALID
* for slices or partitions, only the per-disk raw devices.
*/
fdrop(fp, td);
if (error)
return (error);
/*
* 1. Calculate the number of bytes in a cylinder,
* given the firmware's notion of heads and sectors
* per cylinder.
* 2. Calculate the number of cylinders, given the total
* size of the media.
* All internal calculations should have 64-bit precision.
*/
bytespercyl = (off_t) sectorsize * fwheads * fwsectors;
fwcylinders = mediasize / bytespercyl;
#if defined(DEBUG)
linux_msg(td, "HDIO_GET_GEO: mediasize %jd, c/h/s %d/%d/%d, "
"bpc %jd",
(intmax_t)mediasize, fwcylinders, fwheads, fwsectors,
(intmax_t)bytespercyl);
#endif
if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO) {
struct linux_hd_geometry hdg;
hdg.cylinders = fwcylinders;
hdg.heads = fwheads;
hdg.sectors = fwsectors;
hdg.start = 0;
error = copyout(&hdg, (void *)args->arg, sizeof(hdg));
} else if ((args->cmd & 0xffff) == LINUX_HDIO_GET_GEO_BIG) {
struct linux_hd_big_geometry hdbg;
hdbg.cylinders = fwcylinders;
hdbg.heads = fwheads;
hdbg.sectors = fwsectors;
hdbg.start = 0;
error = copyout(&hdbg, (void *)args->arg, sizeof(hdbg));
}
return (error);
break;
default:
/* XXX */
linux_msg(td,
"ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented",
args->fd, (int)(args->cmd & 0xffff),
(int)(args->cmd & 0xff00) >> 8,
(int)(args->cmd & 0xff));
break;
}
fdrop(fp, td);
return (ENOIOCTL);
}
static int
linux_ioctl_disk(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
u_int sectorsize;
off_t mediasize;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_BLKGETSIZE:
error = fo_ioctl(fp, DIOCGSECTORSIZE,
(caddr_t)&sectorsize, td->td_ucred, td);
if (!error)
error = fo_ioctl(fp, DIOCGMEDIASIZE,
(caddr_t)&mediasize, td->td_ucred, td);
fdrop(fp, td);
if (error)
return (error);
sectorsize = mediasize / sectorsize;
/*
* XXX: How do we know we return the right size of integer?
*/
return (copyout(&sectorsize, (void *)args->arg,
sizeof(sectorsize)));
break;
}
fdrop(fp, td);
return (ENOIOCTL);
}
/*
* termio related ioctls
*/
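/*
 * linux_termio is the legacy interface with 16-bit flag words and
 * LINUX_NCC control characters; linux_termios is the full interface
 * with 32-bit flags and LINUX_NCCS control characters.  Both are
 * converted by way of the native struct termios.
 */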
struct linux_termio {
unsigned short c_iflag;
unsigned short c_oflag;
unsigned short c_cflag;
unsigned short c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCC];
};
struct linux_termios {
unsigned int c_iflag;
unsigned int c_oflag;
unsigned int c_cflag;
unsigned int c_lflag;
unsigned char c_line;
unsigned char c_cc[LINUX_NCCS];
};
struct linux_winsize {
unsigned short ws_row, ws_col;
unsigned short ws_xpixel, ws_ypixel;
};
struct speedtab {
int sp_speed; /* Speed. */
int sp_code; /* Code. */
};
static struct speedtab sptab[] = {
{ B0, LINUX_B0 }, { B50, LINUX_B50 },
{ B75, LINUX_B75 }, { B110, LINUX_B110 },
{ B134, LINUX_B134 }, { B150, LINUX_B150 },
{ B200, LINUX_B200 }, { B300, LINUX_B300 },
{ B600, LINUX_B600 }, { B1200, LINUX_B1200 },
{ B1800, LINUX_B1800 }, { B2400, LINUX_B2400 },
{ B4800, LINUX_B4800 }, { B9600, LINUX_B9600 },
{ B19200, LINUX_B19200 }, { B38400, LINUX_B38400 },
{ B57600, LINUX_B57600 }, { B115200, LINUX_B115200 },
{-1, -1 }
};
struct linux_serial_struct {
int type;
int line;
int port;
int irq;
int flags;
int xmit_fifo_size;
int custom_divisor;
int baud_base;
unsigned short close_delay;
char reserved_char[2];
int hub6;
unsigned short closing_wait;
unsigned short closing_wait2;
int reserved[4];
};
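/*
 * Translate between Linux baud-rate codes and native speed values via
 * the sptab table above; both lookups stop at the -1 sentinel and
 * return -1 when no match is found.
 */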
static int
linux_to_bsd_speed(int code, struct speedtab *table)
{
for ( ; table->sp_code != -1; table++)
if (table->sp_code == code)
return (table->sp_speed);
return -1;
}
static int
bsd_to_linux_speed(int speed, struct speedtab *table)
{
for ( ; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return -1;
}
static void
bsd_to_linux_termios(struct termios *bios, struct linux_termios *lios)
{
int i;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: BSD termios structure (input):\n");
printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n",
bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag,
bios->c_ispeed, bios->c_ospeed);
printf("c_cc ");
for (i=0; i<NCCS; i++)
printf("%02x ", bios->c_cc[i]);
printf("\n");
}
#endif
lios->c_iflag = 0;
if (bios->c_iflag & IGNBRK)
lios->c_iflag |= LINUX_IGNBRK;
if (bios->c_iflag & BRKINT)
lios->c_iflag |= LINUX_BRKINT;
if (bios->c_iflag & IGNPAR)
lios->c_iflag |= LINUX_IGNPAR;
if (bios->c_iflag & PARMRK)
lios->c_iflag |= LINUX_PARMRK;
if (bios->c_iflag & INPCK)
lios->c_iflag |= LINUX_INPCK;
if (bios->c_iflag & ISTRIP)
lios->c_iflag |= LINUX_ISTRIP;
if (bios->c_iflag & INLCR)
lios->c_iflag |= LINUX_INLCR;
if (bios->c_iflag & IGNCR)
lios->c_iflag |= LINUX_IGNCR;
if (bios->c_iflag & ICRNL)
lios->c_iflag |= LINUX_ICRNL;
if (bios->c_iflag & IXON)
lios->c_iflag |= LINUX_IXON;
if (bios->c_iflag & IXANY)
lios->c_iflag |= LINUX_IXANY;
if (bios->c_iflag & IXOFF)
lios->c_iflag |= LINUX_IXOFF;
if (bios->c_iflag & IMAXBEL)
lios->c_iflag |= LINUX_IMAXBEL;
lios->c_oflag = 0;
if (bios->c_oflag & OPOST)
lios->c_oflag |= LINUX_OPOST;
if (bios->c_oflag & ONLCR)
lios->c_oflag |= LINUX_ONLCR;
if (bios->c_oflag & TAB3)
lios->c_oflag |= LINUX_XTABS;
lios->c_cflag = bsd_to_linux_speed(bios->c_ispeed, sptab);
lios->c_cflag |= (bios->c_cflag & CSIZE) >> 4;
if (bios->c_cflag & CSTOPB)
lios->c_cflag |= LINUX_CSTOPB;
if (bios->c_cflag & CREAD)
lios->c_cflag |= LINUX_CREAD;
if (bios->c_cflag & PARENB)
lios->c_cflag |= LINUX_PARENB;
if (bios->c_cflag & PARODD)
lios->c_cflag |= LINUX_PARODD;
if (bios->c_cflag & HUPCL)
lios->c_cflag |= LINUX_HUPCL;
if (bios->c_cflag & CLOCAL)
lios->c_cflag |= LINUX_CLOCAL;
if (bios->c_cflag & CRTSCTS)
lios->c_cflag |= LINUX_CRTSCTS;
lios->c_lflag = 0;
if (bios->c_lflag & ISIG)
lios->c_lflag |= LINUX_ISIG;
if (bios->c_lflag & ICANON)
lios->c_lflag |= LINUX_ICANON;
if (bios->c_lflag & ECHO)
lios->c_lflag |= LINUX_ECHO;
if (bios->c_lflag & ECHOE)
lios->c_lflag |= LINUX_ECHOE;
if (bios->c_lflag & ECHOK)
lios->c_lflag |= LINUX_ECHOK;
if (bios->c_lflag & ECHONL)
lios->c_lflag |= LINUX_ECHONL;
if (bios->c_lflag & NOFLSH)
lios->c_lflag |= LINUX_NOFLSH;
if (bios->c_lflag & TOSTOP)
lios->c_lflag |= LINUX_TOSTOP;
if (bios->c_lflag & ECHOCTL)
lios->c_lflag |= LINUX_ECHOCTL;
if (bios->c_lflag & ECHOPRT)
lios->c_lflag |= LINUX_ECHOPRT;
if (bios->c_lflag & ECHOKE)
lios->c_lflag |= LINUX_ECHOKE;
if (bios->c_lflag & FLUSHO)
lios->c_lflag |= LINUX_FLUSHO;
if (bios->c_lflag & PENDIN)
lios->c_lflag |= LINUX_PENDIN;
if (bios->c_lflag & IEXTEN)
lios->c_lflag |= LINUX_IEXTEN;
for (i=0; i<LINUX_NCCS; i++)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
lios->c_cc[LINUX_VINTR] = bios->c_cc[VINTR];
lios->c_cc[LINUX_VQUIT] = bios->c_cc[VQUIT];
lios->c_cc[LINUX_VERASE] = bios->c_cc[VERASE];
lios->c_cc[LINUX_VKILL] = bios->c_cc[VKILL];
lios->c_cc[LINUX_VEOF] = bios->c_cc[VEOF];
lios->c_cc[LINUX_VEOL] = bios->c_cc[VEOL];
lios->c_cc[LINUX_VMIN] = bios->c_cc[VMIN];
lios->c_cc[LINUX_VTIME] = bios->c_cc[VTIME];
lios->c_cc[LINUX_VEOL2] = bios->c_cc[VEOL2];
lios->c_cc[LINUX_VSUSP] = bios->c_cc[VSUSP];
lios->c_cc[LINUX_VSTART] = bios->c_cc[VSTART];
lios->c_cc[LINUX_VSTOP] = bios->c_cc[VSTOP];
lios->c_cc[LINUX_VREPRINT] = bios->c_cc[VREPRINT];
lios->c_cc[LINUX_VDISCARD] = bios->c_cc[VDISCARD];
lios->c_cc[LINUX_VWERASE] = bios->c_cc[VWERASE];
lios->c_cc[LINUX_VLNEXT] = bios->c_cc[VLNEXT];
for (i=0; i<LINUX_NCCS; i++) {
if (i != LINUX_VMIN && i != LINUX_VTIME &&
lios->c_cc[i] == _POSIX_VDISABLE)
lios->c_cc[i] = LINUX_POSIX_VDISABLE;
}
lios->c_line = 0;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: LINUX termios structure (output):\n");
printf("i=%08x o=%08x c=%08x l=%08x line=%d\n",
lios->c_iflag, lios->c_oflag, lios->c_cflag,
lios->c_lflag, (int)lios->c_line);
printf("c_cc ");
for (i=0; i<LINUX_NCCS; i++)
printf("%02x ", lios->c_cc[i]);
printf("\n");
}
#endif
}
static void
linux_to_bsd_termios(struct linux_termios *lios, struct termios *bios)
{
int i;
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: LINUX termios structure (input):\n");
printf("i=%08x o=%08x c=%08x l=%08x line=%d\n",
lios->c_iflag, lios->c_oflag, lios->c_cflag,
lios->c_lflag, (int)lios->c_line);
printf("c_cc ");
for (i=0; i<LINUX_NCCS; i++)
printf("%02x ", lios->c_cc[i]);
printf("\n");
}
#endif
bios->c_iflag = 0;
if (lios->c_iflag & LINUX_IGNBRK)
bios->c_iflag |= IGNBRK;
if (lios->c_iflag & LINUX_BRKINT)
bios->c_iflag |= BRKINT;
if (lios->c_iflag & LINUX_IGNPAR)
bios->c_iflag |= IGNPAR;
if (lios->c_iflag & LINUX_PARMRK)
bios->c_iflag |= PARMRK;
if (lios->c_iflag & LINUX_INPCK)
bios->c_iflag |= INPCK;
if (lios->c_iflag & LINUX_ISTRIP)
bios->c_iflag |= ISTRIP;
if (lios->c_iflag & LINUX_INLCR)
bios->c_iflag |= INLCR;
if (lios->c_iflag & LINUX_IGNCR)
bios->c_iflag |= IGNCR;
if (lios->c_iflag & LINUX_ICRNL)
bios->c_iflag |= ICRNL;
if (lios->c_iflag & LINUX_IXON)
bios->c_iflag |= IXON;
if (lios->c_iflag & LINUX_IXANY)
bios->c_iflag |= IXANY;
if (lios->c_iflag & LINUX_IXOFF)
bios->c_iflag |= IXOFF;
if (lios->c_iflag & LINUX_IMAXBEL)
bios->c_iflag |= IMAXBEL;
bios->c_oflag = 0;
if (lios->c_oflag & LINUX_OPOST)
bios->c_oflag |= OPOST;
if (lios->c_oflag & LINUX_ONLCR)
bios->c_oflag |= ONLCR;
if (lios->c_oflag & LINUX_XTABS)
bios->c_oflag |= TAB3;
bios->c_cflag = (lios->c_cflag & LINUX_CSIZE) << 4;
if (lios->c_cflag & LINUX_CSTOPB)
bios->c_cflag |= CSTOPB;
if (lios->c_cflag & LINUX_CREAD)
bios->c_cflag |= CREAD;
if (lios->c_cflag & LINUX_PARENB)
bios->c_cflag |= PARENB;
if (lios->c_cflag & LINUX_PARODD)
bios->c_cflag |= PARODD;
if (lios->c_cflag & LINUX_HUPCL)
bios->c_cflag |= HUPCL;
if (lios->c_cflag & LINUX_CLOCAL)
bios->c_cflag |= CLOCAL;
if (lios->c_cflag & LINUX_CRTSCTS)
bios->c_cflag |= CRTSCTS;
bios->c_lflag = 0;
if (lios->c_lflag & LINUX_ISIG)
bios->c_lflag |= ISIG;
if (lios->c_lflag & LINUX_ICANON)
bios->c_lflag |= ICANON;
if (lios->c_lflag & LINUX_ECHO)
bios->c_lflag |= ECHO;
if (lios->c_lflag & LINUX_ECHOE)
bios->c_lflag |= ECHOE;
if (lios->c_lflag & LINUX_ECHOK)
bios->c_lflag |= ECHOK;
if (lios->c_lflag & LINUX_ECHONL)
bios->c_lflag |= ECHONL;
if (lios->c_lflag & LINUX_NOFLSH)
bios->c_lflag |= NOFLSH;
if (lios->c_lflag & LINUX_TOSTOP)
bios->c_lflag |= TOSTOP;
if (lios->c_lflag & LINUX_ECHOCTL)
bios->c_lflag |= ECHOCTL;
if (lios->c_lflag & LINUX_ECHOPRT)
bios->c_lflag |= ECHOPRT;
if (lios->c_lflag & LINUX_ECHOKE)
bios->c_lflag |= ECHOKE;
if (lios->c_lflag & LINUX_FLUSHO)
bios->c_lflag |= FLUSHO;
if (lios->c_lflag & LINUX_PENDIN)
bios->c_lflag |= PENDIN;
if (lios->c_lflag & LINUX_IEXTEN)
bios->c_lflag |= IEXTEN;
for (i=0; i<NCCS; i++)
bios->c_cc[i] = _POSIX_VDISABLE;
bios->c_cc[VINTR] = lios->c_cc[LINUX_VINTR];
bios->c_cc[VQUIT] = lios->c_cc[LINUX_VQUIT];
bios->c_cc[VERASE] = lios->c_cc[LINUX_VERASE];
bios->c_cc[VKILL] = lios->c_cc[LINUX_VKILL];
bios->c_cc[VEOF] = lios->c_cc[LINUX_VEOF];
bios->c_cc[VEOL] = lios->c_cc[LINUX_VEOL];
bios->c_cc[VMIN] = lios->c_cc[LINUX_VMIN];
bios->c_cc[VTIME] = lios->c_cc[LINUX_VTIME];
bios->c_cc[VEOL2] = lios->c_cc[LINUX_VEOL2];
bios->c_cc[VSUSP] = lios->c_cc[LINUX_VSUSP];
bios->c_cc[VSTART] = lios->c_cc[LINUX_VSTART];
bios->c_cc[VSTOP] = lios->c_cc[LINUX_VSTOP];
bios->c_cc[VREPRINT] = lios->c_cc[LINUX_VREPRINT];
bios->c_cc[VDISCARD] = lios->c_cc[LINUX_VDISCARD];
bios->c_cc[VWERASE] = lios->c_cc[LINUX_VWERASE];
bios->c_cc[VLNEXT] = lios->c_cc[LINUX_VLNEXT];
for (i=0; i<NCCS; i++) {
if (i != VMIN && i != VTIME &&
bios->c_cc[i] == LINUX_POSIX_VDISABLE)
bios->c_cc[i] = _POSIX_VDISABLE;
}
bios->c_ispeed = bios->c_ospeed =
linux_to_bsd_speed(lios->c_cflag & LINUX_CBAUD, sptab);
#ifdef DEBUG
if (ldebug(ioctl)) {
printf("LINUX: BSD termios structure (output):\n");
printf("i=%08x o=%08x c=%08x l=%08x ispeed=%d ospeed=%d\n",
bios->c_iflag, bios->c_oflag, bios->c_cflag, bios->c_lflag,
bios->c_ispeed, bios->c_ospeed);
printf("c_cc ");
for (i=0; i<NCCS; i++)
printf("%02x ", bios->c_cc[i]);
printf("\n");
}
#endif
}
static void
bsd_to_linux_termio(struct termios *bios, struct linux_termio *lio)
{
struct linux_termios lios;
bsd_to_linux_termios(bios, &lios);
lio->c_iflag = lios.c_iflag;
lio->c_oflag = lios.c_oflag;
lio->c_cflag = lios.c_cflag;
lio->c_lflag = lios.c_lflag;
lio->c_line = lios.c_line;
memcpy(lio->c_cc, lios.c_cc, LINUX_NCC);
}
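/*
 * Widen a legacy termio into a full linux_termios, disabling the
 * control characters that termio cannot express, before converting to
 * the native termios.
 */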
static void
linux_to_bsd_termio(struct linux_termio *lio, struct termios *bios)
{
struct linux_termios lios;
int i;
lios.c_iflag = lio->c_iflag;
lios.c_oflag = lio->c_oflag;
lios.c_cflag = lio->c_cflag;
lios.c_lflag = lio->c_lflag;
for (i=LINUX_NCC; i<LINUX_NCCS; i++)
lios.c_cc[i] = LINUX_POSIX_VDISABLE;
memcpy(lios.c_cc, lio->c_cc, LINUX_NCC);
linux_to_bsd_termios(&lios, bios);
}
static int
linux_ioctl_termio(struct thread *td, struct linux_ioctl_args *args)
{
struct termios bios;
struct linux_termios lios;
struct linux_termio lio;
struct file *fp;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_TCGETS:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termios(&bios, &lios);
error = copyout(&lios, (void *)args->arg, sizeof(lios));
break;
case LINUX_TCSETS:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSW:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETSF:
error = copyin((void *)args->arg, &lios, sizeof(lios));
if (error)
break;
linux_to_bsd_termios(&lios, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCGETA:
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios, td->td_ucred,
td);
if (error)
break;
bsd_to_linux_termio(&bios, &lio);
error = (copyout(&lio, (void *)args->arg, sizeof(lio)));
break;
case LINUX_TCSETA:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETA, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAW:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAW, (caddr_t)&bios, td->td_ucred,
td));
break;
case LINUX_TCSETAF:
error = copyin((void *)args->arg, &lio, sizeof(lio));
if (error)
break;
linux_to_bsd_termio(&lio, &bios);
error = (fo_ioctl(fp, TIOCSETAF, (caddr_t)&bios, td->td_ucred,
td));
break;
/* LINUX_TCSBRK */
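/*
 * TCXONC: TCOOFF/TCOON map directly to TIOCSTOP/TIOCSTART, while
 * TCIOFF/TCION are emulated by writing the tty's STOP or START
 * character to the descriptor when one is configured.
 */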
case LINUX_TCXONC: {
switch (args->arg) {
case LINUX_TCOOFF:
args->cmd = TIOCSTOP;
break;
case LINUX_TCOON:
args->cmd = TIOCSTART;
break;
case LINUX_TCIOFF:
case LINUX_TCION: {
int c;
struct write_args wr;
error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bios,
td->td_ucred, td);
if (error)
break;
fdrop(fp, td);
c = (args->arg == LINUX_TCIOFF) ? VSTOP : VSTART;
c = bios.c_cc[c];
if (c != _POSIX_VDISABLE) {
wr.fd = args->fd;
wr.buf = &c;
wr.nbyte = sizeof(c);
- return (write(td, &wr));
+ return (sys_write(td, &wr));
} else
return (0);
}
default:
fdrop(fp, td);
return (EINVAL);
}
args->arg = 0;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_TCFLSH: {
int val;
switch (args->arg) {
case LINUX_TCIFLUSH:
val = FREAD;
break;
case LINUX_TCOFLUSH:
val = FWRITE;
break;
case LINUX_TCIOFLUSH:
val = FREAD | FWRITE;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp,TIOCFLUSH,(caddr_t)&val,td->td_ucred,td));
break;
}
case LINUX_TIOCEXCL:
args->cmd = TIOCEXCL;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNXCL:
args->cmd = TIOCNXCL;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSCTTY:
args->cmd = TIOCSCTTY;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPGRP:
args->cmd = TIOCGPGRP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSPGRP:
args->cmd = TIOCSPGRP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCOUTQ */
/* LINUX_TIOCSTI */
case LINUX_TIOCGWINSZ:
args->cmd = TIOCGWINSZ;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSWINSZ:
args->cmd = TIOCSWINSZ;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMGET:
args->cmd = TIOCMGET;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIS:
args->cmd = TIOCMBIS;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMBIC:
args->cmd = TIOCMBIC;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCMSET:
args->cmd = TIOCMSET;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* TIOCGSOFTCAR */
/* TIOCSSOFTCAR */
case LINUX_FIONREAD: /* LINUX_TIOCINQ */
args->cmd = FIONREAD;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCLINUX */
case LINUX_TIOCCONS:
args->cmd = TIOCCONS;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGSERIAL: {
struct linux_serial_struct lss;
lss.type = LINUX_PORT_16550A;
lss.flags = 0;
lss.close_delay = 0;
error = copyout(&lss, (void *)args->arg, sizeof(lss));
break;
}
case LINUX_TIOCSSERIAL: {
struct linux_serial_struct lss;
error = copyin((void *)args->arg, &lss, sizeof(lss));
if (error)
break;
/* XXX - It really helps to have an implementation that
* does nothing. NOT!
*/
error = 0;
break;
}
case LINUX_TIOCPKT:
args->cmd = TIOCPKT;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIONBIO:
args->cmd = FIONBIO;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCNOTTY:
args->cmd = TIOCNOTTY;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCSETD: {
int line;
switch (args->arg) {
case LINUX_N_TTY:
line = TTYDISC;
break;
case LINUX_N_SLIP:
line = SLIPDISC;
break;
case LINUX_N_PPP:
line = PPPDISC;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, TIOCSETD, (caddr_t)&line, td->td_ucred,
td));
break;
}
case LINUX_TIOCGETD: {
int linux_line;
int bsd_line = TTYDISC;
error = fo_ioctl(fp, TIOCGETD, (caddr_t)&bsd_line,
td->td_ucred, td);
if (error)
return (error);
switch (bsd_line) {
case TTYDISC:
linux_line = LINUX_N_TTY;
break;
case SLIPDISC:
linux_line = LINUX_N_SLIP;
break;
case PPPDISC:
linux_line = LINUX_N_PPP;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (copyout(&linux_line, (void *)args->arg, sizeof(int)));
break;
}
/* LINUX_TCSBRKP */
/* LINUX_TIOCTTYGSTRUCT */
case LINUX_FIONCLEX:
args->cmd = FIONCLEX;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOCLEX:
args->cmd = FIOCLEX;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_FIOASYNC:
args->cmd = FIOASYNC;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_TIOCSERCONFIG */
/* LINUX_TIOCSERGWILD */
/* LINUX_TIOCSERSWILD */
/* LINUX_TIOCGLCKTRMIOS */
/* LINUX_TIOCSLCKTRMIOS */
case LINUX_TIOCSBRK:
args->cmd = TIOCSBRK;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCCBRK:
args->cmd = TIOCCBRK;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_TIOCGPTN: {
int nb;
error = fo_ioctl(fp, TIOCGPTN, (caddr_t)&nb, td->td_ucred, td);
if (!error)
error = copyout(&nb, (void *)args->arg,
sizeof(int));
break;
}
case LINUX_TIOCSPTLCK:
/* Our unlockpt() does nothing. */
error = 0;
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
/*
* CDROM related ioctls
*/
struct linux_cdrom_msf
{
u_char cdmsf_min0;
u_char cdmsf_sec0;
u_char cdmsf_frame0;
u_char cdmsf_min1;
u_char cdmsf_sec1;
u_char cdmsf_frame1;
};
struct linux_cdrom_tochdr
{
u_char cdth_trk0;
u_char cdth_trk1;
};
union linux_cdrom_addr
{
struct {
u_char minute;
u_char second;
u_char frame;
} msf;
int lba;
};
struct linux_cdrom_tocentry
{
u_char cdte_track;
u_char cdte_adr:4;
u_char cdte_ctrl:4;
u_char cdte_format;
union linux_cdrom_addr cdte_addr;
u_char cdte_datamode;
};
struct linux_cdrom_subchnl
{
u_char cdsc_format;
u_char cdsc_audiostatus;
u_char cdsc_adr:4;
u_char cdsc_ctrl:4;
u_char cdsc_trk;
u_char cdsc_ind;
union linux_cdrom_addr cdsc_absaddr;
union linux_cdrom_addr cdsc_reladdr;
};
struct l_cdrom_read_audio {
union linux_cdrom_addr addr;
u_char addr_format;
l_int nframes;
u_char *buf;
};
struct l_dvd_layer {
u_char book_version:4;
u_char book_type:4;
u_char min_rate:4;
u_char disc_size:4;
u_char layer_type:4;
u_char track_path:1;
u_char nlayers:2;
u_char track_density:4;
u_char linear_density:4;
u_char bca:1;
u_int32_t start_sector;
u_int32_t end_sector;
u_int32_t end_sector_l0;
};
struct l_dvd_physical {
u_char type;
u_char layer_num;
struct l_dvd_layer layer[4];
};
struct l_dvd_copyright {
u_char type;
u_char layer_num;
u_char cpst;
u_char rmi;
};
struct l_dvd_disckey {
u_char type;
l_uint agid:2;
u_char value[2048];
};
struct l_dvd_bca {
u_char type;
l_int len;
u_char value[188];
};
struct l_dvd_manufact {
u_char type;
u_char layer_num;
l_int len;
u_char value[2048];
};
typedef union {
u_char type;
struct l_dvd_physical physical;
struct l_dvd_copyright copyright;
struct l_dvd_disckey disckey;
struct l_dvd_bca bca;
struct l_dvd_manufact manufact;
} l_dvd_struct;
typedef u_char l_dvd_key[5];
typedef u_char l_dvd_challenge[10];
struct l_dvd_lu_send_agid {
u_char type;
l_uint agid:2;
};
struct l_dvd_host_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_send_key {
u_char type;
l_uint agid:2;
l_dvd_key key;
};
struct l_dvd_lu_send_challenge {
u_char type;
l_uint agid:2;
l_dvd_challenge chal;
};
struct l_dvd_lu_send_title_key {
u_char type;
l_uint agid:2;
l_dvd_key title_key;
l_int lba;
l_uint cpm:1;
l_uint cp_sec:1;
l_uint cgms:2;
};
struct l_dvd_lu_send_asf {
u_char type;
l_uint agid:2;
l_uint asf:1;
};
struct l_dvd_host_send_rpcstate {
u_char type;
u_char pdrc;
};
struct l_dvd_lu_send_rpcstate {
u_char type:2;
u_char vra:3;
u_char ucca:3;
u_char region_mask;
u_char rpc_scheme;
};
typedef union {
u_char type;
struct l_dvd_lu_send_agid lsa;
struct l_dvd_host_send_challenge hsc;
struct l_dvd_send_key lsk;
struct l_dvd_lu_send_challenge lsc;
struct l_dvd_send_key hsk;
struct l_dvd_lu_send_title_key lstk;
struct l_dvd_lu_send_asf lsasf;
struct l_dvd_host_send_rpcstate hrpcs;
struct l_dvd_lu_send_rpcstate lrpcs;
} l_dvd_authinfo;
static void
bsd_to_linux_msf_lba(u_char af, union msf_lba *bp, union linux_cdrom_addr *lp)
{
if (af == CD_LBA_FORMAT)
lp->lba = bp->lba;
else {
lp->msf.minute = bp->msf.minute;
lp->msf.second = bp->msf.second;
lp->msf.frame = bp->msf.frame;
}
}
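/*
 * Convert a logical block address into the Linux CD-ROM address union.
 * MSF addressing counts 75 frames per second, and the conventional
 * 2-second lead-in offset is added before splitting into minutes and
 * seconds.
 */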
static void
set_linux_cdrom_addr(union linux_cdrom_addr *addr, int format, int lba)
{
if (format == LINUX_CDROM_MSF) {
addr->msf.frame = lba % 75;
lba /= 75;
lba += 2;
addr->msf.second = lba % 60;
addr->msf.minute = lba / 60;
} else
addr->lba = lba;
}
static int
linux_to_bsd_dvd_struct(l_dvd_struct *lp, struct dvd_struct *bp)
{
bp->format = lp->type;
switch (bp->format) {
case DVD_STRUCT_PHYSICAL:
if (bp->layer_num >= 4)
return (EINVAL);
bp->layer_num = lp->physical.layer_num;
break;
case DVD_STRUCT_COPYRIGHT:
bp->layer_num = lp->copyright.layer_num;
break;
case DVD_STRUCT_DISCKEY:
bp->agid = lp->disckey.agid;
break;
case DVD_STRUCT_BCA:
case DVD_STRUCT_MANUFACT:
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_struct(struct dvd_struct *bp, l_dvd_struct *lp)
{
switch (bp->format) {
case DVD_STRUCT_PHYSICAL: {
struct dvd_layer *blp = (struct dvd_layer *)bp->data;
struct l_dvd_layer *llp = &lp->physical.layer[bp->layer_num];
memset(llp, 0, sizeof(*llp));
llp->book_version = blp->book_version;
llp->book_type = blp->book_type;
llp->min_rate = blp->max_rate;
llp->disc_size = blp->disc_size;
llp->layer_type = blp->layer_type;
llp->track_path = blp->track_path;
llp->nlayers = blp->nlayers;
llp->track_density = blp->track_density;
llp->linear_density = blp->linear_density;
llp->bca = blp->bca;
llp->start_sector = blp->start_sector;
llp->end_sector = blp->end_sector;
llp->end_sector_l0 = blp->end_sector_l0;
break;
}
case DVD_STRUCT_COPYRIGHT:
lp->copyright.cpst = bp->cpst;
lp->copyright.rmi = bp->rmi;
break;
case DVD_STRUCT_DISCKEY:
memcpy(lp->disckey.value, bp->data, sizeof(lp->disckey.value));
break;
case DVD_STRUCT_BCA:
lp->bca.len = bp->length;
memcpy(lp->bca.value, bp->data, sizeof(lp->bca.value));
break;
case DVD_STRUCT_MANUFACT:
lp->manufact.len = bp->length;
memcpy(lp->manufact.value, bp->data,
sizeof(lp->manufact.value));
/* lp->manufact.layer_num is unused in linux (redhat 7.0) */
break;
default:
return (EINVAL);
}
return (0);
}
static int
linux_to_bsd_dvd_authinfo(l_dvd_authinfo *lp, int *bcode,
struct dvd_authinfo *bp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_CHALLENGE;
bp->agid = lp->hsc.agid;
memcpy(bp->keychal, lp->hsc.chal, 10);
break;
case LINUX_DVD_LU_SEND_KEY1:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_KEY1;
bp->agid = lp->lsk.agid;
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_CHALLENGE;
bp->agid = lp->lsc.agid;
break;
case LINUX_DVD_HOST_SEND_KEY2:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_KEY2;
bp->agid = lp->hsk.agid;
memcpy(bp->keychal, lp->hsk.key, 5);
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_TITLE_KEY;
bp->agid = lp->lstk.agid;
bp->lba = lp->lstk.lba;
break;
case LINUX_DVD_LU_SEND_ASF:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_ASF;
bp->agid = lp->lsasf.agid;
break;
case LINUX_DVD_INVALIDATE_AGID:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_INVALIDATE_AGID;
bp->agid = lp->lsa.agid;
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
*bcode = DVDIOCREPORTKEY;
bp->format = DVD_REPORT_RPC;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
*bcode = DVDIOCSENDKEY;
bp->format = DVD_SEND_RPC;
bp->region = lp->hrpcs.pdrc;
break;
default:
return (EINVAL);
}
return (0);
}
static int
bsd_to_linux_dvd_authinfo(struct dvd_authinfo *bp, l_dvd_authinfo *lp)
{
switch (lp->type) {
case LINUX_DVD_LU_SEND_AGID:
lp->lsa.agid = bp->agid;
break;
case LINUX_DVD_HOST_SEND_CHALLENGE:
lp->type = LINUX_DVD_LU_SEND_KEY1;
break;
case LINUX_DVD_LU_SEND_KEY1:
memcpy(lp->lsk.key, bp->keychal, sizeof(lp->lsk.key));
break;
case LINUX_DVD_LU_SEND_CHALLENGE:
memcpy(lp->lsc.chal, bp->keychal, sizeof(lp->lsc.chal));
break;
case LINUX_DVD_HOST_SEND_KEY2:
lp->type = LINUX_DVD_AUTH_ESTABLISHED;
break;
case LINUX_DVD_LU_SEND_TITLE_KEY:
memcpy(lp->lstk.title_key, bp->keychal,
sizeof(lp->lstk.title_key));
lp->lstk.cpm = bp->cpm;
lp->lstk.cp_sec = bp->cp_sec;
lp->lstk.cgms = bp->cgms;
break;
case LINUX_DVD_LU_SEND_ASF:
lp->lsasf.asf = bp->asf;
break;
case LINUX_DVD_INVALIDATE_AGID:
break;
case LINUX_DVD_LU_SEND_RPC_STATE:
lp->lrpcs.type = bp->reg_type;
lp->lrpcs.vra = bp->vend_rsts;
lp->lrpcs.ucca = bp->user_rsts;
lp->lrpcs.region_mask = bp->region;
lp->lrpcs.rpc_scheme = bp->rpc_scheme;
break;
case LINUX_DVD_HOST_SEND_RPC_STATE:
break;
default:
return (EINVAL);
}
return (0);
}
static int
linux_ioctl_cdrom(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_CDROMPAUSE:
args->cmd = CDIOCPAUSE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMRESUME:
args->cmd = CDIOCRESUME;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYMSF:
args->cmd = CDIOCPLAYMSF;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMPLAYTRKIND:
args->cmd = CDIOCPLAYTRACKS;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMREADTOCHDR: {
struct ioc_toc_header th;
struct linux_cdrom_tochdr lth;
error = fo_ioctl(fp, CDIOREADTOCHEADER, (caddr_t)&th,
td->td_ucred, td);
if (!error) {
lth.cdth_trk0 = th.starting_track;
lth.cdth_trk1 = th.ending_track;
copyout(&lth, (void *)args->arg, sizeof(lth));
}
break;
}
case LINUX_CDROMREADTOCENTRY: {
struct linux_cdrom_tocentry lte;
struct ioc_read_toc_single_entry irtse;
error = copyin((void *)args->arg, &lte, sizeof(lte));
if (error)
break;
irtse.address_format = lte.cdte_format;
irtse.track = lte.cdte_track;
error = fo_ioctl(fp, CDIOREADTOCENTRY, (caddr_t)&irtse,
td->td_ucred, td);
if (!error) {
lte.cdte_ctrl = irtse.entry.control;
lte.cdte_adr = irtse.entry.addr_type;
bsd_to_linux_msf_lba(irtse.address_format,
&irtse.entry.addr, &lte.cdte_addr);
error = copyout(&lte, (void *)args->arg, sizeof(lte));
}
break;
}
case LINUX_CDROMSTOP:
args->cmd = CDIOCSTOP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMSTART:
args->cmd = CDIOCSTART;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_CDROMEJECT:
args->cmd = CDIOCEJECT;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLCTRL */
case LINUX_CDROMSUBCHNL: {
struct linux_cdrom_subchnl sc;
struct ioc_read_subchannel bsdsc;
struct cd_sub_channel_info bsdinfo;
bsdsc.address_format = CD_LBA_FORMAT;
bsdsc.data_format = CD_CURRENT_POSITION;
bsdsc.track = 0;
bsdsc.data_len = sizeof(bsdinfo);
bsdsc.data = &bsdinfo;
error = fo_ioctl(fp, CDIOCREADSUBCHANNEL_SYSSPACE,
(caddr_t)&bsdsc, td->td_ucred, td);
if (error)
break;
error = copyin((void *)args->arg, &sc, sizeof(sc));
if (error)
break;
sc.cdsc_audiostatus = bsdinfo.header.audio_status;
sc.cdsc_adr = bsdinfo.what.position.addr_type;
sc.cdsc_ctrl = bsdinfo.what.position.control;
sc.cdsc_trk = bsdinfo.what.position.track_number;
sc.cdsc_ind = bsdinfo.what.position.index_number;
set_linux_cdrom_addr(&sc.cdsc_absaddr, sc.cdsc_format,
bsdinfo.what.position.absaddr.lba);
set_linux_cdrom_addr(&sc.cdsc_reladdr, sc.cdsc_format,
bsdinfo.what.position.reladdr.lba);
error = copyout(&sc, (void *)args->arg, sizeof(sc));
break;
}
/* LINUX_CDROMREADMODE2 */
/* LINUX_CDROMREADMODE1 */
/* LINUX_CDROMREADAUDIO */
/* LINUX_CDROMEJECT_SW */
/* LINUX_CDROMMULTISESSION */
/* LINUX_CDROM_GET_UPC */
case LINUX_CDROMRESET:
args->cmd = CDIOCRESET;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
/* LINUX_CDROMVOLREAD */
/* LINUX_CDROMREADRAW */
/* LINUX_CDROMREADCOOKED */
/* LINUX_CDROMSEEK */
/* LINUX_CDROMPLAYBLK */
/* LINUX_CDROMREADALL */
/* LINUX_CDROMCLOSETRAY */
/* LINUX_CDROMLOADFROMSLOT */
/* LINUX_CDROMGETSPINDOWN */
/* LINUX_CDROMSETSPINDOWN */
/* LINUX_CDROM_SET_OPTIONS */
/* LINUX_CDROM_CLEAR_OPTIONS */
/* LINUX_CDROM_SELECT_SPEED */
/* LINUX_CDROM_SELECT_DISC */
/* LINUX_CDROM_MEDIA_CHANGED */
/* LINUX_CDROM_DRIVE_STATUS */
/* LINUX_CDROM_DISC_STATUS */
/* LINUX_CDROM_CHANGER_NSLOTS */
/* LINUX_CDROM_LOCKDOOR */
/* LINUX_CDROM_DEBUG */
/* LINUX_CDROM_GET_CAPABILITY */
/* LINUX_CDROMAUDIOBUFSIZ */
case LINUX_DVD_READ_STRUCT: {
l_dvd_struct *lds;
struct dvd_struct *bds;
lds = malloc(sizeof(*lds), M_LINUX, M_WAITOK);
bds = malloc(sizeof(*bds), M_LINUX, M_WAITOK);
error = copyin((void *)args->arg, lds, sizeof(*lds));
if (error)
goto out;
error = linux_to_bsd_dvd_struct(lds, bds);
if (error)
goto out;
error = fo_ioctl(fp, DVDIOCREADSTRUCTURE, (caddr_t)bds,
td->td_ucred, td);
if (error)
goto out;
error = bsd_to_linux_dvd_struct(bds, lds);
if (error)
goto out;
error = copyout(lds, (void *)args->arg, sizeof(*lds));
out:
free(bds, M_LINUX);
free(lds, M_LINUX);
break;
}
/* LINUX_DVD_WRITE_STRUCT */
case LINUX_DVD_AUTH: {
l_dvd_authinfo lda;
struct dvd_authinfo bda;
int bcode;
error = copyin((void *)args->arg, &lda, sizeof(lda));
if (error)
break;
error = linux_to_bsd_dvd_authinfo(&lda, &bcode, &bda);
if (error)
break;
error = fo_ioctl(fp, bcode, (caddr_t)&bda, td->td_ucred,
td);
if (error) {
if (lda.type == LINUX_DVD_HOST_SEND_KEY2) {
lda.type = LINUX_DVD_AUTH_FAILURE;
copyout(&lda, (void *)args->arg, sizeof(lda));
}
break;
}
error = bsd_to_linux_dvd_authinfo(&bda, &lda);
if (error)
break;
error = copyout(&lda, (void *)args->arg, sizeof(lda));
break;
}
case LINUX_SCSI_GET_BUS_NUMBER:
case LINUX_SCSI_GET_IDLUN:
error = linux_ioctl_sg(td, args);
break;
/* LINUX_CDROM_SEND_PACKET */
/* LINUX_CDROM_NEXT_WRITABLE */
/* LINUX_CDROM_LAST_WRITTEN */
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
static int
linux_ioctl_vfat(struct thread *td, struct linux_ioctl_args *args)
{
return (ENOTTY);
}
/*
* Sound related ioctls
*/
struct linux_mixer_info {
char id[16];
char name[32];
int modify_counter;
int fillers[10];
};
struct linux_old_mixer_info {
char id[16];
char name[32];
};
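/*
* Linux encodes the ioctl transfer direction in the top two bits of the
* command word; SETDIR() replaces the direction bits of the native command
* with the ones the Linux caller supplied in args->cmd.
*/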
static u_int32_t dirbits[4] = { IOC_VOID, IOC_IN, IOC_OUT, IOC_INOUT };
#define SETDIR(c) (((c) & ~IOC_DIRMASK) | dirbits[args->cmd >> 30])
static int
linux_ioctl_sound(struct thread *td, struct linux_ioctl_args *args)
{
switch (args->cmd & 0xffff) {
case LINUX_SOUND_MIXER_WRITE_VOLUME:
args->cmd = SETDIR(SOUND_MIXER_WRITE_VOLUME);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_BASS:
args->cmd = SETDIR(SOUND_MIXER_WRITE_BASS);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_TREBLE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_TREBLE);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SYNTH:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SYNTH);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_PCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_PCM);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_SPEAKER:
args->cmd = SETDIR(SOUND_MIXER_WRITE_SPEAKER);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_MIC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_MIC);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_CD:
args->cmd = SETDIR(SOUND_MIXER_WRITE_CD);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IMIX:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IMIX);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_ALTPCM:
args->cmd = SETDIR(SOUND_MIXER_WRITE_ALTPCM);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECLEV:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECLEV);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_IGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_IGAIN);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_OGAIN:
args->cmd = SETDIR(SOUND_MIXER_WRITE_OGAIN);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE1:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE1);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE2:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE2);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_LINE3:
args->cmd = SETDIR(SOUND_MIXER_WRITE_LINE3);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_INFO: {
/* Key on encoded length */
switch ((args->cmd >> 16) & 0x1fff) {
case 0x005c: { /* SOUND_MIXER_INFO */
struct linux_mixer_info info;
bzero(&info, sizeof(info));
strncpy(info.id, "OSS", sizeof(info.id) - 1);
strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1);
copyout(&info, (void *)args->arg, sizeof(info));
return (0);
}
case 0x0030: { /* SOUND_OLD_MIXER_INFO */
struct linux_old_mixer_info info;
bzero(&info, sizeof(info));
strncpy(info.id, "OSS", sizeof(info.id) - 1);
strncpy(info.name, "FreeBSD OSS Mixer", sizeof(info.name) - 1);
copyout(&info, (void *)args->arg, sizeof(info));
return (0);
}
default:
return (ENOIOCTL);
}
break;
}
case LINUX_OSS_GETVERSION: {
int version = linux_get_oss_version(td);
return (copyout(&version, (void *)args->arg, sizeof(int)));
}
case LINUX_SOUND_MIXER_READ_STEREODEVS:
args->cmd = SOUND_MIXER_READ_STEREODEVS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_CAPS:
args->cmd = SOUND_MIXER_READ_CAPS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_RECMASK:
args->cmd = SOUND_MIXER_READ_RECMASK;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_READ_DEVMASK:
args->cmd = SOUND_MIXER_READ_DEVMASK;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_MIXER_WRITE_RECSRC:
args->cmd = SETDIR(SOUND_MIXER_WRITE_RECSRC);
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_RESET:
args->cmd = SNDCTL_DSP_RESET;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SYNC:
args->cmd = SNDCTL_DSP_SYNC;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SPEED:
args->cmd = SNDCTL_DSP_SPEED;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_STEREO:
args->cmd = SNDCTL_DSP_STEREO;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETBLKSIZE: /* LINUX_SNDCTL_DSP_SETBLKSIZE */
args->cmd = SNDCTL_DSP_GETBLKSIZE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFMT:
args->cmd = SNDCTL_DSP_SETFMT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_CHANNELS:
args->cmd = SOUND_PCM_WRITE_CHANNELS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SOUND_PCM_WRITE_FILTER:
args->cmd = SOUND_PCM_WRITE_FILTER;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_POST:
args->cmd = SNDCTL_DSP_POST;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SUBDIVIDE:
args->cmd = SNDCTL_DSP_SUBDIVIDE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETFRAGMENT:
args->cmd = SNDCTL_DSP_SETFRAGMENT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETFMTS:
args->cmd = SNDCTL_DSP_GETFMTS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOSPACE:
args->cmd = SNDCTL_DSP_GETOSPACE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETISPACE:
args->cmd = SNDCTL_DSP_GETISPACE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_NONBLOCK:
args->cmd = SNDCTL_DSP_NONBLOCK;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETCAPS:
args->cmd = SNDCTL_DSP_GETCAPS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETTRIGGER: /* LINUX_SNDCTL_GETTRIGGER */
args->cmd = SNDCTL_DSP_SETTRIGGER;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETIPTR:
args->cmd = SNDCTL_DSP_GETIPTR;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETOPTR:
args->cmd = SNDCTL_DSP_GETOPTR;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_SETDUPLEX:
args->cmd = SNDCTL_DSP_SETDUPLEX;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_DSP_GETODELAY:
args->cmd = SNDCTL_DSP_GETODELAY;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESET:
args->cmd = SNDCTL_SEQ_RESET;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_SYNC:
args->cmd = SNDCTL_SEQ_SYNC;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_INFO:
args->cmd = SNDCTL_SYNTH_INFO;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_CTRLRATE:
args->cmd = SNDCTL_SEQ_CTRLRATE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETOUTCOUNT:
args->cmd = SNDCTL_SEQ_GETOUTCOUNT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_GETINCOUNT:
args->cmd = SNDCTL_SEQ_GETINCOUNT;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_PERCMODE:
args->cmd = SNDCTL_SEQ_PERCMODE;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_FM_LOAD_INSTR:
args->cmd = SNDCTL_FM_LOAD_INSTR;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TESTMIDI:
args->cmd = SNDCTL_SEQ_TESTMIDI;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_RESETSAMPLES:
args->cmd = SNDCTL_SEQ_RESETSAMPLES;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRSYNTHS:
args->cmd = SNDCTL_SEQ_NRSYNTHS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_NRMIDIS:
args->cmd = SNDCTL_SEQ_NRMIDIS;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_MIDI_INFO:
args->cmd = SNDCTL_MIDI_INFO;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SEQ_TRESHOLD:
args->cmd = SNDCTL_SEQ_TRESHOLD;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
case LINUX_SNDCTL_SYNTH_MEMAVL:
args->cmd = SNDCTL_SYNTH_MEMAVL;
- return (ioctl(td, (struct ioctl_args *)args));
+ return (sys_ioctl(td, (struct ioctl_args *)args));
}
return (ENOIOCTL);
}
/*
* Console related ioctls
*/
#define ISSIGVALID(sig) ((sig) > 0 && (sig) < NSIG)
static int
linux_ioctl_console(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
switch (args->cmd & 0xffff) {
case LINUX_KIOCSOUND:
args->cmd = KIOCSOUND;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDMKTONE:
args->cmd = KDMKTONE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETLED:
args->cmd = KDGETLED;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETLED:
args->cmd = KDSETLED;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSETMODE:
args->cmd = KDSETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGETMODE:
args->cmd = KDGETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDGKBMODE:
args->cmd = KDGKBMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_KDSKBMODE: {
int kbdmode;
switch (args->arg) {
case LINUX_KBD_RAW:
kbdmode = K_RAW;
break;
case LINUX_KBD_XLATE:
kbdmode = K_XLATE;
break;
case LINUX_KBD_MEDIUMRAW:
kbdmode = K_RAW;
break;
default:
fdrop(fp, td);
return (EINVAL);
}
error = (fo_ioctl(fp, KDSKBMODE, (caddr_t)&kbdmode,
td->td_ucred, td));
break;
}
case LINUX_VT_OPENQRY:
args->cmd = VT_OPENQRY;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_GETMODE:
args->cmd = VT_GETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_SETMODE: {
struct vt_mode mode;
if ((error = copyin((void *)args->arg, &mode, sizeof(mode))))
break;
if (!ISSIGVALID(mode.frsig) && ISSIGVALID(mode.acqsig))
mode.frsig = mode.acqsig;
if ((error = copyout(&mode, (void *)args->arg, sizeof(mode))))
break;
args->cmd = VT_SETMODE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
}
case LINUX_VT_GETSTATE:
args->cmd = VT_GETACTIVE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_RELDISP:
args->cmd = VT_RELDISP;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_ACTIVATE:
args->cmd = VT_ACTIVATE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
case LINUX_VT_WAITACTIVE:
args->cmd = VT_WAITACTIVE;
- error = (ioctl(td, (struct ioctl_args *)args));
+ error = (sys_ioctl(td, (struct ioctl_args *)args));
break;
default:
error = ENOIOCTL;
break;
}
fdrop(fp, td);
return (error);
}
/*
* Criteria for interface name translation
*/
#define IFP_IS_ETH(ifp) (ifp->if_type == IFT_ETHER)
/*
* Interface function used by linprocfs (at the time of writing). It's not
* used by the Linuxulator itself.
*/
int
linux_ifname(struct ifnet *ifp, char *buffer, size_t buflen)
{
struct ifnet *ifscan;
int ethno;
IFNET_RLOCK_ASSERT();
/* Short-circuit non ethernet interfaces */
if (!IFP_IS_ETH(ifp))
return (strlcpy(buffer, ifp->if_xname, buflen));
/* Determine the (relative) unit number for ethernet interfaces */
ethno = 0;
TAILQ_FOREACH(ifscan, &V_ifnet, if_link) {
if (ifscan == ifp)
return (snprintf(buffer, buflen, "eth%d", ethno));
if (IFP_IS_ETH(ifscan))
ethno++;
}
return (0);
}
/*
* Translate a Linux interface name to a FreeBSD interface name,
* and return the associated ifnet structure.
* bsdname and lxname need to be at least IFNAMSIZ bytes long, but
* can point to the same buffer.
*/
static struct ifnet *
ifname_linux_to_bsd(struct thread *td, const char *lxname, char *bsdname)
{
struct ifnet *ifp;
int len, unit;
char *ep;
int is_eth, index;
for (len = 0; len < LINUX_IFNAMSIZ; ++len)
if (!isalpha(lxname[len]))
break;
if (len == 0 || len == LINUX_IFNAMSIZ)
return (NULL);
unit = (int)strtoul(lxname + len, &ep, 10);
if (ep == NULL || ep == lxname + len || ep >= lxname + LINUX_IFNAMSIZ)
return (NULL);
index = 0;
is_eth = (len == 3 && !strncmp(lxname, "eth", len)) ? 1 : 0;
CURVNET_SET(TD_TO_VNET(td));
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
/*
* Allow Linux programs to use FreeBSD names. Don't presume
* we never have an interface named "eth", so don't make
* the test optional based on is_eth.
*/
if (strncmp(ifp->if_xname, lxname, LINUX_IFNAMSIZ) == 0)
break;
if (is_eth && IFP_IS_ETH(ifp) && unit == index++)
break;
}
IFNET_RUNLOCK();
CURVNET_RESTORE();
if (ifp != NULL)
strlcpy(bsdname, ifp->if_xname, IFNAMSIZ);
return (ifp);
}
/*
* Implement the SIOCGIFCONF ioctl
*/
static int
linux_ifconf(struct thread *td, struct ifconf *uifc)
{
#ifdef COMPAT_LINUX32
struct l_ifconf ifc;
#else
struct ifconf ifc;
#endif
struct l_ifreq ifr;
struct ifnet *ifp;
struct ifaddr *ifa;
struct sbuf *sb;
int error, ethno, full = 0, valid_len, max_len;
error = copyin(uifc, &ifc, sizeof(ifc));
if (error != 0)
return (error);
max_len = MAXPHYS - 1;
CURVNET_SET(TD_TO_VNET(td));
/* handle the 'request buffer size' case */
if (ifc.ifc_buf == PTROUT(NULL)) {
ifc.ifc_len = 0;
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET)
ifc.ifc_len += sizeof(ifr);
}
}
IFNET_RUNLOCK();
error = copyout(&ifc, uifc, sizeof(ifc));
CURVNET_RESTORE();
return (error);
}
if (ifc.ifc_len <= 0) {
CURVNET_RESTORE();
return (EINVAL);
}
again:
/* Keep track of eth interfaces */
ethno = 0;
if (ifc.ifc_len <= max_len) {
max_len = ifc.ifc_len;
full = 1;
}
sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
max_len = 0;
valid_len = 0;
/* Return all AF_INET addresses of all interfaces */
IFNET_RLOCK();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
int addrs = 0;
bzero(&ifr, sizeof(ifr));
if (IFP_IS_ETH(ifp))
snprintf(ifr.ifr_name, LINUX_IFNAMSIZ, "eth%d",
ethno++);
else
strlcpy(ifr.ifr_name, ifp->if_xname, LINUX_IFNAMSIZ);
/* Walk the address list */
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
struct sockaddr *sa = ifa->ifa_addr;
if (sa->sa_family == AF_INET) {
ifr.ifr_addr.sa_family = LINUX_AF_INET;
memcpy(ifr.ifr_addr.sa_data, sa->sa_data,
sizeof(ifr.ifr_addr.sa_data));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
addrs++;
}
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
if (addrs == 0) {
bzero((caddr_t)&ifr.ifr_addr, sizeof(ifr.ifr_addr));
sbuf_bcat(sb, &ifr, sizeof(ifr));
max_len += sizeof(ifr);
if (sbuf_error(sb) == 0)
valid_len = sbuf_len(sb);
}
}
IFNET_RUNLOCK();
if (valid_len != max_len && !full) {
sbuf_delete(sb);
goto again;
}
ifc.ifc_len = valid_len;
sbuf_finish(sb);
memcpy(PTRIN(ifc.ifc_buf), sbuf_data(sb), ifc.ifc_len);
error = copyout(&ifc, uifc, sizeof(ifc));
sbuf_delete(sb);
CURVNET_RESTORE();
return (error);
}
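/*
* Implement the SIOCGIFFLAGS ioctl: merge the interface and driver flags,
* drop the flags that have no Linux equivalent and move IFF_MULTICAST to
* bit 0x1000, where Linux keeps it.
*/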
static int
linux_gifflags(struct thread *td, struct ifnet *ifp, struct l_ifreq *ifr)
{
l_short flags;
flags = (ifp->if_flags | ifp->if_drv_flags) & 0xffff;
/* these flags have no Linux equivalent */
flags &= ~(IFF_SMART|IFF_DRV_OACTIVE|IFF_SIMPLEX|
IFF_LINK0|IFF_LINK1|IFF_LINK2);
/* Linux' multicast flag is in a different bit */
if (flags & IFF_MULTICAST) {
flags &= ~IFF_MULTICAST;
flags |= 0x1000;
}
return (copyout(&flags, &ifr->ifr_flags, sizeof(flags)));
}
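/*
* Linux ARP hardware types, reported in the sa_family field of the
* address returned for SIOCGIFHWADDR.
*/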
#define ARPHRD_ETHER 1
#define ARPHRD_LOOPBACK 772
static int
linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr)
{
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
struct l_sockaddr lsa;
if (ifp->if_type == IFT_LOOP) {
bzero(&lsa, sizeof(lsa));
lsa.sa_family = ARPHRD_LOOPBACK;
return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa)));
}
if (ifp->if_type != IFT_ETHER)
return (ENOENT);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sdl = (struct sockaddr_dl*)ifa->ifa_addr;
if (sdl != NULL && (sdl->sdl_family == AF_LINK) &&
(sdl->sdl_type == IFT_ETHER)) {
bzero(&lsa, sizeof(lsa));
lsa.sa_family = ARPHRD_ETHER;
bcopy(LLADDR(sdl), lsa.sa_data, LINUX_IFHWADDRLEN);
return (copyout(&lsa, &ifr->ifr_hwaddr, sizeof(lsa)));
}
}
return (ENOENT);
}
/*
* If we fault in bsd_to_linux_ifreq() then we will fault when we call
* the native ioctl(). Thus, we don't really need to check the return
* value of this function.
*/
static int
bsd_to_linux_ifreq(struct ifreq *arg)
{
struct ifreq ifr;
size_t ifr_len = sizeof(struct ifreq);
int error;
if ((error = copyin(arg, &ifr, ifr_len)))
return (error);
*(u_short *)&ifr.ifr_addr = ifr.ifr_addr.sa_family;
error = copyout(&ifr, arg, ifr_len);
return (error);
}
/*
* Socket related ioctls
*/
static int
linux_ioctl_socket(struct thread *td, struct linux_ioctl_args *args)
{
char lifname[LINUX_IFNAMSIZ], ifname[IFNAMSIZ];
struct ifnet *ifp;
struct file *fp;
int error, type;
ifp = NULL;
error = 0;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type != DTYPE_SOCKET) {
/* not a socket - probably a tap / vmnet device */
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFFLAGS:
return (linux_ioctl_special(td, args));
default:
return (ENOIOCTL);
}
}
switch (args->cmd & 0xffff) {
case LINUX_FIOGETOWN:
case LINUX_FIOSETOWN:
case LINUX_SIOCADDMULTI:
case LINUX_SIOCATMARK:
case LINUX_SIOCDELMULTI:
case LINUX_SIOCGIFCONF:
case LINUX_SIOCGPGRP:
case LINUX_SIOCSPGRP:
case LINUX_SIOCGIFCOUNT:
/* these ioctls don't take an interface name */
#ifdef DEBUG
printf("%s(): ioctl %d\n", __func__,
args->cmd & 0xffff);
#endif
break;
case LINUX_SIOCGIFFLAGS:
case LINUX_SIOCGIFADDR:
case LINUX_SIOCSIFADDR:
case LINUX_SIOCGIFDSTADDR:
case LINUX_SIOCGIFBRDADDR:
case LINUX_SIOCGIFNETMASK:
case LINUX_SIOCSIFNETMASK:
case LINUX_SIOCGIFMTU:
case LINUX_SIOCSIFMTU:
case LINUX_SIOCSIFNAME:
case LINUX_SIOCGIFHWADDR:
case LINUX_SIOCSIFHWADDR:
case LINUX_SIOCDEVPRIVATE:
case LINUX_SIOCDEVPRIVATE+1:
case LINUX_SIOCGIFINDEX:
/* copy in the interface name and translate it. */
error = copyin((void *)args->arg, lifname, LINUX_IFNAMSIZ);
if (error != 0)
return (error);
#ifdef DEBUG
printf("%s(): ioctl %d on %.*s\n", __func__,
args->cmd & 0xffff, LINUX_IFNAMSIZ, lifname);
#endif
ifp = ifname_linux_to_bsd(td, lifname, ifname);
if (ifp == NULL)
return (EINVAL);
/*
* We need to copy it back out in case we pass the
* request on to our native ioctl(), which will expect
* the ifreq to be in user space and have the correct
* interface name.
*/
error = copyout(ifname, (void *)args->arg, IFNAMSIZ);
if (error != 0)
return (error);
#ifdef DEBUG
printf("%s(): %s translated to %s\n", __func__,
lifname, ifname);
#endif
break;
default:
return (ENOIOCTL);
}
switch (args->cmd & 0xffff) {
case LINUX_FIOSETOWN:
args->cmd = FIOSETOWN;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSPGRP:
args->cmd = SIOCSPGRP;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_FIOGETOWN:
args->cmd = FIOGETOWN;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGPGRP:
args->cmd = SIOCGPGRP;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCATMARK:
args->cmd = SIOCATMARK;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
/* LINUX_SIOCGSTAMP */
case LINUX_SIOCGIFCONF:
error = linux_ifconf(td, (struct ifconf *)args->arg);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
error = linux_gifflags(td, ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFADDR:
/* XXX probably doesn't work, included for completeness */
args->cmd = SIOCSIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFDSTADDR:
args->cmd = SIOCGIFDSTADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFBRDADDR:
args->cmd = SIOCGIFBRDADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCGIFNETMASK:
args->cmd = SIOCGIFNETMASK;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
bsd_to_linux_ifreq((struct ifreq *)args->arg);
break;
case LINUX_SIOCSIFNETMASK:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFMTU:
args->cmd = SIOCGIFMTU;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFMTU:
args->cmd = SIOCSIFMTU;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFNAME:
error = ENOIOCTL;
break;
case LINUX_SIOCGIFHWADDR:
error = linux_gifhwaddr(ifp, (struct l_ifreq *)args->arg);
break;
case LINUX_SIOCSIFHWADDR:
error = ENOIOCTL;
break;
case LINUX_SIOCADDMULTI:
args->cmd = SIOCADDMULTI;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDELMULTI:
args->cmd = SIOCDELMULTI;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFINDEX:
args->cmd = SIOCGIFINDEX;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFCOUNT:
error = 0;
break;
/*
* XXX This is slightly bogus, but these ioctls are currently
* XXX only used by the aironet (if_an) network driver.
*/
case LINUX_SIOCDEVPRIVATE:
args->cmd = SIOCGPRIVATE_0;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCDEVPRIVATE+1:
args->cmd = SIOCGPRIVATE_1;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
}
if (ifp != NULL)
/* restore the original interface name */
copyout(lifname, (void *)args->arg, LINUX_IFNAMSIZ);
#ifdef DEBUG
printf("%s(): returning %d\n", __func__, error);
#endif
return (error);
}
/*
* Device private ioctl handler
*/
static int
linux_ioctl_private(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error, type;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
type = fp->f_type;
fdrop(fp, td);
if (type == DTYPE_SOCKET)
return (linux_ioctl_socket(td, args));
return (ENOIOCTL);
}
/*
* DRM ioctl handler (sys/dev/drm)
*/
static int
linux_ioctl_drm(struct thread *td, struct linux_ioctl_args *args)
{
args->cmd = SETDIR(args->cmd);
- return ioctl(td, (struct ioctl_args *)args);
+ return sys_ioctl(td, (struct ioctl_args *)args);
}
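/*
* SCSI generic (sg) ioctl handler: the command and argument are passed
* through unchanged to the underlying file's ioctl routine.
*/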
static int
linux_ioctl_sg(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
u_long cmd;
int error;
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0) {
printf("sg_linux_ioctl: fget returned %d\n", error);
return (error);
}
cmd = args->cmd;
error = (fo_ioctl(fp, cmd, (caddr_t)args->arg, td->td_ucred, td));
fdrop(fp, td);
return (error);
}
/*
* Video4Linux (V4L) ioctl handler
*/
static int
linux_to_bsd_v4l_tuner(struct l_video_tuner *lvt, struct video_tuner *vt)
{
vt->tuner = lvt->tuner;
strlcpy(vt->name, lvt->name, LINUX_VIDEO_TUNER_NAME_SIZE);
vt->rangelow = lvt->rangelow; /* possible long size conversion */
vt->rangehigh = lvt->rangehigh; /* possible long size conversion */
vt->flags = lvt->flags;
vt->mode = lvt->mode;
vt->signal = lvt->signal;
return (0);
}
static int
bsd_to_linux_v4l_tuner(struct video_tuner *vt, struct l_video_tuner *lvt)
{
lvt->tuner = vt->tuner;
strlcpy(lvt->name, vt->name, LINUX_VIDEO_TUNER_NAME_SIZE);
lvt->rangelow = vt->rangelow; /* possible long size conversion */
lvt->rangehigh = vt->rangehigh; /* possible long size conversion */
lvt->flags = vt->flags;
lvt->mode = vt->mode;
lvt->signal = vt->signal;
return (0);
}
#ifdef COMPAT_LINUX_V4L_CLIPLIST
static int
linux_to_bsd_v4l_clip(struct l_video_clip *lvc, struct video_clip *vc)
{
vc->x = lvc->x;
vc->y = lvc->y;
vc->width = lvc->width;
vc->height = lvc->height;
vc->next = PTRIN(lvc->next); /* possible pointer size conversion */
return (0);
}
#endif
static int
linux_to_bsd_v4l_window(struct l_video_window *lvw, struct video_window *vw)
{
vw->x = lvw->x;
vw->y = lvw->y;
vw->width = lvw->width;
vw->height = lvw->height;
vw->chromakey = lvw->chromakey;
vw->flags = lvw->flags;
vw->clips = PTRIN(lvw->clips); /* possible pointer size conversion */
vw->clipcount = lvw->clipcount;
return (0);
}
static int
bsd_to_linux_v4l_window(struct video_window *vw, struct l_video_window *lvw)
{
lvw->x = vw->x;
lvw->y = vw->y;
lvw->width = vw->width;
lvw->height = vw->height;
lvw->chromakey = vw->chromakey;
lvw->flags = vw->flags;
lvw->clips = PTROUT(vw->clips); /* possible pointer size conversion */
lvw->clipcount = vw->clipcount;
return (0);
}
static int
linux_to_bsd_v4l_buffer(struct l_video_buffer *lvb, struct video_buffer *vb)
{
vb->base = PTRIN(lvb->base); /* possible pointer size conversion */
vb->height = lvb->height;
vb->width = lvb->width;
vb->depth = lvb->depth;
vb->bytesperline = lvb->bytesperline;
return (0);
}
static int
bsd_to_linux_v4l_buffer(struct video_buffer *vb, struct l_video_buffer *lvb)
{
lvb->base = PTROUT(vb->base); /* possible pointer size conversion */
lvb->height = vb->height;
lvb->width = vb->width;
lvb->depth = vb->depth;
lvb->bytesperline = vb->bytesperline;
return (0);
}
static int
linux_to_bsd_v4l_code(struct l_video_code *lvc, struct video_code *vc)
{
strlcpy(vc->loadwhat, lvc->loadwhat, LINUX_VIDEO_CODE_LOADWHAT_SIZE);
vc->datasize = lvc->datasize;
vc->data = PTRIN(lvc->data); /* possible pointer size conversion */
return (0);
}
#ifdef COMPAT_LINUX_V4L_CLIPLIST
static int
linux_v4l_clip_copy(void *lvc, struct video_clip **ppvc)
{
int error;
struct video_clip vclip;
struct l_video_clip l_vclip;
error = copyin(lvc, &l_vclip, sizeof(l_vclip));
if (error) return (error);
linux_to_bsd_v4l_clip(&l_vclip, &vclip);
/* XXX: If there can be no concurrency: s/M_NOWAIT/M_WAITOK/ */
if ((*ppvc = malloc(sizeof(**ppvc), M_LINUX, M_NOWAIT)) == NULL)
return (ENOMEM); /* XXX: linux has no ENOMEM here */
memcpy(*ppvc, &vclip, sizeof(vclip));
(*ppvc)->next = NULL;
return (0);
}
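/*
* Free a clip list previously built by linux_v4l_cliplist_copy().
*/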
static int
linux_v4l_cliplist_free(struct video_window *vw)
{
struct video_clip **ppvc;
struct video_clip **ppvc_next;
for (ppvc = &(vw->clips); *ppvc != NULL; ppvc = ppvc_next) {
ppvc_next = &((*ppvc)->next);
free(*ppvc, M_LINUX);
}
vw->clips = NULL;
return (0);
}
static int
linux_v4l_cliplist_copy(struct l_video_window *lvw, struct video_window *vw)
{
int error;
int clipcount;
void *plvc;
struct video_clip **ppvc;
/*
* XXX: The cliplist is used to pass in a list of clipping
* rectangles or, if clipcount == VIDEO_CLIP_BITMAP, a
* clipping bitmap. Some Linux apps, however, appear to
* leave cliplist and clips uninitialized. In any case,
* the cliplist is not used by pwc(4), at the time of
* writing, FreeBSD's only V4L driver. When a driver
* that uses the cliplist is developed, this code may
* need re-examination.
*/
error = 0;
clipcount = vw->clipcount;
if (clipcount == VIDEO_CLIP_BITMAP) {
/*
* In this case, the pointer (clips) is overloaded
* to be a "void *" to a bitmap, therefore there
* is no struct video_clip to copy now.
*/
} else if (clipcount > 0 && clipcount <= 16384) {
/*
* Clips points to list of clip rectangles, so
* copy the list.
*
* XXX: Upper limit of 16384 was used here to try to
* avoid cases when clipcount and clips pointer
* are uninitialized and therefore have high random
* values, as is the case in the Linux Skype
* application. The value 16384 was chosen as that
* is what is used in the Linux stradis(4) MPEG
* decoder driver, the only place we found an
* example of cliplist use.
*/
plvc = PTRIN(lvw->clips);
vw->clips = NULL;
ppvc = &(vw->clips);
while (clipcount-- > 0) {
if (plvc == 0) {
error = EFAULT;
break;
} else {
error = linux_v4l_clip_copy(plvc, ppvc);
if (error) {
linux_v4l_cliplist_free(vw);
break;
}
}
ppvc = &((*ppvc)->next);
plvc = PTRIN(((struct l_video_clip *) plvc)->next);
}
} else {
/*
* clipcount == 0 or negative (but not VIDEO_CLIP_BITMAP)
* Force cliplist to null.
*/
vw->clipcount = 0;
vw->clips = NULL;
}
return (error);
}
#endif
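/*
* Handler for the V4L (video4linux version 1) ioctl range. Requests whose
* argument layout matches the native one are renamed and passed through;
* tuner, window, frame buffer and microcode requests carry pointers or
* differ in layout and are translated explicitly.
*/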
static int
linux_ioctl_v4l(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
struct video_tuner vtun;
struct video_window vwin;
struct video_buffer vbuf;
struct video_code vcode;
struct l_video_tuner l_vtun;
struct l_video_window l_vwin;
struct l_video_buffer l_vbuf;
struct l_video_code l_vcode;
switch (args->cmd & 0xffff) {
case LINUX_VIDIOCGCAP: args->cmd = VIDIOCGCAP; break;
case LINUX_VIDIOCGCHAN: args->cmd = VIDIOCGCHAN; break;
case LINUX_VIDIOCSCHAN: args->cmd = VIDIOCSCHAN; break;
case LINUX_VIDIOCGTUNER:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_tuner(&l_vtun, &vtun);
error = fo_ioctl(fp, VIDIOCGTUNER, &vtun, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_tuner(&vtun, &l_vtun);
error = copyout(&l_vtun, (void *) args->arg,
sizeof(l_vtun));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSTUNER:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vtun, sizeof(l_vtun));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_tuner(&l_vtun, &vtun);
error = fo_ioctl(fp, VIDIOCSTUNER, &vtun, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCGPICT: args->cmd = VIDIOCGPICT; break;
case LINUX_VIDIOCSPICT: args->cmd = VIDIOCSPICT; break;
case LINUX_VIDIOCCAPTURE: args->cmd = VIDIOCCAPTURE; break;
case LINUX_VIDIOCGWIN:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOCGWIN, &vwin, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_window(&vwin, &l_vwin);
error = copyout(&l_vwin, (void *) args->arg,
sizeof(l_vwin));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSWIN:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vwin, sizeof(l_vwin));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_window(&l_vwin, &vwin);
#ifdef COMPAT_LINUX_V4L_CLIPLIST
error = linux_v4l_cliplist_copy(&l_vwin, &vwin);
if (error) {
fdrop(fp, td);
return (error);
}
#endif
error = fo_ioctl(fp, VIDIOCSWIN, &vwin, td->td_ucred, td);
fdrop(fp, td);
#ifdef COMPAT_LINUX_V4L_CLIPLIST
linux_v4l_cliplist_free(&vwin);
#endif
return (error);
case LINUX_VIDIOCGFBUF:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOCGFBUF, &vbuf, td->td_ucred, td);
if (!error) {
bsd_to_linux_v4l_buffer(&vbuf, &l_vbuf);
error = copyout(&l_vbuf, (void *) args->arg,
sizeof(l_vbuf));
}
fdrop(fp, td);
return (error);
case LINUX_VIDIOCSFBUF:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vbuf, sizeof(l_vbuf));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_buffer(&l_vbuf, &vbuf);
error = fo_ioctl(fp, VIDIOCSFBUF, &vbuf, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCKEY: args->cmd = VIDIOCKEY; break;
case LINUX_VIDIOCGFREQ: args->cmd = VIDIOCGFREQ; break;
case LINUX_VIDIOCSFREQ: args->cmd = VIDIOCSFREQ; break;
case LINUX_VIDIOCGAUDIO: args->cmd = VIDIOCGAUDIO; break;
case LINUX_VIDIOCSAUDIO: args->cmd = VIDIOCSAUDIO; break;
case LINUX_VIDIOCSYNC: args->cmd = VIDIOCSYNC; break;
case LINUX_VIDIOCMCAPTURE: args->cmd = VIDIOCMCAPTURE; break;
case LINUX_VIDIOCGMBUF: args->cmd = VIDIOCGMBUF; break;
case LINUX_VIDIOCGUNIT: args->cmd = VIDIOCGUNIT; break;
case LINUX_VIDIOCGCAPTURE: args->cmd = VIDIOCGCAPTURE; break;
case LINUX_VIDIOCSCAPTURE: args->cmd = VIDIOCSCAPTURE; break;
case LINUX_VIDIOCSPLAYMODE: args->cmd = VIDIOCSPLAYMODE; break;
case LINUX_VIDIOCSWRITEMODE: args->cmd = VIDIOCSWRITEMODE; break;
case LINUX_VIDIOCGPLAYINFO: args->cmd = VIDIOCGPLAYINFO; break;
case LINUX_VIDIOCSMICROCODE:
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = copyin((void *) args->arg, &l_vcode, sizeof(l_vcode));
if (error) {
fdrop(fp, td);
return (error);
}
linux_to_bsd_v4l_code(&l_vcode, &vcode);
error = fo_ioctl(fp, VIDIOCSMICROCODE, &vcode, td->td_ucred, td);
fdrop(fp, td);
return (error);
case LINUX_VIDIOCGVBIFMT: args->cmd = VIDIOCGVBIFMT; break;
case LINUX_VIDIOCSVBIFMT: args->cmd = VIDIOCSVBIFMT; break;
default: return (ENOIOCTL);
}
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* Special ioctl handler
*/
static int
linux_ioctl_special(struct thread *td, struct linux_ioctl_args *args)
{
int error;
switch (args->cmd) {
case LINUX_SIOCGIFADDR:
args->cmd = SIOCGIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCSIFADDR:
args->cmd = SIOCSIFADDR;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
case LINUX_SIOCGIFFLAGS:
args->cmd = SIOCGIFFLAGS;
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
break;
default:
error = ENOIOCTL;
}
return (error);
}
static int
linux_to_bsd_v4l2_standard(struct l_v4l2_standard *lvstd, struct v4l2_standard *vstd)
{
vstd->index = lvstd->index;
vstd->id = lvstd->id;
memcpy(&vstd->name, &lvstd->name, sizeof(*lvstd) - offsetof(struct l_v4l2_standard, name));
return (0);
}
static int
bsd_to_linux_v4l2_standard(struct v4l2_standard *vstd, struct l_v4l2_standard *lvstd)
{
lvstd->index = vstd->index;
lvstd->id = vstd->id;
memcpy(&lvstd->name, &vstd->name, sizeof(*lvstd) - offsetof(struct l_v4l2_standard, name));
return (0);
}
static int
linux_to_bsd_v4l2_buffer(struct l_v4l2_buffer *lvb, struct v4l2_buffer *vb)
{
vb->index = lvb->index;
vb->type = lvb->type;
vb->bytesused = lvb->bytesused;
vb->flags = lvb->flags;
vb->field = lvb->field;
vb->timestamp.tv_sec = lvb->timestamp.tv_sec;
vb->timestamp.tv_usec = lvb->timestamp.tv_usec;
memcpy(&vb->timecode, &lvb->timecode, sizeof (lvb->timecode));
vb->sequence = lvb->sequence;
vb->memory = lvb->memory;
if (lvb->memory == V4L2_MEMORY_USERPTR)
/* possible pointer size conversion */
vb->m.userptr = (unsigned long)PTRIN(lvb->m.userptr);
else
vb->m.offset = lvb->m.offset;
vb->length = lvb->length;
vb->input = lvb->input;
vb->reserved = lvb->reserved;
return (0);
}
static int
bsd_to_linux_v4l2_buffer(struct v4l2_buffer *vb, struct l_v4l2_buffer *lvb)
{
lvb->index = vb->index;
lvb->type = vb->type;
lvb->bytesused = vb->bytesused;
lvb->flags = vb->flags;
lvb->field = vb->field;
lvb->timestamp.tv_sec = vb->timestamp.tv_sec;
lvb->timestamp.tv_usec = vb->timestamp.tv_usec;
memcpy(&lvb->timecode, &vb->timecode, sizeof (vb->timecode));
lvb->sequence = vb->sequence;
lvb->memory = vb->memory;
if (vb->memory == V4L2_MEMORY_USERPTR)
/* possible pointer size conversion */
lvb->m.userptr = PTROUT(vb->m.userptr);
else
lvb->m.offset = vb->m.offset;
lvb->length = vb->length;
lvb->input = vb->input;
lvb->reserved = vb->reserved;
return (0);
}
static int
linux_to_bsd_v4l2_format(struct l_v4l2_format *lvf, struct v4l2_format *vf)
{
vf->type = lvf->type;
if (lvf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY
#ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
|| lvf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
#endif
)
/*
* XXX TODO - needs 32 -> 64 bit conversion:
* (unused by webcams?)
*/
return EINVAL;
memcpy(&vf->fmt, &lvf->fmt, sizeof(vf->fmt));
return 0;
}
static int
bsd_to_linux_v4l2_format(struct v4l2_format *vf, struct l_v4l2_format *lvf)
{
lvf->type = vf->type;
if (vf->type == V4L2_BUF_TYPE_VIDEO_OVERLAY
#ifdef V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
|| vf->type == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY
#endif
)
/*
* XXX TODO - needs 32 -> 64 bit conversion:
* (unused by webcams?)
*/
return EINVAL;
memcpy(&lvf->fmt, &vf->fmt, sizeof(vf->fmt));
return 0;
}
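/*
* Handler for the V4L2 ioctl range. Most requests only need the direction
* bits of the command rewritten to the native encoding; formats, standards,
* inputs and buffers differ in layout and are converted explicitly.
*/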
static int
linux_ioctl_v4l2(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
int error;
struct v4l2_format vformat;
struct l_v4l2_format l_vformat;
struct v4l2_standard vstd;
struct l_v4l2_standard l_vstd;
struct l_v4l2_buffer l_vbuf;
struct v4l2_buffer vbuf;
struct v4l2_input vinp;
switch (args->cmd & 0xffff) {
case LINUX_VIDIOC_RESERVED:
case LINUX_VIDIOC_LOG_STATUS:
if ((args->cmd & IOC_DIRMASK) != LINUX_IOC_VOID)
return ENOIOCTL;
args->cmd = (args->cmd & 0xffff) | IOC_VOID;
break;
case LINUX_VIDIOC_OVERLAY:
case LINUX_VIDIOC_STREAMON:
case LINUX_VIDIOC_STREAMOFF:
case LINUX_VIDIOC_S_STD:
case LINUX_VIDIOC_S_TUNER:
case LINUX_VIDIOC_S_AUDIO:
case LINUX_VIDIOC_S_AUDOUT:
case LINUX_VIDIOC_S_MODULATOR:
case LINUX_VIDIOC_S_FREQUENCY:
case LINUX_VIDIOC_S_CROP:
case LINUX_VIDIOC_S_JPEGCOMP:
case LINUX_VIDIOC_S_PRIORITY:
case LINUX_VIDIOC_DBG_S_REGISTER:
case LINUX_VIDIOC_S_HW_FREQ_SEEK:
case LINUX_VIDIOC_SUBSCRIBE_EVENT:
case LINUX_VIDIOC_UNSUBSCRIBE_EVENT:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_IN;
break;
case LINUX_VIDIOC_QUERYCAP:
case LINUX_VIDIOC_G_STD:
case LINUX_VIDIOC_G_AUDIO:
case LINUX_VIDIOC_G_INPUT:
case LINUX_VIDIOC_G_OUTPUT:
case LINUX_VIDIOC_G_AUDOUT:
case LINUX_VIDIOC_G_JPEGCOMP:
case LINUX_VIDIOC_QUERYSTD:
case LINUX_VIDIOC_G_PRIORITY:
case LINUX_VIDIOC_QUERY_DV_PRESET:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_OUT;
break;
case LINUX_VIDIOC_ENUM_FMT:
case LINUX_VIDIOC_REQBUFS:
case LINUX_VIDIOC_G_PARM:
case LINUX_VIDIOC_S_PARM:
case LINUX_VIDIOC_G_CTRL:
case LINUX_VIDIOC_S_CTRL:
case LINUX_VIDIOC_G_TUNER:
case LINUX_VIDIOC_QUERYCTRL:
case LINUX_VIDIOC_QUERYMENU:
case LINUX_VIDIOC_S_INPUT:
case LINUX_VIDIOC_S_OUTPUT:
case LINUX_VIDIOC_ENUMOUTPUT:
case LINUX_VIDIOC_G_MODULATOR:
case LINUX_VIDIOC_G_FREQUENCY:
case LINUX_VIDIOC_CROPCAP:
case LINUX_VIDIOC_G_CROP:
case LINUX_VIDIOC_ENUMAUDIO:
case LINUX_VIDIOC_ENUMAUDOUT:
case LINUX_VIDIOC_G_SLICED_VBI_CAP:
#ifdef VIDIOC_ENUM_FRAMESIZES
case LINUX_VIDIOC_ENUM_FRAMESIZES:
case LINUX_VIDIOC_ENUM_FRAMEINTERVALS:
case LINUX_VIDIOC_ENCODER_CMD:
case LINUX_VIDIOC_TRY_ENCODER_CMD:
#endif
case LINUX_VIDIOC_DBG_G_REGISTER:
case LINUX_VIDIOC_DBG_G_CHIP_IDENT:
case LINUX_VIDIOC_ENUM_DV_PRESETS:
case LINUX_VIDIOC_S_DV_PRESET:
case LINUX_VIDIOC_G_DV_PRESET:
case LINUX_VIDIOC_S_DV_TIMINGS:
case LINUX_VIDIOC_G_DV_TIMINGS:
args->cmd = (args->cmd & ~IOC_DIRMASK) | IOC_INOUT;
break;
case LINUX_VIDIOC_G_FMT:
case LINUX_VIDIOC_S_FMT:
case LINUX_VIDIOC_TRY_FMT:
error = copyin((void *)args->arg, &l_vformat, sizeof(l_vformat));
if (error)
return (error);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
if (linux_to_bsd_v4l2_format(&l_vformat, &vformat) != 0)
error = EINVAL;
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_G_FMT)
error = fo_ioctl(fp, VIDIOC_G_FMT, &vformat,
td->td_ucred, td);
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_S_FMT)
error = fo_ioctl(fp, VIDIOC_S_FMT, &vformat,
td->td_ucred, td);
else
error = fo_ioctl(fp, VIDIOC_TRY_FMT, &vformat,
td->td_ucred, td);
bsd_to_linux_v4l2_format(&vformat, &l_vformat);
copyout(&l_vformat, (void *)args->arg, sizeof(l_vformat));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_ENUMSTD:
error = copyin((void *)args->arg, &l_vstd, sizeof(l_vstd));
if (error)
return (error);
linux_to_bsd_v4l2_standard(&l_vstd, &vstd);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOC_ENUMSTD, (caddr_t)&vstd,
td->td_ucred, td);
if (error) {
fdrop(fp, td);
return (error);
}
bsd_to_linux_v4l2_standard(&vstd, &l_vstd);
error = copyout(&l_vstd, (void *)args->arg, sizeof(l_vstd));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_ENUMINPUT:
/*
* The Linux struct l_v4l2_input differs only in size;
* it has no padding at the end.
*/
error = copyin((void *)args->arg, &vinp,
sizeof(struct l_v4l2_input));
if (error != 0)
return (error);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
error = fo_ioctl(fp, VIDIOC_ENUMINPUT, (caddr_t)&vinp,
td->td_ucred, td);
if (error) {
fdrop(fp, td);
return (error);
}
error = copyout(&vinp, (void *)args->arg,
sizeof(struct l_v4l2_input));
fdrop(fp, td);
return (error);
case LINUX_VIDIOC_QUERYBUF:
case LINUX_VIDIOC_QBUF:
case LINUX_VIDIOC_DQBUF:
error = copyin((void *)args->arg, &l_vbuf, sizeof(l_vbuf));
if (error)
return (error);
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
linux_to_bsd_v4l2_buffer(&l_vbuf, &vbuf);
if ((args->cmd & 0xffff) == LINUX_VIDIOC_QUERYBUF)
error = fo_ioctl(fp, VIDIOC_QUERYBUF, &vbuf,
td->td_ucred, td);
else if ((args->cmd & 0xffff) == LINUX_VIDIOC_QBUF)
error = fo_ioctl(fp, VIDIOC_QBUF, &vbuf,
td->td_ucred, td);
else
error = fo_ioctl(fp, VIDIOC_DQBUF, &vbuf,
td->td_ucred, td);
bsd_to_linux_v4l2_buffer(&vbuf, &l_vbuf);
copyout(&l_vbuf, (void *)args->arg, sizeof(l_vbuf));
fdrop(fp, td);
return (error);
/*
* XXX TODO - these need 32 -> 64 bit conversion:
* (are any of them needed for webcams?)
*/
case LINUX_VIDIOC_G_FBUF:
case LINUX_VIDIOC_S_FBUF:
case LINUX_VIDIOC_G_EXT_CTRLS:
case LINUX_VIDIOC_S_EXT_CTRLS:
case LINUX_VIDIOC_TRY_EXT_CTRLS:
case LINUX_VIDIOC_DQEVENT:
default: return (ENOIOCTL);
}
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* Support for emulators/linux-libusb. This port uses FBSD_LUSB* macros
* instead of USB* ones. This lets us provide correct values for cmd.
* 0xffffffe0 -- 0xffffffff range seemed to be the least collision-prone.
*/
static int
linux_ioctl_fbsd_usb(struct thread *td, struct linux_ioctl_args *args)
{
int error;
error = 0;
switch (args->cmd) {
case FBSD_LUSB_DEVICEENUMERATE:
args->cmd = USB_DEVICEENUMERATE;
break;
case FBSD_LUSB_DEV_QUIRK_ADD:
args->cmd = USB_DEV_QUIRK_ADD;
break;
case FBSD_LUSB_DEV_QUIRK_GET:
args->cmd = USB_DEV_QUIRK_GET;
break;
case FBSD_LUSB_DEV_QUIRK_REMOVE:
args->cmd = USB_DEV_QUIRK_REMOVE;
break;
case FBSD_LUSB_DO_REQUEST:
args->cmd = USB_DO_REQUEST;
break;
case FBSD_LUSB_FS_CLEAR_STALL_SYNC:
args->cmd = USB_FS_CLEAR_STALL_SYNC;
break;
case FBSD_LUSB_FS_CLOSE:
args->cmd = USB_FS_CLOSE;
break;
case FBSD_LUSB_FS_COMPLETE:
args->cmd = USB_FS_COMPLETE;
break;
case FBSD_LUSB_FS_INIT:
args->cmd = USB_FS_INIT;
break;
case FBSD_LUSB_FS_OPEN:
args->cmd = USB_FS_OPEN;
break;
case FBSD_LUSB_FS_START:
args->cmd = USB_FS_START;
break;
case FBSD_LUSB_FS_STOP:
args->cmd = USB_FS_STOP;
break;
case FBSD_LUSB_FS_UNINIT:
args->cmd = USB_FS_UNINIT;
break;
case FBSD_LUSB_GET_CONFIG:
args->cmd = USB_GET_CONFIG;
break;
case FBSD_LUSB_GET_DEVICEINFO:
args->cmd = USB_GET_DEVICEINFO;
break;
case FBSD_LUSB_GET_DEVICE_DESC:
args->cmd = USB_GET_DEVICE_DESC;
break;
case FBSD_LUSB_GET_FULL_DESC:
args->cmd = USB_GET_FULL_DESC;
break;
case FBSD_LUSB_GET_IFACE_DRIVER:
args->cmd = USB_GET_IFACE_DRIVER;
break;
case FBSD_LUSB_GET_PLUGTIME:
args->cmd = USB_GET_PLUGTIME;
break;
case FBSD_LUSB_GET_POWER_MODE:
args->cmd = USB_GET_POWER_MODE;
break;
case FBSD_LUSB_GET_REPORT_DESC:
args->cmd = USB_GET_REPORT_DESC;
break;
case FBSD_LUSB_GET_REPORT_ID:
args->cmd = USB_GET_REPORT_ID;
break;
case FBSD_LUSB_GET_TEMPLATE:
args->cmd = USB_GET_TEMPLATE;
break;
case FBSD_LUSB_IFACE_DRIVER_ACTIVE:
args->cmd = USB_IFACE_DRIVER_ACTIVE;
break;
case FBSD_LUSB_IFACE_DRIVER_DETACH:
args->cmd = USB_IFACE_DRIVER_DETACH;
break;
case FBSD_LUSB_QUIRK_NAME_GET:
args->cmd = USB_QUIRK_NAME_GET;
break;
case FBSD_LUSB_READ_DIR:
args->cmd = USB_READ_DIR;
break;
case FBSD_LUSB_SET_ALTINTERFACE:
args->cmd = USB_SET_ALTINTERFACE;
break;
case FBSD_LUSB_SET_CONFIG:
args->cmd = USB_SET_CONFIG;
break;
case FBSD_LUSB_SET_IMMED:
args->cmd = USB_SET_IMMED;
break;
case FBSD_LUSB_SET_POWER_MODE:
args->cmd = USB_SET_POWER_MODE;
break;
case FBSD_LUSB_SET_TEMPLATE:
args->cmd = USB_SET_TEMPLATE;
break;
default:
error = ENOIOCTL;
}
if (error != ENOIOCTL)
- error = ioctl(td, (struct ioctl_args *)args);
+ error = sys_ioctl(td, (struct ioctl_args *)args);
return (error);
}
/*
* main ioctl syscall function
*/
int
linux_ioctl(struct thread *td, struct linux_ioctl_args *args)
{
struct file *fp;
struct handler_element *he;
int error, cmd;
#ifdef DEBUG
if (ldebug(ioctl))
printf(ARGS(ioctl, "%d, %04lx, *"), args->fd,
(unsigned long)args->cmd);
#endif
if ((error = fget(td, args->fd, CAP_IOCTL, &fp)) != 0)
return (error);
if ((fp->f_flag & (FREAD|FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
/* Iterate over the ioctl handlers */
cmd = args->cmd & 0xffff;
sx_slock(&linux_ioctl_sx);
mtx_lock(&Giant);
TAILQ_FOREACH(he, &handlers, list) {
if (cmd >= he->low && cmd <= he->high) {
error = (*he->func)(td, args);
if (error != ENOIOCTL) {
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
return (error);
}
}
}
mtx_unlock(&Giant);
sx_sunlock(&linux_ioctl_sx);
fdrop(fp, td);
linux_msg(td, "ioctl fd=%d, cmd=0x%x ('%c',%d) is not implemented",
args->fd, (int)(args->cmd & 0xffff),
(int)(args->cmd & 0xff00) >> 8, (int)(args->cmd & 0xff));
return (EINVAL);
}
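/*
* Register an ioctl range handler. The handler list is kept sorted by
* ascending span, so linux_ioctl() consults the handler covering the
* narrowest command range first.
*/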
int
linux_ioctl_register_handler(struct linux_ioctl_handler *h)
{
struct handler_element *he, *cur;
if (h == NULL || h->func == NULL)
return (EINVAL);
/*
* Reuse the element if the handler is already on the list, otherwise
* create a new element.
*/
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &handlers, list) {
if (he->func == h->func)
break;
}
if (he == NULL) {
he = malloc(sizeof(*he),
M_LINUX, M_WAITOK);
he->func = h->func;
} else
TAILQ_REMOVE(&handlers, he, list);
/* Initialize range information. */
he->low = h->low;
he->high = h->high;
he->span = h->high - h->low + 1;
/* Add the element to the list, sorted on span. */
TAILQ_FOREACH(cur, &handlers, list) {
if (cur->span > he->span) {
TAILQ_INSERT_BEFORE(cur, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
}
TAILQ_INSERT_TAIL(&handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
return (0);
}
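linux_ioctl_register_handler() keeps the handler list sorted by ascending span, so the handler covering the narrowest range that contains a command gets the first chance when linux_ioctl() walks the list. A standalone sketch of the same sorted insertion with a plain singly linked list (illustrative only; the kernel uses a TAILQ and struct handler_element):
#include <stdio.h>
#include <stdlib.h>

struct range {
	int low, high, span;
	struct range *next;
};

/* Insert before the first node with a strictly larger span. */
static struct range *
insert_sorted(struct range *head, int low, int high)
{
	struct range *r, **pp;

	if ((r = malloc(sizeof(*r))) == NULL)
		return (head);
	r->low = low;
	r->high = high;
	r->span = high - low + 1;
	for (pp = &head; *pp != NULL && (*pp)->span <= r->span;
	    pp = &(*pp)->next)
		;
	r->next = *pp;
	*pp = r;
	return (head);
}

int
main(void)
{
	struct range *head = NULL, *r;

	head = insert_sorted(head, 0x5400, 0x54ff);	/* wide range */
	head = insert_sorted(head, 0x5450, 0x5460);	/* narrow range, ends up first */
	for (r = head; r != NULL; r = r->next)
		printf("0x%x-0x%x span %d\n", r->low, r->high, r->span);
	return (0);
}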
int
linux_ioctl_unregister_handler(struct linux_ioctl_handler *h)
{
struct handler_element *he;
if (h == NULL || h->func == NULL)
return (EINVAL);
sx_xlock(&linux_ioctl_sx);
TAILQ_FOREACH(he, &handlers, list) {
if (he->func == h->func) {
TAILQ_REMOVE(&handlers, he, list);
sx_xunlock(&linux_ioctl_sx);
free(he, M_LINUX);
return (0);
}
}
sx_xunlock(&linux_ioctl_sx);
return (EINVAL);
}
Index: head/sys/compat/linux/linux_ipc.c
===================================================================
--- head/sys/compat/linux/linux_ipc.c (revision 225616)
+++ head/sys/compat/linux/linux_ipc.c (revision 225617)
@@ -1,899 +1,899 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/msg.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include "opt_compat.h"
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#include <machine/../linux32/linux32_ipc64.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#include <machine/../linux/linux_ipc64.h>
#endif
#include <compat/linux/linux_ipc.h>
#include <compat/linux/linux_util.h>
struct l_seminfo {
l_int semmap;
l_int semmni;
l_int semmns;
l_int semmnu;
l_int semmsl;
l_int semopm;
l_int semume;
l_int semusz;
l_int semvmx;
l_int semaem;
};
struct l_shminfo {
l_int shmmax;
l_int shmmin;
l_int shmmni;
l_int shmseg;
l_int shmall;
};
struct l_shm_info {
l_int used_ids;
l_ulong shm_tot; /* total allocated shm */
l_ulong shm_rss; /* total resident shm */
l_ulong shm_swp; /* total swapped shm */
l_ulong swap_attempts;
l_ulong swap_successes;
};
struct l_msginfo {
l_int msgpool;
l_int msgmap;
l_int msgmax;
l_int msgmnb;
l_int msgmni;
l_int msgssz;
l_int msgtql;
l_ushort msgseg;
};
static void
bsd_to_linux_shminfo( struct shminfo *bpp, struct l_shminfo *lpp)
{
lpp->shmmax = bpp->shmmax;
lpp->shmmin = bpp->shmmin;
lpp->shmmni = bpp->shmmni;
lpp->shmseg = bpp->shmseg;
lpp->shmall = bpp->shmall;
}
static void
bsd_to_linux_shm_info( struct shm_info *bpp, struct l_shm_info *lpp)
{
lpp->used_ids = bpp->used_ids ;
lpp->shm_tot = bpp->shm_tot ;
lpp->shm_rss = bpp->shm_rss ;
lpp->shm_swp = bpp->shm_swp ;
lpp->swap_attempts = bpp->swap_attempts ;
lpp->swap_successes = bpp->swap_successes ;
}
struct l_ipc_perm {
l_key_t key;
l_uid16_t uid;
l_gid16_t gid;
l_uid16_t cuid;
l_gid16_t cgid;
l_ushort mode;
l_ushort seq;
};
static void
linux_to_bsd_ipc_perm(struct l_ipc_perm *lpp, struct ipc_perm *bpp)
{
bpp->key = lpp->key;
bpp->uid = lpp->uid;
bpp->gid = lpp->gid;
bpp->cuid = lpp->cuid;
bpp->cgid = lpp->cgid;
bpp->mode = lpp->mode;
bpp->seq = lpp->seq;
}
static void
bsd_to_linux_ipc_perm(struct ipc_perm *bpp, struct l_ipc_perm *lpp)
{
lpp->key = bpp->key;
lpp->uid = bpp->uid;
lpp->gid = bpp->gid;
lpp->cuid = bpp->cuid;
lpp->cgid = bpp->cgid;
lpp->mode = bpp->mode;
lpp->seq = bpp->seq;
}
struct l_msqid_ds {
struct l_ipc_perm msg_perm;
l_uintptr_t msg_first; /* first message on queue, unused */
l_uintptr_t msg_last; /* last message in queue, unused */
l_time_t msg_stime; /* last msgsnd time */
l_time_t msg_rtime; /* last msgrcv time */
l_time_t msg_ctime; /* last change time */
l_ulong msg_lcbytes; /* Reuse junk fields for 32 bit */
l_ulong msg_lqbytes; /* ditto */
l_ushort msg_cbytes; /* current number of bytes on queue */
l_ushort msg_qnum; /* number of messages in queue */
l_ushort msg_qbytes; /* max number of bytes on queue */
l_pid_t msg_lspid; /* pid of last msgsnd */
l_pid_t msg_lrpid; /* last receive pid */
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
struct l_semid_ds {
struct l_ipc_perm sem_perm;
l_time_t sem_otime;
l_time_t sem_ctime;
l_uintptr_t sem_base;
l_uintptr_t sem_pending;
l_uintptr_t sem_pending_last;
l_uintptr_t undo;
l_ushort sem_nsems;
}
#if defined(__amd64__) && defined(COMPAT_LINUX32)
__packed
#endif
;
struct l_shmid_ds {
struct l_ipc_perm shm_perm;
l_int shm_segsz;
l_time_t shm_atime;
l_time_t shm_dtime;
l_time_t shm_ctime;
l_ushort shm_cpid;
l_ushort shm_lpid;
l_short shm_nattch;
l_ushort private1;
l_uintptr_t private2;
l_uintptr_t private3;
};
static void
linux_to_bsd_semid_ds(struct l_semid_ds *lsp, struct semid_ds *bsp)
{
linux_to_bsd_ipc_perm(&lsp->sem_perm, &bsp->sem_perm);
bsp->sem_otime = lsp->sem_otime;
bsp->sem_ctime = lsp->sem_ctime;
bsp->sem_nsems = lsp->sem_nsems;
bsp->sem_base = PTRIN(lsp->sem_base);
}
static void
bsd_to_linux_semid_ds(struct semid_ds *bsp, struct l_semid_ds *lsp)
{
bsd_to_linux_ipc_perm(&bsp->sem_perm, &lsp->sem_perm);
lsp->sem_otime = bsp->sem_otime;
lsp->sem_ctime = bsp->sem_ctime;
lsp->sem_nsems = bsp->sem_nsems;
lsp->sem_base = PTROUT(bsp->sem_base);
}
static void
linux_to_bsd_shmid_ds(struct l_shmid_ds *lsp, struct shmid_ds *bsp)
{
linux_to_bsd_ipc_perm(&lsp->shm_perm, &bsp->shm_perm);
bsp->shm_segsz = lsp->shm_segsz;
bsp->shm_lpid = lsp->shm_lpid;
bsp->shm_cpid = lsp->shm_cpid;
bsp->shm_nattch = lsp->shm_nattch;
bsp->shm_atime = lsp->shm_atime;
bsp->shm_dtime = lsp->shm_dtime;
bsp->shm_ctime = lsp->shm_ctime;
}
static void
bsd_to_linux_shmid_ds(struct shmid_ds *bsp, struct l_shmid_ds *lsp)
{
bsd_to_linux_ipc_perm(&bsp->shm_perm, &lsp->shm_perm);
if (bsp->shm_segsz > INT_MAX)
lsp->shm_segsz = INT_MAX;
else
lsp->shm_segsz = bsp->shm_segsz;
lsp->shm_lpid = bsp->shm_lpid;
lsp->shm_cpid = bsp->shm_cpid;
if (bsp->shm_nattch > SHRT_MAX)
lsp->shm_nattch = SHRT_MAX;
else
lsp->shm_nattch = bsp->shm_nattch;
lsp->shm_atime = bsp->shm_atime;
lsp->shm_dtime = bsp->shm_dtime;
lsp->shm_ctime = bsp->shm_ctime;
lsp->private3 = 0;
}
static void
linux_to_bsd_msqid_ds(struct l_msqid_ds *lsp, struct msqid_ds *bsp)
{
linux_to_bsd_ipc_perm(&lsp->msg_perm, &bsp->msg_perm);
bsp->msg_cbytes = lsp->msg_cbytes;
bsp->msg_qnum = lsp->msg_qnum;
bsp->msg_qbytes = lsp->msg_qbytes;
bsp->msg_lspid = lsp->msg_lspid;
bsp->msg_lrpid = lsp->msg_lrpid;
bsp->msg_stime = lsp->msg_stime;
bsp->msg_rtime = lsp->msg_rtime;
bsp->msg_ctime = lsp->msg_ctime;
}
static void
bsd_to_linux_msqid_ds(struct msqid_ds *bsp, struct l_msqid_ds *lsp)
{
bsd_to_linux_ipc_perm(&bsp->msg_perm, &lsp->msg_perm);
lsp->msg_cbytes = bsp->msg_cbytes;
lsp->msg_qnum = bsp->msg_qnum;
lsp->msg_qbytes = bsp->msg_qbytes;
lsp->msg_lspid = bsp->msg_lspid;
lsp->msg_lrpid = bsp->msg_lrpid;
lsp->msg_stime = bsp->msg_stime;
lsp->msg_rtime = bsp->msg_rtime;
lsp->msg_ctime = bsp->msg_ctime;
}
static void
linux_ipc_perm_to_ipc64_perm(struct l_ipc_perm *in, struct l_ipc64_perm *out)
{
/* XXX: do we really need to do something here? */
out->key = in->key;
out->uid = in->uid;
out->gid = in->gid;
out->cuid = in->cuid;
out->cgid = in->cgid;
out->mode = in->mode;
out->seq = in->seq;
}
static int
linux_msqid_pullup(l_int ver, struct l_msqid_ds *linux_msqid, caddr_t uaddr)
{
struct l_msqid64_ds linux_msqid64;
int error;
if (ver == LINUX_IPC_64) {
error = copyin(uaddr, &linux_msqid64, sizeof(linux_msqid64));
if (error != 0)
return (error);
bzero(linux_msqid, sizeof(*linux_msqid));
linux_msqid->msg_perm.uid = linux_msqid64.msg_perm.uid;
linux_msqid->msg_perm.gid = linux_msqid64.msg_perm.gid;
linux_msqid->msg_perm.mode = linux_msqid64.msg_perm.mode;
if (linux_msqid64.msg_qbytes > USHRT_MAX)
linux_msqid->msg_lqbytes = linux_msqid64.msg_qbytes;
else
linux_msqid->msg_qbytes = linux_msqid64.msg_qbytes;
} else
error = copyin(uaddr, linux_msqid, sizeof(*linux_msqid));
return (error);
}
static int
linux_msqid_pushdown(l_int ver, struct l_msqid_ds *linux_msqid, caddr_t uaddr)
{
struct l_msqid64_ds linux_msqid64;
if (ver == LINUX_IPC_64) {
bzero(&linux_msqid64, sizeof(linux_msqid64));
linux_ipc_perm_to_ipc64_perm(&linux_msqid->msg_perm,
&linux_msqid64.msg_perm);
linux_msqid64.msg_stime = linux_msqid->msg_stime;
linux_msqid64.msg_rtime = linux_msqid->msg_rtime;
linux_msqid64.msg_ctime = linux_msqid->msg_ctime;
if (linux_msqid->msg_cbytes == 0)
linux_msqid64.msg_cbytes = linux_msqid->msg_lcbytes;
else
linux_msqid64.msg_cbytes = linux_msqid->msg_cbytes;
linux_msqid64.msg_qnum = linux_msqid->msg_qnum;
if (linux_msqid->msg_qbytes == 0)
linux_msqid64.msg_qbytes = linux_msqid->msg_lqbytes;
else
linux_msqid64.msg_qbytes = linux_msqid->msg_qbytes;
linux_msqid64.msg_lspid = linux_msqid->msg_lspid;
linux_msqid64.msg_lrpid = linux_msqid->msg_lrpid;
return (copyout(&linux_msqid64, uaddr, sizeof(linux_msqid64)));
} else
return (copyout(linux_msqid, uaddr, sizeof(*linux_msqid)));
}
static int
linux_semid_pullup(l_int ver, struct l_semid_ds *linux_semid, caddr_t uaddr)
{
struct l_semid64_ds linux_semid64;
int error;
if (ver == LINUX_IPC_64) {
error = copyin(uaddr, &linux_semid64, sizeof(linux_semid64));
if (error != 0)
return (error);
bzero(linux_semid, sizeof(*linux_semid));
linux_semid->sem_perm.uid = linux_semid64.sem_perm.uid;
linux_semid->sem_perm.gid = linux_semid64.sem_perm.gid;
linux_semid->sem_perm.mode = linux_semid64.sem_perm.mode;
} else
error = copyin(uaddr, linux_semid, sizeof(*linux_semid));
return (error);
}
static int
linux_semid_pushdown(l_int ver, struct l_semid_ds *linux_semid, caddr_t uaddr)
{
struct l_semid64_ds linux_semid64;
if (ver == LINUX_IPC_64) {
bzero(&linux_semid64, sizeof(linux_semid64));
linux_ipc_perm_to_ipc64_perm(&linux_semid->sem_perm,
&linux_semid64.sem_perm);
linux_semid64.sem_otime = linux_semid->sem_otime;
linux_semid64.sem_ctime = linux_semid->sem_ctime;
linux_semid64.sem_nsems = linux_semid->sem_nsems;
return (copyout(&linux_semid64, uaddr, sizeof(linux_semid64)));
} else
return (copyout(linux_semid, uaddr, sizeof(*linux_semid)));
}
static int
linux_shmid_pullup(l_int ver, struct l_shmid_ds *linux_shmid, caddr_t uaddr)
{
struct l_shmid64_ds linux_shmid64;
int error;
if (ver == LINUX_IPC_64) {
error = copyin(uaddr, &linux_shmid64, sizeof(linux_shmid64));
if (error != 0)
return (error);
bzero(linux_shmid, sizeof(*linux_shmid));
linux_shmid->shm_perm.uid = linux_shmid64.shm_perm.uid;
linux_shmid->shm_perm.gid = linux_shmid64.shm_perm.gid;
linux_shmid->shm_perm.mode = linux_shmid64.shm_perm.mode;
} else
error = copyin(uaddr, linux_shmid, sizeof(*linux_shmid));
return (error);
}
static int
linux_shmid_pushdown(l_int ver, struct l_shmid_ds *linux_shmid, caddr_t uaddr)
{
struct l_shmid64_ds linux_shmid64;
/*
* XXX: This is backwards and loses information in shm_nattch
* and shm_segsz. We should probably either expose the BSD
* shmid structure directly and convert it to either the
* non-64 or 64 variant directly or the code should always
* convert to the 64 variant and then truncate values into the
* non-64 variant if needed since the 64 variant has more
* precision.
*/
if (ver == LINUX_IPC_64) {
bzero(&linux_shmid64, sizeof(linux_shmid64));
linux_ipc_perm_to_ipc64_perm(&linux_shmid->shm_perm,
&linux_shmid64.shm_perm);
linux_shmid64.shm_segsz = linux_shmid->shm_segsz;
linux_shmid64.shm_atime = linux_shmid->shm_atime;
linux_shmid64.shm_dtime = linux_shmid->shm_dtime;
linux_shmid64.shm_ctime = linux_shmid->shm_ctime;
linux_shmid64.shm_cpid = linux_shmid->shm_cpid;
linux_shmid64.shm_lpid = linux_shmid->shm_lpid;
linux_shmid64.shm_nattch = linux_shmid->shm_nattch;
return (copyout(&linux_shmid64, uaddr, sizeof(linux_shmid64)));
} else
return (copyout(linux_shmid, uaddr, sizeof(*linux_shmid)));
}
static int
linux_shminfo_pushdown(l_int ver, struct l_shminfo *linux_shminfo,
caddr_t uaddr)
{
struct l_shminfo64 linux_shminfo64;
if (ver == LINUX_IPC_64) {
bzero(&linux_shminfo64, sizeof(linux_shminfo64));
linux_shminfo64.shmmax = linux_shminfo->shmmax;
linux_shminfo64.shmmin = linux_shminfo->shmmin;
linux_shminfo64.shmmni = linux_shminfo->shmmni;
linux_shminfo64.shmseg = linux_shminfo->shmseg;
linux_shminfo64.shmall = linux_shminfo->shmall;
return (copyout(&linux_shminfo64, uaddr,
sizeof(linux_shminfo64)));
} else
return (copyout(linux_shminfo, uaddr, sizeof(*linux_shminfo)));
}
int
linux_semop(struct thread *td, struct linux_semop_args *args)
{
struct semop_args /* {
int semid;
struct sembuf *sops;
int nsops;
} */ bsd_args;
bsd_args.semid = args->semid;
bsd_args.sops = PTRIN(args->tsops);
bsd_args.nsops = args->nsops;
- return (semop(td, &bsd_args));
+ return (sys_semop(td, &bsd_args));
}
int
linux_semget(struct thread *td, struct linux_semget_args *args)
{
struct semget_args /* {
key_t key;
int nsems;
int semflg;
} */ bsd_args;
if (args->nsems < 0)
return (EINVAL);
bsd_args.key = args->key;
bsd_args.nsems = args->nsems;
bsd_args.semflg = args->semflg;
- return (semget(td, &bsd_args));
+ return (sys_semget(td, &bsd_args));
}
int
linux_semctl(struct thread *td, struct linux_semctl_args *args)
{
struct l_semid_ds linux_semid;
struct l_seminfo linux_seminfo;
struct semid_ds semid;
union semun semun;
register_t rval;
int cmd, error;
switch (args->cmd & ~LINUX_IPC_64) {
case LINUX_IPC_RMID:
cmd = IPC_RMID;
break;
case LINUX_GETNCNT:
cmd = GETNCNT;
break;
case LINUX_GETPID:
cmd = GETPID;
break;
case LINUX_GETVAL:
cmd = GETVAL;
break;
case LINUX_GETZCNT:
cmd = GETZCNT;
break;
case LINUX_SETVAL:
cmd = SETVAL;
semun.val = args->arg.val;
break;
case LINUX_IPC_SET:
cmd = IPC_SET;
error = linux_semid_pullup(args->cmd & LINUX_IPC_64,
&linux_semid, PTRIN(args->arg.buf));
if (error)
return (error);
linux_to_bsd_semid_ds(&linux_semid, &semid);
semun.buf = &semid;
return (kern_semctl(td, args->semid, args->semnum, cmd, &semun,
td->td_retval));
case LINUX_IPC_STAT:
case LINUX_SEM_STAT:
if ((args->cmd & ~LINUX_IPC_64) == LINUX_IPC_STAT)
cmd = IPC_STAT;
else
cmd = SEM_STAT;
semun.buf = &semid;
error = kern_semctl(td, args->semid, args->semnum, cmd, &semun,
&rval);
if (error)
return (error);
bsd_to_linux_semid_ds(&semid, &linux_semid);
error = linux_semid_pushdown(args->cmd & LINUX_IPC_64,
&linux_semid, PTRIN(args->arg.buf));
if (error == 0)
td->td_retval[0] = (cmd == SEM_STAT) ? rval : 0;
return (error);
case LINUX_IPC_INFO:
case LINUX_SEM_INFO:
bcopy(&seminfo, &linux_seminfo.semmni, sizeof(linux_seminfo) -
sizeof(linux_seminfo.semmap) );
/*
* Linux does not use the semmap field but populates it with
* the defined value from SEMMAP, which really is redefined to
* SEMMNS, which they define as SEMMNI * SEMMSL. Try to
* simulate this by returning our dynamic semmns value.
*/
linux_seminfo.semmap = linux_seminfo.semmns;
/* XXX BSD equivalent?
#define used_semids 10
#define used_sems 10
linux_seminfo.semusz = used_semids;
linux_seminfo.semaem = used_sems;
*/
error = copyout(&linux_seminfo,
PTRIN(args->arg.buf), sizeof(linux_seminfo));
if (error)
return (error);
td->td_retval[0] = seminfo.semmni;
return (0); /* No need for __semctl call */
case LINUX_GETALL:
cmd = GETALL;
semun.val = args->arg.val;
break;
case LINUX_SETALL:
cmd = SETALL;
semun.val = args->arg.val;
break;
default:
linux_msg(td, "ipc type %d is not implemented",
args->cmd & ~LINUX_IPC_64);
return (EINVAL);
}
return (kern_semctl(td, args->semid, args->semnum, cmd, &semun,
td->td_retval));
}
int
linux_msgsnd(struct thread *td, struct linux_msgsnd_args *args)
{
const void *msgp;
long mtype;
l_long lmtype;
int error;
if ((l_long)args->msgsz < 0 || args->msgsz > (l_long)msginfo.msgmax)
return (EINVAL);
msgp = PTRIN(args->msgp);
if ((error = copyin(msgp, &lmtype, sizeof(lmtype))) != 0)
return (error);
mtype = (long)lmtype;
return (kern_msgsnd(td, args->msqid,
(const char *)msgp + sizeof(lmtype),
args->msgsz, args->msgflg, mtype));
}
int
linux_msgrcv(struct thread *td, struct linux_msgrcv_args *args)
{
void *msgp;
long mtype;
l_long lmtype;
int error;
if ((l_long)args->msgsz < 0 || args->msgsz > (l_long)msginfo.msgmax)
return (EINVAL);
msgp = PTRIN(args->msgp);
if ((error = kern_msgrcv(td, args->msqid,
(char *)msgp + sizeof(lmtype), args->msgsz,
args->msgtyp, args->msgflg, &mtype)) != 0)
return (error);
lmtype = (l_long)mtype;
return (copyout(&lmtype, msgp, sizeof(lmtype)));
}
int
linux_msgget(struct thread *td, struct linux_msgget_args *args)
{
struct msgget_args /* {
key_t key;
int msgflg;
} */ bsd_args;
bsd_args.key = args->key;
bsd_args.msgflg = args->msgflg;
- return (msgget(td, &bsd_args));
+ return (sys_msgget(td, &bsd_args));
}
int
linux_msgctl(struct thread *td, struct linux_msgctl_args *args)
{
int error, bsd_cmd;
struct l_msqid_ds linux_msqid;
struct msqid_ds bsd_msqid;
bsd_cmd = args->cmd & ~LINUX_IPC_64;
switch (bsd_cmd) {
case LINUX_IPC_INFO:
case LINUX_MSG_INFO: {
struct l_msginfo linux_msginfo;
/*
* XXX MSG_INFO uses the same data structure but returns different
* dynamic counters in msgpool, msgmap, and msgtql fields.
*/
linux_msginfo.msgpool = (long)msginfo.msgmni *
(long)msginfo.msgmnb / 1024L; /* XXX MSG_INFO. */
linux_msginfo.msgmap = msginfo.msgmnb; /* XXX MSG_INFO. */
linux_msginfo.msgmax = msginfo.msgmax;
linux_msginfo.msgmnb = msginfo.msgmnb;
linux_msginfo.msgmni = msginfo.msgmni;
linux_msginfo.msgssz = msginfo.msgssz;
linux_msginfo.msgtql = msginfo.msgtql; /* XXX MSG_INFO. */
linux_msginfo.msgseg = msginfo.msgseg;
error = copyout(&linux_msginfo, PTRIN(args->buf),
sizeof(linux_msginfo));
if (error == 0)
td->td_retval[0] = msginfo.msgmni; /* XXX */
return (error);
}
/*
* TODO: implement this
* case LINUX_MSG_STAT:
*/
case LINUX_IPC_STAT:
/* NOTHING */
break;
case LINUX_IPC_SET:
error = linux_msqid_pullup(args->cmd & LINUX_IPC_64,
&linux_msqid, PTRIN(args->buf));
if (error)
return (error);
linux_to_bsd_msqid_ds(&linux_msqid, &bsd_msqid);
break;
case LINUX_IPC_RMID:
/* NOTHING */
break;
default:
return (EINVAL);
break;
}
error = kern_msgctl(td, args->msqid, bsd_cmd, &bsd_msqid);
if (error != 0)
if (bsd_cmd != LINUX_IPC_RMID || error != EINVAL)
return (error);
if (bsd_cmd == LINUX_IPC_STAT) {
bsd_to_linux_msqid_ds(&bsd_msqid, &linux_msqid);
return (linux_msqid_pushdown(args->cmd & LINUX_IPC_64,
&linux_msqid, PTRIN(args->buf)));
}
return (0);
}
int
linux_shmat(struct thread *td, struct linux_shmat_args *args)
{
struct shmat_args /* {
int shmid;
void *shmaddr;
int shmflg;
} */ bsd_args;
int error;
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
l_uintptr_t addr;
#endif
bsd_args.shmid = args->shmid;
bsd_args.shmaddr = PTRIN(args->shmaddr);
bsd_args.shmflg = args->shmflg;
- if ((error = shmat(td, &bsd_args)))
+ if ((error = sys_shmat(td, &bsd_args)))
return (error);
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
addr = td->td_retval[0];
if ((error = copyout(&addr, PTRIN(args->raddr), sizeof(addr))))
return (error);
td->td_retval[0] = 0;
#endif
return (0);
}
int
linux_shmdt(struct thread *td, struct linux_shmdt_args *args)
{
struct shmdt_args /* {
void *shmaddr;
} */ bsd_args;
bsd_args.shmaddr = PTRIN(args->shmaddr);
- return (shmdt(td, &bsd_args));
+ return (sys_shmdt(td, &bsd_args));
}
int
linux_shmget(struct thread *td, struct linux_shmget_args *args)
{
struct shmget_args /* {
key_t key;
int size;
int shmflg;
} */ bsd_args;
bsd_args.key = args->key;
bsd_args.size = args->size;
bsd_args.shmflg = args->shmflg;
- return (shmget(td, &bsd_args));
+ return (sys_shmget(td, &bsd_args));
}
int
linux_shmctl(struct thread *td, struct linux_shmctl_args *args)
{
struct l_shmid_ds linux_shmid;
struct l_shminfo linux_shminfo;
struct l_shm_info linux_shm_info;
struct shmid_ds bsd_shmid;
int error;
switch (args->cmd & ~LINUX_IPC_64) {
case LINUX_IPC_INFO: {
struct shminfo bsd_shminfo;
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, IPC_INFO,
(void *)&bsd_shminfo, NULL);
if (error)
return (error);
bsd_to_linux_shminfo(&bsd_shminfo, &linux_shminfo);
return (linux_shminfo_pushdown(args->cmd & LINUX_IPC_64,
&linux_shminfo, PTRIN(args->buf)));
}
case LINUX_SHM_INFO: {
struct shm_info bsd_shm_info;
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, SHM_INFO,
(void *)&bsd_shm_info, NULL);
if (error)
return (error);
bsd_to_linux_shm_info(&bsd_shm_info, &linux_shm_info);
return (copyout(&linux_shm_info, PTRIN(args->buf),
sizeof(struct l_shm_info)));
}
case LINUX_IPC_STAT:
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, IPC_STAT,
(void *)&bsd_shmid, NULL);
if (error)
return (error);
bsd_to_linux_shmid_ds(&bsd_shmid, &linux_shmid);
return (linux_shmid_pushdown(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf)));
case LINUX_SHM_STAT:
/* Perform shmctl wanting removed segments lookup */
error = kern_shmctl(td, args->shmid, IPC_STAT,
(void *)&bsd_shmid, NULL);
if (error)
return (error);
bsd_to_linux_shmid_ds(&bsd_shmid, &linux_shmid);
return (linux_shmid_pushdown(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf)));
case LINUX_IPC_SET:
error = linux_shmid_pullup(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf));
if (error)
return (error);
linux_to_bsd_shmid_ds(&linux_shmid, &bsd_shmid);
/* Perform shmctl wanting removed segments lookup */
return (kern_shmctl(td, args->shmid, IPC_SET,
(void *)&bsd_shmid, NULL));
case LINUX_IPC_RMID: {
void *buf;
if (args->buf == 0)
buf = NULL;
else {
error = linux_shmid_pullup(args->cmd & LINUX_IPC_64,
&linux_shmid, PTRIN(args->buf));
if (error)
return (error);
linux_to_bsd_shmid_ds(&linux_shmid, &bsd_shmid);
buf = (void *)&bsd_shmid;
}
return (kern_shmctl(td, args->shmid, IPC_RMID, buf, NULL));
}
case LINUX_SHM_LOCK:
/* FALLTHROUGH */
case LINUX_SHM_UNLOCK:
/* FALLTHROUGH */
default:
linux_msg(td, "ipc type %d not implemented",
args->cmd & ~LINUX_IPC_64);
return (EINVAL);
}
}
MODULE_DEPEND(linux, sysvmsg, 1, 1, 1);
MODULE_DEPEND(linux, sysvsem, 1, 1, 1);
MODULE_DEPEND(linux, sysvshm, 1, 1, 1);
Index: head/sys/compat/linux/linux_misc.c
===================================================================
--- head/sys/compat/linux/linux_misc.c (revision 225616)
+++ head/sys/compat/linux/linux_misc.c (revision 225617)
@@ -1,1926 +1,1926 @@
/*-
* Copyright (c) 2002 Doug Rabson
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/blist.h>
#include <sys/fcntl.h>
#if defined(__i386__)
#include <sys/imgact_aout.h>
#endif
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <sys/cpuset.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/swap_pager.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_file.h>
#include <compat/linux/linux_mib.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_sysproto.h>
#include <compat/linux/linux_emul.h>
#include <compat/linux/linux_misc.h>
int stclohz; /* Statistics clock frequency */
static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
RLIMIT_MEMLOCK, RLIMIT_AS
};
struct l_sysinfo {
l_long uptime; /* Seconds since boot */
l_ulong loads[3]; /* 1, 5, and 15 minute load averages */
#define LINUX_SYSINFO_LOADS_SCALE 65536
l_ulong totalram; /* Total usable main memory size */
l_ulong freeram; /* Available memory size */
l_ulong sharedram; /* Amount of shared memory */
l_ulong bufferram; /* Memory used by buffers */
l_ulong totalswap; /* Total swap space size */
l_ulong freeswap; /* swap space still available */
l_ushort procs; /* Number of current processes */
l_ushort pads;
l_ulong totalbig;
l_ulong freebig;
l_uint mem_unit;
char _f[20-2*sizeof(l_long)-sizeof(l_int)]; /* padding */
};
int
linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
{
struct l_sysinfo sysinfo;
vm_object_t object;
int i, j;
struct timespec ts;
getnanouptime(&ts);
if (ts.tv_nsec != 0)
ts.tv_sec++;
sysinfo.uptime = ts.tv_sec;
/* Use the information from the mib to get our load averages */
for (i = 0; i < 3; i++)
sysinfo.loads[i] = averunnable.ldavg[i] *
LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
sysinfo.totalram = physmem * PAGE_SIZE;
sysinfo.freeram = sysinfo.totalram - cnt.v_wire_count * PAGE_SIZE;
sysinfo.sharedram = 0;
mtx_lock(&vm_object_list_mtx);
TAILQ_FOREACH(object, &vm_object_list, object_list)
if (object->shadow_count > 1)
sysinfo.sharedram += object->resident_page_count;
mtx_unlock(&vm_object_list_mtx);
sysinfo.sharedram *= PAGE_SIZE;
sysinfo.bufferram = 0;
swap_pager_status(&i, &j);
sysinfo.totalswap = i * PAGE_SIZE;
sysinfo.freeswap = (i - j) * PAGE_SIZE;
sysinfo.procs = nprocs;
/* The following are only present in newer Linux kernels. */
sysinfo.totalbig = 0;
sysinfo.freebig = 0;
sysinfo.mem_unit = 1;
return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
}
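The loads[] conversion above rescales FreeBSD's fixed-point load averages (scaled by averunnable.fscale) into the 1/65536 fixed-point format Linux sysinfo(2) expects. A standalone arithmetic sketch, assuming an fscale of 2048 purely for illustration:
#include <stdio.h>

#define LINUX_SYSINFO_LOADS_SCALE 65536UL

int
main(void)
{
	unsigned long fscale = 2048;	/* assumed FreeBSD FSCALE */
	unsigned long ldavg = 1536;	/* a load of 0.75 in units of fscale */
	unsigned long linux_load;

	/* Same rescaling as the loads[] loop in linux_sysinfo(). */
	linux_load = ldavg * LINUX_SYSINFO_LOADS_SCALE / fscale;
	printf("load %.2f -> sysinfo value %lu\n",
	    (double)ldavg / fscale, linux_load);
	return (0);
}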
int
linux_alarm(struct thread *td, struct linux_alarm_args *args)
{
struct itimerval it, old_it;
u_int secs;
int error;
#ifdef DEBUG
if (ldebug(alarm))
printf(ARGS(alarm, "%u"), args->secs);
#endif
secs = args->secs;
if (secs > INT_MAX)
secs = INT_MAX;
it.it_value.tv_sec = (long) secs;
it.it_value.tv_usec = 0;
it.it_interval.tv_sec = 0;
it.it_interval.tv_usec = 0;
error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
if (error)
return (error);
if (timevalisset(&old_it.it_value)) {
if (old_it.it_value.tv_usec != 0)
old_it.it_value.tv_sec++;
td->td_retval[0] = old_it.it_value.tv_sec;
}
return (0);
}
int
linux_brk(struct thread *td, struct linux_brk_args *args)
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_offset_t new, old;
struct obreak_args /* {
char * nsize;
} */ tmp;
#ifdef DEBUG
if (ldebug(brk))
printf(ARGS(brk, "%p"), (void *)(uintptr_t)args->dsend);
#endif
old = (vm_offset_t)vm->vm_daddr + ctob(vm->vm_dsize);
new = (vm_offset_t)args->dsend;
tmp.nsize = (char *)new;
- if (((caddr_t)new > vm->vm_daddr) && !obreak(td, &tmp))
+ if (((caddr_t)new > vm->vm_daddr) && !sys_obreak(td, &tmp))
td->td_retval[0] = (long)new;
else
td->td_retval[0] = (long)old;
return (0);
}
#if defined(__i386__)
/* XXX: what about amd64/linux32? */
int
linux_uselib(struct thread *td, struct linux_uselib_args *args)
{
struct nameidata ni;
struct vnode *vp;
struct exec *a_out;
struct vattr attr;
vm_offset_t vmaddr;
unsigned long file_offset;
vm_offset_t buffer;
unsigned long bss_size;
char *library;
int error;
int locked, vfslocked;
LCONVPATHEXIST(td, args->library, &library);
#ifdef DEBUG
if (ldebug(uselib))
printf(ARGS(uselib, "%s"), library);
#endif
a_out = NULL;
vfslocked = 0;
locked = 0;
vp = NULL;
NDINIT(&ni, LOOKUP, ISOPEN | FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_SYSSPACE, library, td);
error = namei(&ni);
LFREEPATH(library);
if (error)
goto cleanup;
vp = ni.ni_vp;
vfslocked = NDHASGIANT(&ni);
NDFREE(&ni, NDF_ONLY_PNBUF);
/*
* From here on down, we have a locked vnode that must be unlocked.
* XXX: The code below largely duplicates exec_check_permissions().
*/
locked = 1;
/* Writable? */
if (vp->v_writecount) {
error = ETXTBSY;
goto cleanup;
}
/* Executable? */
error = VOP_GETATTR(vp, &attr, td->td_ucred);
if (error)
goto cleanup;
if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
((attr.va_mode & 0111) == 0) || (attr.va_type != VREG)) {
/* EACCES is what exec(2) returns. */
error = ENOEXEC;
goto cleanup;
}
/* Sensible size? */
if (attr.va_size == 0) {
error = ENOEXEC;
goto cleanup;
}
/* Can we access it? */
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
if (error)
goto cleanup;
/*
* XXX: This should use vn_open() so that it is properly authorized,
* and to reduce code redundancy all over the place here.
* XXX: Not really, it duplicates far more of exec_check_permissions()
* than vn_open().
*/
#ifdef MAC
error = mac_vnode_check_open(td->td_ucred, vp, VREAD);
if (error)
goto cleanup;
#endif
error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
if (error)
goto cleanup;
/* Pull in executable header into kernel_map */
error = vm_mmap(kernel_map, (vm_offset_t *)&a_out, PAGE_SIZE,
VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp, 0);
if (error)
goto cleanup;
/* Is it a Linux binary? */
if (((a_out->a_magic >> 16) & 0xff) != 0x64) {
error = ENOEXEC;
goto cleanup;
}
/*
* While we are here, we should REALLY do some more checks
*/
/* Set file/virtual offset based on a.out variant. */
switch ((int)(a_out->a_magic & 0xffff)) {
case 0413: /* ZMAGIC */
file_offset = 1024;
break;
case 0314: /* QMAGIC */
file_offset = 0;
break;
default:
error = ENOEXEC;
goto cleanup;
}
bss_size = round_page(a_out->a_bss);
/* Check various fields in header for validity/bounds. */
if (a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) {
error = ENOEXEC;
goto cleanup;
}
/* text + data can't exceed file size */
if (a_out->a_data + a_out->a_text > attr.va_size) {
error = EFAULT;
goto cleanup;
}
/*
* text/data/bss must not exceed limits
* XXX - this is not complete. It should check current usage PLUS
* the resources needed by this library.
*/
PROC_LOCK(td->td_proc);
if (a_out->a_text > maxtsiz ||
a_out->a_data + bss_size > lim_cur(td->td_proc, RLIMIT_DATA) ||
racct_set(td->td_proc, RACCT_DATA, a_out->a_data +
bss_size) != 0) {
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto cleanup;
}
PROC_UNLOCK(td->td_proc);
/*
* Prevent more writers.
* XXX: Note that if any of the VM operations fail below we don't
* clear this flag.
*/
vp->v_vflag |= VV_TEXT;
/*
* Lock no longer needed
*/
locked = 0;
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
/*
* Check if file_offset is page aligned. Currently we cannot handle
* misaligned file offsets, and so we read in the entire image
* (what a waste).
*/
if (file_offset & PAGE_MASK) {
#ifdef DEBUG
printf("uselib: Non page aligned binary %lu\n", file_offset);
#endif
/* Map text+data read/write/execute */
/* a_entry is the load address and is page aligned */
vmaddr = trunc_page(a_out->a_entry);
/* get anon user mapping, read+write+execute */
error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
&vmaddr, a_out->a_text + a_out->a_data, FALSE, VM_PROT_ALL,
VM_PROT_ALL, 0);
if (error)
goto cleanup;
/* map file into kernel_map */
error = vm_mmap(kernel_map, &buffer,
round_page(a_out->a_text + a_out->a_data + file_offset),
VM_PROT_READ, VM_PROT_READ, 0, OBJT_VNODE, vp,
trunc_page(file_offset));
if (error)
goto cleanup;
/* copy from kernel VM space to user space */
error = copyout(PTRIN(buffer + file_offset),
(void *)vmaddr, a_out->a_text + a_out->a_data);
/* release temporary kernel space */
vm_map_remove(kernel_map, buffer, buffer +
round_page(a_out->a_text + a_out->a_data + file_offset));
if (error)
goto cleanup;
} else {
#ifdef DEBUG
printf("uselib: Page aligned binary %lu\n", file_offset);
#endif
/*
* for QMAGIC, a_entry is 20 bytes beyond the load address
* to skip the executable header
*/
vmaddr = trunc_page(a_out->a_entry);
/*
* Map it all into the process's space as a single
* copy-on-write "data" segment.
*/
error = vm_mmap(&td->td_proc->p_vmspace->vm_map, &vmaddr,
a_out->a_text + a_out->a_data, VM_PROT_ALL, VM_PROT_ALL,
MAP_PRIVATE | MAP_FIXED, OBJT_VNODE, vp, file_offset);
if (error)
goto cleanup;
}
#ifdef DEBUG
printf("mem=%08lx = %08lx %08lx\n", (long)vmaddr, ((long *)vmaddr)[0],
((long *)vmaddr)[1]);
#endif
if (bss_size != 0) {
/* Calculate BSS start address */
vmaddr = trunc_page(a_out->a_entry) + a_out->a_text +
a_out->a_data;
/* allocate some 'anon' space */
error = vm_map_find(&td->td_proc->p_vmspace->vm_map, NULL, 0,
&vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
if (error)
goto cleanup;
}
cleanup:
/* Unlock vnode if needed */
if (locked) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
}
/* Release the kernel mapping. */
if (a_out)
vm_map_remove(kernel_map, (vm_offset_t)a_out,
(vm_offset_t)a_out + PAGE_SIZE);
return (error);
}
#endif /* __i386__ */
int
linux_select(struct thread *td, struct linux_select_args *args)
{
l_timeval ltv;
struct timeval tv0, tv1, utv, *tvp;
int error;
#ifdef DEBUG
if (ldebug(select))
printf(ARGS(select, "%d, %p, %p, %p, %p"), args->nfds,
(void *)args->readfds, (void *)args->writefds,
(void *)args->exceptfds, (void *)args->timeout);
#endif
/*
* Store current time for computation of the amount of
* time left.
*/
if (args->timeout) {
if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
goto select_out;
utv.tv_sec = ltv.tv_sec;
utv.tv_usec = ltv.tv_usec;
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("incoming timeout (%jd/%ld)"),
(intmax_t)utv.tv_sec, utv.tv_usec);
#endif
if (itimerfix(&utv)) {
/*
* The timeval was invalid. Convert it to something
* valid that will act as it does under Linux.
*/
utv.tv_sec += utv.tv_usec / 1000000;
utv.tv_usec %= 1000000;
if (utv.tv_usec < 0) {
utv.tv_sec -= 1;
utv.tv_usec += 1000000;
}
if (utv.tv_sec < 0)
timevalclear(&utv);
}
microtime(&tv0);
tvp = &utv;
} else
tvp = NULL;
error = kern_select(td, args->nfds, args->readfds, args->writefds,
args->exceptfds, tvp, sizeof(l_int) * 8);
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("real select returns %d"), error);
#endif
if (error)
goto select_out;
if (args->timeout) {
if (td->td_retval[0]) {
/*
* Compute how much time was left of the timeout,
* by subtracting the current time and the time
* before we started the call, and subtracting
* that result from the user-supplied value.
*/
microtime(&tv1);
timevalsub(&tv1, &tv0);
timevalsub(&utv, &tv1);
if (utv.tv_sec < 0)
timevalclear(&utv);
} else
timevalclear(&utv);
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("outgoing timeout (%jd/%ld)"),
(intmax_t)utv.tv_sec, utv.tv_usec);
#endif
ltv.tv_sec = utv.tv_sec;
ltv.tv_usec = utv.tv_usec;
if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
goto select_out;
}
select_out:
#ifdef DEBUG
if (ldebug(select))
printf(LMSG("select_out -> %d"), error);
#endif
return (error);
}
int
linux_mremap(struct thread *td, struct linux_mremap_args *args)
{
struct munmap_args /* {
void *addr;
size_t len;
} */ bsd_args;
int error = 0;
#ifdef DEBUG
if (ldebug(mremap))
printf(ARGS(mremap, "%p, %08lx, %08lx, %08lx"),
(void *)(uintptr_t)args->addr,
(unsigned long)args->old_len,
(unsigned long)args->new_len,
(unsigned long)args->flags);
#endif
if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
td->td_retval[0] = 0;
return (EINVAL);
}
/*
* Check for the page alignment.
* Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
*/
if (args->addr & PAGE_MASK) {
td->td_retval[0] = 0;
return (EINVAL);
}
args->new_len = round_page(args->new_len);
args->old_len = round_page(args->old_len);
if (args->new_len > args->old_len) {
td->td_retval[0] = 0;
return (ENOMEM);
}
if (args->new_len < args->old_len) {
bsd_args.addr =
(caddr_t)((uintptr_t)args->addr + args->new_len);
bsd_args.len = args->old_len - args->new_len;
- error = munmap(td, &bsd_args);
+ error = sys_munmap(td, &bsd_args);
}
td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
return (error);
}
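linux_mremap() only rejects unaligned addresses and rounds both lengths up to a page boundary before deciding whether a trailing munmap() is needed. A standalone sketch of that alignment check and rounding, assuming a 4 KB page size purely for illustration (the kernel takes PAGE_MASK/round_page from the machine headers):
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (PAGE_SIZE - 1)	/* FreeBSD-style mask (Linux uses the complement) */
#define round_page(x) (((x) + PAGE_MASK) & ~PAGE_MASK)

int
main(void)
{
	unsigned long addr = 0x20001000UL;
	unsigned long old_len = 5000, new_len = 3000;

	if (addr & PAGE_MASK) {
		printf("EINVAL: address not page aligned\n");
		return (1);
	}
	old_len = round_page(old_len);	/* 5000 -> 8192 */
	new_len = round_page(new_len);	/* 3000 -> 4096 */
	if (new_len < old_len)
		printf("munmap %lu bytes at 0x%lx\n",
		    old_len - new_len, addr + new_len);
	return (0);
}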
#define LINUX_MS_ASYNC 0x0001
#define LINUX_MS_INVALIDATE 0x0002
#define LINUX_MS_SYNC 0x0004
int
linux_msync(struct thread *td, struct linux_msync_args *args)
{
struct msync_args bsd_args;
bsd_args.addr = (caddr_t)(uintptr_t)args->addr;
bsd_args.len = (uintptr_t)args->len;
bsd_args.flags = args->fl & ~LINUX_MS_SYNC;
- return (msync(td, &bsd_args));
+ return (sys_msync(td, &bsd_args));
}
int
linux_time(struct thread *td, struct linux_time_args *args)
{
struct timeval tv;
l_time_t tm;
int error;
#ifdef DEBUG
if (ldebug(time))
printf(ARGS(time, "*"));
#endif
microtime(&tv);
tm = tv.tv_sec;
if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
return (error);
td->td_retval[0] = tm;
return (0);
}
struct l_times_argv {
l_clock_t tms_utime;
l_clock_t tms_stime;
l_clock_t tms_cutime;
l_clock_t tms_cstime;
};
/*
* Glibc versions prior to 2.2.1 always use a hard-coded CLK_TCK value.
* Since 2.2.1 Glibc uses the value exported from the kernel via AT_CLKTCK
* auxiliary vector entry.
*/
#define CLK_TCK 100
#define CONVOTCK(r) (r.tv_sec * CLK_TCK + r.tv_usec / (1000000 / CLK_TCK))
#define CONVNTCK(r) (r.tv_sec * stclohz + r.tv_usec / (1000000 / stclohz))
#define CONVTCK(r) (linux_kernver(td) >= LINUX_KERNVER_2004000 ? \
CONVNTCK(r) : CONVOTCK(r))
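The CONVOTCK()/CONVNTCK() macros above turn a struct timeval into clock ticks, using the hard-coded CLK_TCK of 100 for pre-2.4 Linux ABIs and the exported statistics clock frequency (stclohz) for newer ones. A standalone sketch of the same conversion, using 128 as a purely illustrative stclohz value:
#include <stdio.h>
#include <sys/time.h>

static long
conv_tck(struct timeval tv, long hz)
{
	/* Same arithmetic as CONVOTCK()/CONVNTCK() above. */
	return (tv.tv_sec * hz + tv.tv_usec / (1000000 / hz));
}

int
main(void)
{
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };

	printf("old ABI (CLK_TCK=100): %ld ticks\n", conv_tck(tv, 100));
	printf("new ABI (stclohz=128): %ld ticks\n", conv_tck(tv, 128));
	return (0);
}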
int
linux_times(struct thread *td, struct linux_times_args *args)
{
struct timeval tv, utime, stime, cutime, cstime;
struct l_times_argv tms;
struct proc *p;
int error;
#ifdef DEBUG
if (ldebug(times))
printf(ARGS(times, "*"));
#endif
if (args->buf != NULL) {
p = td->td_proc;
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &utime, &stime);
PROC_SUNLOCK(p);
calccru(p, &cutime, &cstime);
PROC_UNLOCK(p);
tms.tms_utime = CONVTCK(utime);
tms.tms_stime = CONVTCK(stime);
tms.tms_cutime = CONVTCK(cutime);
tms.tms_cstime = CONVTCK(cstime);
if ((error = copyout(&tms, args->buf, sizeof(tms))))
return (error);
}
microuptime(&tv);
td->td_retval[0] = (int)CONVTCK(tv);
return (0);
}
int
linux_newuname(struct thread *td, struct linux_newuname_args *args)
{
struct l_new_utsname utsname;
char osname[LINUX_MAX_UTSNAME];
char osrelease[LINUX_MAX_UTSNAME];
char *p;
#ifdef DEBUG
if (ldebug(newuname))
printf(ARGS(newuname, "*"));
#endif
linux_get_osname(td, osname);
linux_get_osrelease(td, osrelease);
bzero(&utsname, sizeof(utsname));
strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
for (p = utsname.version; *p != '\0'; ++p)
if (*p == '\n') {
*p = '\0';
break;
}
strlcpy(utsname.machine, linux_platform, LINUX_MAX_UTSNAME);
return (copyout(&utsname, args->buf, sizeof(utsname)));
}
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
struct l_utimbuf {
l_time_t l_actime;
l_time_t l_modtime;
};
int
linux_utime(struct thread *td, struct linux_utime_args *args)
{
struct timeval tv[2], *tvp;
struct l_utimbuf lut;
char *fname;
int error;
LCONVPATHEXIST(td, args->fname, &fname);
#ifdef DEBUG
if (ldebug(utime))
printf(ARGS(utime, "%s, *"), fname);
#endif
if (args->times) {
if ((error = copyin(args->times, &lut, sizeof lut))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = lut.l_actime;
tv[0].tv_usec = 0;
tv[1].tv_sec = lut.l_modtime;
tv[1].tv_usec = 0;
tvp = tv;
} else
tvp = NULL;
error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
int
linux_utimes(struct thread *td, struct linux_utimes_args *args)
{
l_timeval ltv[2];
struct timeval tv[2], *tvp = NULL;
char *fname;
int error;
LCONVPATHEXIST(td, args->fname, &fname);
#ifdef DEBUG
if (ldebug(utimes))
printf(ARGS(utimes, "%s, *"), fname);
#endif
if (args->tptr != NULL) {
if ((error = copyin(args->tptr, ltv, sizeof ltv))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = ltv[0].tv_sec;
tv[0].tv_usec = ltv[0].tv_usec;
tv[1].tv_sec = ltv[1].tv_sec;
tv[1].tv_usec = ltv[1].tv_usec;
tvp = tv;
}
error = kern_utimes(td, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
int
linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
{
l_timeval ltv[2];
struct timeval tv[2], *tvp = NULL;
char *fname;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHEXIST_AT(td, args->filename, &fname, dfd);
#ifdef DEBUG
if (ldebug(futimesat))
printf(ARGS(futimesat, "%s, *"), fname);
#endif
if (args->utimes != NULL) {
if ((error = copyin(args->utimes, ltv, sizeof ltv))) {
LFREEPATH(fname);
return (error);
}
tv[0].tv_sec = ltv[0].tv_sec;
tv[0].tv_usec = ltv[0].tv_usec;
tv[1].tv_sec = ltv[1].tv_sec;
tv[1].tv_usec = ltv[1].tv_usec;
tvp = tv;
}
error = kern_utimesat(td, dfd, fname, UIO_SYSSPACE, tvp, UIO_SYSSPACE);
LFREEPATH(fname);
return (error);
}
#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
int
linux_common_wait(struct thread *td, int pid, int *status,
int options, struct rusage *ru)
{
int error, tmpstat;
error = kern_wait(td, pid, &tmpstat, options, ru);
if (error)
return (error);
if (status) {
tmpstat &= 0xffff;
if (WIFSIGNALED(tmpstat))
tmpstat = (tmpstat & 0xffffff80) |
BSD_TO_LINUX_SIGNAL(WTERMSIG(tmpstat));
else if (WIFSTOPPED(tmpstat))
tmpstat = (tmpstat & 0xffff00ff) |
(BSD_TO_LINUX_SIGNAL(WSTOPSIG(tmpstat)) << 8);
error = copyout(&tmpstat, status, sizeof(int));
}
return (error);
}
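linux_common_wait() keeps the layout of the wait(2) status word but overwrites the embedded signal number with its Linux equivalent: the low 7 bits for a terminated child, bits 8-15 for a stopped one. A standalone sketch of that masking (the identity signal mapping here is an illustrative stand-in for BSD_TO_LINUX_SIGNAL()):
#include <stdio.h>
#include <sys/wait.h>

int
main(void)
{
	int status = 0x0086;	/* killed by signal 6 with a core dump */
	int lsig = 6;		/* assume an identity mapping for the demo */

	if (WIFSIGNALED(status))
		status = (status & 0xffffff80) | lsig;
	else if (WIFSTOPPED(status))
		status = (status & 0xffff00ff) | (lsig << 8);
	printf("converted status 0x%04x\n", status);
	return (0);
}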
int
linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
{
int options;
#ifdef DEBUG
if (ldebug(waitpid))
printf(ARGS(waitpid, "%d, %p, %d"),
args->pid, (void *)args->status, args->options);
#endif
/*
* This is necessary because the test in kern_wait() doesn't work,
* since we mess with the options here.
*/
if (args->options & ~(WUNTRACED | WNOHANG | WCONTINUED | __WCLONE))
return (EINVAL);
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but we make sure */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
return (linux_common_wait(td, args->pid, args->status, options, NULL));
}
int
linux_mknod(struct thread *td, struct linux_mknod_args *args)
{
char *path;
int error;
LCONVPATHCREAT(td, args->path, &path);
#ifdef DEBUG
if (ldebug(mknod))
printf(ARGS(mknod, "%s, %d, %d"), path, args->mode, args->dev);
#endif
switch (args->mode & S_IFMT) {
case S_IFIFO:
case S_IFSOCK:
error = kern_mkfifo(td, path, UIO_SYSSPACE, args->mode);
break;
case S_IFCHR:
case S_IFBLK:
error = kern_mknod(td, path, UIO_SYSSPACE, args->mode,
args->dev);
break;
case S_IFDIR:
error = EPERM;
break;
case 0:
args->mode |= S_IFREG;
/* FALLTHROUGH */
case S_IFREG:
error = kern_open(td, path, UIO_SYSSPACE,
O_WRONLY | O_CREAT | O_TRUNC, args->mode);
if (error == 0)
kern_close(td, td->td_retval[0]);
break;
default:
error = EINVAL;
break;
}
LFREEPATH(path);
return (error);
}
int
linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
{
char *path;
int error, dfd;
dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
LCONVPATHCREAT_AT(td, args->filename, &path, dfd);
#ifdef DEBUG
if (ldebug(mknodat))
printf(ARGS(mknodat, "%s, %d, %d"), path, args->mode, args->dev);
#endif
switch (args->mode & S_IFMT) {
case S_IFIFO:
case S_IFSOCK:
error = kern_mkfifoat(td, dfd, path, UIO_SYSSPACE, args->mode);
break;
case S_IFCHR:
case S_IFBLK:
error = kern_mknodat(td, dfd, path, UIO_SYSSPACE, args->mode,
args->dev);
break;
case S_IFDIR:
error = EPERM;
break;
case 0:
args->mode |= S_IFREG;
/* FALLTHROUGH */
case S_IFREG:
error = kern_openat(td, dfd, path, UIO_SYSSPACE,
O_WRONLY | O_CREAT | O_TRUNC, args->mode);
if (error == 0)
kern_close(td, td->td_retval[0]);
break;
default:
error = EINVAL;
break;
}
LFREEPATH(path);
return (error);
}
/*
* UGH! This is just about the dumbest idea I've ever heard!!
*/
int
linux_personality(struct thread *td, struct linux_personality_args *args)
{
#ifdef DEBUG
if (ldebug(personality))
printf(ARGS(personality, "%lu"), (unsigned long)args->per);
#endif
if (args->per != 0)
return (EINVAL);
/* Yes Jim, it's still a Linux... */
td->td_retval[0] = 0;
return (0);
}
struct l_itimerval {
l_timeval it_interval;
l_timeval it_value;
};
#define B2L_ITIMERVAL(bip, lip) \
(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec; \
(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec; \
(bip)->it_value.tv_sec = (lip)->it_value.tv_sec; \
(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;
int
linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
{
int error;
struct l_itimerval ls;
struct itimerval aitv, oitv;
#ifdef DEBUG
if (ldebug(setitimer))
printf(ARGS(setitimer, "%p, %p"),
(void *)uap->itv, (void *)uap->oitv);
#endif
if (uap->itv == NULL) {
uap->itv = uap->oitv;
return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
}
error = copyin(uap->itv, &ls, sizeof(ls));
if (error != 0)
return (error);
B2L_ITIMERVAL(&aitv, &ls);
#ifdef DEBUG
if (ldebug(setitimer)) {
printf("setitimer: value: sec: %jd, usec: %ld\n",
(intmax_t)aitv.it_value.tv_sec, aitv.it_value.tv_usec);
printf("setitimer: interval: sec: %jd, usec: %ld\n",
(intmax_t)aitv.it_interval.tv_sec, aitv.it_interval.tv_usec);
}
#endif
error = kern_setitimer(td, uap->which, &aitv, &oitv);
if (error != 0 || uap->oitv == NULL)
return (error);
B2L_ITIMERVAL(&ls, &oitv);
return (copyout(&ls, uap->oitv, sizeof(ls)));
}
int
linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
{
int error;
struct l_itimerval ls;
struct itimerval aitv;
#ifdef DEBUG
if (ldebug(getitimer))
printf(ARGS(getitimer, "%p"), (void *)uap->itv);
#endif
error = kern_getitimer(td, uap->which, &aitv);
if (error != 0)
return (error);
B2L_ITIMERVAL(&ls, &aitv);
return (copyout(&ls, uap->itv, sizeof(ls)));
}
int
linux_nice(struct thread *td, struct linux_nice_args *args)
{
struct setpriority_args bsd_args;
bsd_args.which = PRIO_PROCESS;
bsd_args.who = 0; /* current process */
bsd_args.prio = args->inc;
- return (setpriority(td, &bsd_args));
+ return (sys_setpriority(td, &bsd_args));
}
int
linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
{
struct ucred *newcred, *oldcred;
l_gid_t *linux_gidset;
gid_t *bsd_gidset;
int ngrp, error;
struct proc *p;
ngrp = args->gidsetsize;
if (ngrp < 0 || ngrp >= ngroups_max + 1)
return (EINVAL);
linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK);
error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
if (error)
goto out;
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
/*
* cr_groups[0] holds egid. Setting the whole set from
* the supplied set will cause egid to be changed too.
* Keep cr_groups[0] unchanged to prevent that.
*/
if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
if (ngrp > 0) {
newcred->cr_ngroups = ngrp + 1;
bsd_gidset = newcred->cr_groups;
ngrp--;
while (ngrp >= 0) {
bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
ngrp--;
}
} else
newcred->cr_ngroups = 1;
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
error = 0;
out:
free(linux_gidset, M_TEMP);
return (error);
}
int
linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
{
struct ucred *cred;
l_gid_t *linux_gidset;
gid_t *bsd_gidset;
int bsd_gidsetsz, ngrp, error;
cred = td->td_ucred;
bsd_gidset = cred->cr_groups;
bsd_gidsetsz = cred->cr_ngroups - 1;
/*
* cr_groups[0] holds egid. Returning the whole set
* here will cause a duplicate. Exclude cr_groups[0]
* to prevent that.
*/
if ((ngrp = args->gidsetsize) == 0) {
td->td_retval[0] = bsd_gidsetsz;
return (0);
}
if (ngrp < bsd_gidsetsz)
return (EINVAL);
ngrp = 0;
linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
M_TEMP, M_WAITOK);
while (ngrp < bsd_gidsetsz) {
linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
ngrp++;
}
error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
free(linux_gidset, M_TEMP);
if (error)
return (error);
td->td_retval[0] = ngrp;
return (0);
}
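Both group-list handlers above work around cr_groups[0] holding the effective gid: setgroups copies the Linux list into slots 1..n and getgroups reports from slot 1 onward. A standalone sketch of that off-by-one copy (the gid values are arbitrary illustrations):
#include <stdio.h>

int
main(void)
{
	unsigned int cr_groups[8] = { 1000 };	/* slot 0: effective gid */
	unsigned int linux_gidset[3] = { 10, 20, 30 };
	int ngrp = 3, i;

	/* setgroups direction: keep cr_groups[0], fill slots 1..ngrp. */
	for (i = 0; i < ngrp; i++)
		cr_groups[i + 1] = linux_gidset[i];

	/* getgroups direction: skip cr_groups[0] when reporting. */
	for (i = 0; i < ngrp; i++)
		printf("supplementary group %u\n", cr_groups[i + 1]);
	return (0);
}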
int
linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
{
struct rlimit bsd_rlim;
struct l_rlimit rlim;
u_int which;
int error;
#ifdef DEBUG
if (ldebug(setrlimit))
printf(ARGS(setrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
error = copyin(args->rlim, &rlim, sizeof(rlim));
if (error)
return (error);
bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
return (kern_setrlimit(td, which, &bsd_rlim));
}
int
linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
{
struct l_rlimit rlim;
struct proc *p = td->td_proc;
struct rlimit bsd_rlim;
u_int which;
#ifdef DEBUG
if (ldebug(old_getrlimit))
printf(ARGS(old_getrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
PROC_LOCK(p);
lim_rlimit(p, which, &bsd_rlim);
PROC_UNLOCK(p);
#ifdef COMPAT_LINUX32
rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
if (rlim.rlim_cur == UINT_MAX)
rlim.rlim_cur = INT_MAX;
rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
if (rlim.rlim_max == UINT_MAX)
rlim.rlim_max = INT_MAX;
#else
rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
if (rlim.rlim_cur == ULONG_MAX)
rlim.rlim_cur = LONG_MAX;
rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
if (rlim.rlim_max == ULONG_MAX)
rlim.rlim_max = LONG_MAX;
#endif
return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
int
linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
{
struct l_rlimit rlim;
struct proc *p = td->td_proc;
struct rlimit bsd_rlim;
u_int which;
#ifdef DEBUG
if (ldebug(getrlimit))
printf(ARGS(getrlimit, "%d, %p"),
args->resource, (void *)args->rlim);
#endif
if (args->resource >= LINUX_RLIM_NLIMITS)
return (EINVAL);
which = linux_to_bsd_resource[args->resource];
if (which == -1)
return (EINVAL);
PROC_LOCK(p);
lim_rlimit(p, which, &bsd_rlim);
PROC_UNLOCK(p);
rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
return (copyout(&rlim, args->rlim, sizeof(rlim)));
}
int
linux_sched_setscheduler(struct thread *td,
struct linux_sched_setscheduler_args *args)
{
struct sched_setscheduler_args bsd;
#ifdef DEBUG
if (ldebug(sched_setscheduler))
printf(ARGS(sched_setscheduler, "%d, %d, %p"),
args->pid, args->policy, (const void *)args->param);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return (EINVAL);
}
bsd.pid = args->pid;
bsd.param = (struct sched_param *)args->param;
- return (sched_setscheduler(td, &bsd));
+ return (sys_sched_setscheduler(td, &bsd));
}
int
linux_sched_getscheduler(struct thread *td,
struct linux_sched_getscheduler_args *args)
{
struct sched_getscheduler_args bsd;
int error;
#ifdef DEBUG
if (ldebug(sched_getscheduler))
printf(ARGS(sched_getscheduler, "%d"), args->pid);
#endif
bsd.pid = args->pid;
- error = sched_getscheduler(td, &bsd);
+ error = sys_sched_getscheduler(td, &bsd);
switch (td->td_retval[0]) {
case SCHED_OTHER:
td->td_retval[0] = LINUX_SCHED_OTHER;
break;
case SCHED_FIFO:
td->td_retval[0] = LINUX_SCHED_FIFO;
break;
case SCHED_RR:
td->td_retval[0] = LINUX_SCHED_RR;
break;
}
return (error);
}
int
linux_sched_get_priority_max(struct thread *td,
struct linux_sched_get_priority_max_args *args)
{
struct sched_get_priority_max_args bsd;
#ifdef DEBUG
if (ldebug(sched_get_priority_max))
printf(ARGS(sched_get_priority_max, "%d"), args->policy);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return (EINVAL);
}
- return (sched_get_priority_max(td, &bsd));
+ return (sys_sched_get_priority_max(td, &bsd));
}
int
linux_sched_get_priority_min(struct thread *td,
struct linux_sched_get_priority_min_args *args)
{
struct sched_get_priority_min_args bsd;
#ifdef DEBUG
if (ldebug(sched_get_priority_min))
printf(ARGS(sched_get_priority_min, "%d"), args->policy);
#endif
switch (args->policy) {
case LINUX_SCHED_OTHER:
bsd.policy = SCHED_OTHER;
break;
case LINUX_SCHED_FIFO:
bsd.policy = SCHED_FIFO;
break;
case LINUX_SCHED_RR:
bsd.policy = SCHED_RR;
break;
default:
return (EINVAL);
}
- return (sched_get_priority_min(td, &bsd));
+ return (sys_sched_get_priority_min(td, &bsd));
}
#define REBOOT_CAD_ON 0x89abcdef
#define REBOOT_CAD_OFF 0
#define REBOOT_HALT 0xcdef0123
#define REBOOT_RESTART 0x01234567
#define REBOOT_RESTART2 0xA1B2C3D4
#define REBOOT_POWEROFF 0x4321FEDC
#define REBOOT_MAGIC1 0xfee1dead
#define REBOOT_MAGIC2 0x28121969
#define REBOOT_MAGIC2A 0x05121996
#define REBOOT_MAGIC2B 0x16041998
int
linux_reboot(struct thread *td, struct linux_reboot_args *args)
{
struct reboot_args bsd_args;
#ifdef DEBUG
if (ldebug(reboot))
printf(ARGS(reboot, "0x%x"), args->cmd);
#endif
if (args->magic1 != REBOOT_MAGIC1)
return (EINVAL);
switch (args->magic2) {
case REBOOT_MAGIC2:
case REBOOT_MAGIC2A:
case REBOOT_MAGIC2B:
break;
default:
return (EINVAL);
}
switch (args->cmd) {
case REBOOT_CAD_ON:
case REBOOT_CAD_OFF:
return (priv_check(td, PRIV_REBOOT));
case REBOOT_HALT:
bsd_args.opt = RB_HALT;
break;
case REBOOT_RESTART:
case REBOOT_RESTART2:
bsd_args.opt = 0;
break;
case REBOOT_POWEROFF:
bsd_args.opt = RB_POWEROFF;
break;
default:
return (EINVAL);
}
- return (reboot(td, &bsd_args));
+ return (sys_reboot(td, &bsd_args));
}
/*
* The FreeBSD native getpid(2), getgid(2) and getuid(2) also modify
* td->td_retval[1] when COMPAT_43 is defined. This clobbers registers that
* are assumed to be preserved. The following lightweight syscalls fix
* this. See also linux_getgid16() and linux_getuid16() in linux_uid16.c
*
* linux_getpid() - MP SAFE
* linux_getgid() - MP SAFE
* linux_getuid() - MP SAFE
*/
int
linux_getpid(struct thread *td, struct linux_getpid_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(getpid))
printf(ARGS(getpid, ""));
#endif
if (linux_use26(td)) {
em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getpid: emuldata not found.\n"));
td->td_retval[0] = em->shared->group_pid;
} else {
td->td_retval[0] = td->td_proc->p_pid;
}
return (0);
}
int
linux_gettid(struct thread *td, struct linux_gettid_args *args)
{
#ifdef DEBUG
if (ldebug(gettid))
printf(ARGS(gettid, ""));
#endif
td->td_retval[0] = td->td_proc->p_pid;
return (0);
}
int
linux_getppid(struct thread *td, struct linux_getppid_args *args)
{
struct linux_emuldata *em;
struct proc *p, *pp;
#ifdef DEBUG
if (ldebug(getppid))
printf(ARGS(getppid, ""));
#endif
if (!linux_use26(td)) {
PROC_LOCK(td->td_proc);
td->td_retval[0] = td->td_proc->p_pptr->p_pid;
PROC_UNLOCK(td->td_proc);
return (0);
}
em = em_find(td->td_proc, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getppid: process emuldata not found.\n"));
/* find the group leader */
p = pfind(em->shared->group_pid);
if (p == NULL) {
#ifdef DEBUG
printf(LMSG("parent process not found.\n"));
#endif
return (0);
}
pp = p->p_pptr; /* switch to parent */
PROC_LOCK(pp);
PROC_UNLOCK(p);
/* if it's also a Linux process */
if (pp->p_sysent == &elf_linux_sysvec) {
em = em_find(pp, EMUL_DONTLOCK);
KASSERT(em != NULL, ("getppid: parent emuldata not found.\n"));
td->td_retval[0] = em->shared->group_pid;
} else
td->td_retval[0] = pp->p_pid;
PROC_UNLOCK(pp);
return (0);
}
int
linux_getgid(struct thread *td, struct linux_getgid_args *args)
{
#ifdef DEBUG
if (ldebug(getgid))
printf(ARGS(getgid, ""));
#endif
td->td_retval[0] = td->td_ucred->cr_rgid;
return (0);
}
int
linux_getuid(struct thread *td, struct linux_getuid_args *args)
{
#ifdef DEBUG
if (ldebug(getuid))
printf(ARGS(getuid, ""));
#endif
td->td_retval[0] = td->td_ucred->cr_ruid;
return (0);
}
int
linux_getsid(struct thread *td, struct linux_getsid_args *args)
{
struct getsid_args bsd;
#ifdef DEBUG
if (ldebug(getsid))
printf(ARGS(getsid, "%i"), args->pid);
#endif
bsd.pid = args->pid;
- return (getsid(td, &bsd));
+ return (sys_getsid(td, &bsd));
}
int
linux_nosys(struct thread *td, struct nosys_args *ignore)
{
return (ENOSYS);
}
int
linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
{
struct getpriority_args bsd_args;
int error;
#ifdef DEBUG
if (ldebug(getpriority))
printf(ARGS(getpriority, "%i, %i"), args->which, args->who);
#endif
bsd_args.which = args->which;
bsd_args.who = args->who;
- error = getpriority(td, &bsd_args);
+ error = sys_getpriority(td, &bsd_args);
td->td_retval[0] = 20 - td->td_retval[0];
return (error);
}
int
linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
{
int name[2];
#ifdef DEBUG
if (ldebug(sethostname))
printf(ARGS(sethostname, "*, %i"), args->len);
#endif
name[0] = CTL_KERN;
name[1] = KERN_HOSTNAME;
return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
args->len, 0, 0));
}
int
linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
{
int name[2];
#ifdef DEBUG
if (ldebug(setdomainname))
printf(ARGS(setdomainname, "*, %i"), args->len);
#endif
name[0] = CTL_KERN;
name[1] = KERN_NISDOMAINNAME;
return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
args->len, 0, 0));
}
int
linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
{
struct linux_emuldata *em;
#ifdef DEBUG
if (ldebug(exit_group))
printf(ARGS(exit_group, "%i"), args->error_code);
#endif
em = em_find(td->td_proc, EMUL_DONTLOCK);
if (em->shared->refs > 1) {
EMUL_SHARED_WLOCK(&emul_shared_lock);
em->shared->flags |= EMUL_SHARED_HASXSTAT;
em->shared->xstat = W_EXITCODE(args->error_code, 0);
EMUL_SHARED_WUNLOCK(&emul_shared_lock);
if (linux_use26(td))
linux_kill_threads(td, SIGKILL);
}
/*
* XXX: we should send a signal to the parent if
* SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
* as it doesn't occur often.
*/
exit1(td, W_EXITCODE(args->error_code, 0));
return (0);
}
#define _LINUX_CAPABILITY_VERSION 0x19980330
struct l_user_cap_header {
l_int version;
l_int pid;
};
struct l_user_cap_data {
l_int effective;
l_int permitted;
l_int inheritable;
};
int
linux_capget(struct thread *td, struct linux_capget_args *args)
{
struct l_user_cap_header luch;
struct l_user_cap_data lucd;
int error;
if (args->hdrp == NULL)
return (EFAULT);
error = copyin(args->hdrp, &luch, sizeof(luch));
if (error != 0)
return (error);
if (luch.version != _LINUX_CAPABILITY_VERSION) {
luch.version = _LINUX_CAPABILITY_VERSION;
error = copyout(&luch, args->hdrp, sizeof(luch));
if (error)
return (error);
return (EINVAL);
}
if (luch.pid)
return (EPERM);
if (args->datap) {
/*
* The current implementation doesn't support setting
* a capability (it's essentially a stub) so indicate
* that no capabilities are currently set or available
* to request.
*/
bzero (&lucd, sizeof(lucd));
error = copyout(&lucd, args->datap, sizeof(lucd));
}
return (error);
}
int
linux_capset(struct thread *td, struct linux_capset_args *args)
{
struct l_user_cap_header luch;
struct l_user_cap_data lucd;
int error;
if (args->hdrp == NULL || args->datap == NULL)
return (EFAULT);
error = copyin(args->hdrp, &luch, sizeof(luch));
if (error != 0)
return (error);
if (luch.version != _LINUX_CAPABILITY_VERSION) {
luch.version = _LINUX_CAPABILITY_VERSION;
error = copyout(&luch, args->hdrp, sizeof(luch));
if (error)
return (error);
return (EINVAL);
}
if (luch.pid)
return (EPERM);
error = copyin(args->datap, &lucd, sizeof(lucd));
if (error != 0)
return (error);
/* We currently don't support setting any capabilities. */
if (lucd.effective || lucd.permitted || lucd.inheritable) {
linux_msg(td,
"capset effective=0x%x, permitted=0x%x, "
"inheritable=0x%x is not implemented",
(int)lucd.effective, (int)lucd.permitted,
(int)lucd.inheritable);
return (EPERM);
}
return (0);
}
int
linux_prctl(struct thread *td, struct linux_prctl_args *args)
{
int error = 0, max_size;
struct proc *p = td->td_proc;
char comm[LINUX_MAX_COMM_LEN];
struct linux_emuldata *em;
int pdeath_signal;
#ifdef DEBUG
if (ldebug(prctl))
printf(ARGS(prctl, "%d, %d, %d, %d, %d"), args->option,
args->arg2, args->arg3, args->arg4, args->arg5);
#endif
switch (args->option) {
case LINUX_PR_SET_PDEATHSIG:
if (!LINUX_SIG_VALID(args->arg2))
return (EINVAL);
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
em->pdeath_signal = args->arg2;
EMUL_UNLOCK(&emul_lock);
break;
case LINUX_PR_GET_PDEATHSIG:
em = em_find(p, EMUL_DOLOCK);
KASSERT(em != NULL, ("prctl: emuldata not found.\n"));
pdeath_signal = em->pdeath_signal;
EMUL_UNLOCK(&emul_lock);
error = copyout(&pdeath_signal,
(void *)(register_t)args->arg2,
sizeof(pdeath_signal));
break;
case LINUX_PR_GET_KEEPCAPS:
/*
* Indicate that we always clear the effective and
* permitted capability sets when the user id becomes
* non-zero (actually the capability sets are simply
* always zero in the current implementation).
*/
td->td_retval[0] = 0;
break;
case LINUX_PR_SET_KEEPCAPS:
/*
* Ignore requests to keep the effective and permitted
* capability sets when the user id becomes non-zero.
*/
break;
case LINUX_PR_SET_NAME:
/*
* To be on the safe side we need to make sure not to
* overflow the size a Linux program expects. We already
* do this here in the copyin, so that we don't need to
* check on copyout.
*/
max_size = MIN(sizeof(comm), sizeof(p->p_comm));
error = copyinstr((void *)(register_t)args->arg2, comm,
max_size, NULL);
/* Linux silently truncates the name if it is too long. */
if (error == ENAMETOOLONG) {
/*
* XXX: copyinstr() isn't documented to populate the
* array completely, so do a copyin() to be on the
* safe side. This should be changed in case
* copyinstr() is changed to guarantee this.
*/
error = copyin((void *)(register_t)args->arg2, comm,
max_size - 1);
comm[max_size - 1] = '\0';
}
if (error)
return (error);
PROC_LOCK(p);
strlcpy(p->p_comm, comm, sizeof(p->p_comm));
PROC_UNLOCK(p);
break;
case LINUX_PR_GET_NAME:
PROC_LOCK(p);
strlcpy(comm, p->p_comm, sizeof(comm));
PROC_UNLOCK(p);
error = copyout(comm, (void *)(register_t)args->arg2,
strlen(comm) + 1);
break;
default:
error = EINVAL;
break;
}
return (error);
}
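For context, here is a hedged Linux-side sketch of the PR_SET_NAME/PR_GET_NAME pair handled above. It assumes the usual 16-byte (including NUL) Linux comm limit; a longer name is accepted and silently truncated, which is exactly the case the copyinstr()/copyin() fallback above deals with. Illustrative only, not part of this change.

#include <sys/prctl.h>
#include <stdio.h>

int
main(void)
{
	char name[16];	/* Linux limits thread names to 16 bytes incl. NUL */

	/* A longer name is accepted and silently truncated by the kernel. */
	prctl(PR_SET_NAME, "a-very-long-thread-name");
	prctl(PR_GET_NAME, name);
	printf("%s\n", name);	/* prints the truncated name */
	return (0);
}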
/*
* Get affinity of a process.
*/
int
linux_sched_getaffinity(struct thread *td,
struct linux_sched_getaffinity_args *args)
{
int error;
struct cpuset_getaffinity_args cga;
#ifdef DEBUG
if (ldebug(sched_getaffinity))
printf(ARGS(sched_getaffinity, "%d, %d, *"), args->pid,
args->len);
#endif
if (args->len < sizeof(cpuset_t))
return (EINVAL);
cga.level = CPU_LEVEL_WHICH;
cga.which = CPU_WHICH_PID;
cga.id = args->pid;
cga.cpusetsize = sizeof(cpuset_t);
cga.mask = (cpuset_t *) args->user_mask_ptr;
- if ((error = cpuset_getaffinity(td, &cga)) == 0)
+ if ((error = sys_cpuset_getaffinity(td, &cga)) == 0)
td->td_retval[0] = sizeof(cpuset_t);
return (error);
}
/*
* Set affinity of a process.
*/
int
linux_sched_setaffinity(struct thread *td,
struct linux_sched_setaffinity_args *args)
{
struct cpuset_setaffinity_args csa;
#ifdef DEBUG
if (ldebug(sched_setaffinity))
printf(ARGS(sched_setaffinity, "%d, %d, *"), args->pid,
args->len);
#endif
if (args->len < sizeof(cpuset_t))
return (EINVAL);
csa.level = CPU_LEVEL_WHICH;
csa.which = CPU_WHICH_PID;
csa.id = args->pid;
csa.cpusetsize = sizeof(cpuset_t);
csa.mask = (cpuset_t *) args->user_mask_ptr;
- return (cpuset_setaffinity(td, &csa));
+ return (sys_cpuset_setaffinity(td, &csa));
}
Index: head/sys/compat/linux/linux_signal.c
===================================================================
--- head/sys/compat/linux/linux_signal.c (revision 225616)
+++ head/sys/compat/linux/linux_signal.c (revision 225617)
@@ -1,656 +1,656 @@
/*-
* Copyright (c) 1994-1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <security/audit/audit.h>
#include "opt_compat.h"
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_emul.h>
void
linux_to_bsd_sigset(l_sigset_t *lss, sigset_t *bss)
{
int b, l;
SIGEMPTYSET(*bss);
bss->__bits[0] = lss->__bits[0] & ~((1U << LINUX_SIGTBLSZ) - 1);
bss->__bits[1] = lss->__bits[1];
for (l = 1; l <= LINUX_SIGTBLSZ; l++) {
if (LINUX_SIGISMEMBER(*lss, l)) {
b = linux_to_bsd_signal[_SIG_IDX(l)];
if (b)
SIGADDSET(*bss, b);
}
}
}
void
bsd_to_linux_sigset(sigset_t *bss, l_sigset_t *lss)
{
int b, l;
LINUX_SIGEMPTYSET(*lss);
lss->__bits[0] = bss->__bits[0] & ~((1U << LINUX_SIGTBLSZ) - 1);
lss->__bits[1] = bss->__bits[1];
for (b = 1; b <= LINUX_SIGTBLSZ; b++) {
if (SIGISMEMBER(*bss, b)) {
l = bsd_to_linux_signal[_SIG_IDX(b)];
if (l)
LINUX_SIGADDSET(*lss, l);
}
}
}
static void
linux_to_bsd_sigaction(l_sigaction_t *lsa, struct sigaction *bsa)
{
linux_to_bsd_sigset(&lsa->lsa_mask, &bsa->sa_mask);
bsa->sa_handler = PTRIN(lsa->lsa_handler);
bsa->sa_flags = 0;
if (lsa->lsa_flags & LINUX_SA_NOCLDSTOP)
bsa->sa_flags |= SA_NOCLDSTOP;
if (lsa->lsa_flags & LINUX_SA_NOCLDWAIT)
bsa->sa_flags |= SA_NOCLDWAIT;
if (lsa->lsa_flags & LINUX_SA_SIGINFO)
bsa->sa_flags |= SA_SIGINFO;
if (lsa->lsa_flags & LINUX_SA_ONSTACK)
bsa->sa_flags |= SA_ONSTACK;
if (lsa->lsa_flags & LINUX_SA_RESTART)
bsa->sa_flags |= SA_RESTART;
if (lsa->lsa_flags & LINUX_SA_ONESHOT)
bsa->sa_flags |= SA_RESETHAND;
if (lsa->lsa_flags & LINUX_SA_NOMASK)
bsa->sa_flags |= SA_NODEFER;
}
static void
bsd_to_linux_sigaction(struct sigaction *bsa, l_sigaction_t *lsa)
{
bsd_to_linux_sigset(&bsa->sa_mask, &lsa->lsa_mask);
#ifdef COMPAT_LINUX32
lsa->lsa_handler = (uintptr_t)bsa->sa_handler;
#else
lsa->lsa_handler = bsa->sa_handler;
#endif
lsa->lsa_restorer = 0; /* unsupported */
lsa->lsa_flags = 0;
if (bsa->sa_flags & SA_NOCLDSTOP)
lsa->lsa_flags |= LINUX_SA_NOCLDSTOP;
if (bsa->sa_flags & SA_NOCLDWAIT)
lsa->lsa_flags |= LINUX_SA_NOCLDWAIT;
if (bsa->sa_flags & SA_SIGINFO)
lsa->lsa_flags |= LINUX_SA_SIGINFO;
if (bsa->sa_flags & SA_ONSTACK)
lsa->lsa_flags |= LINUX_SA_ONSTACK;
if (bsa->sa_flags & SA_RESTART)
lsa->lsa_flags |= LINUX_SA_RESTART;
if (bsa->sa_flags & SA_RESETHAND)
lsa->lsa_flags |= LINUX_SA_ONESHOT;
if (bsa->sa_flags & SA_NODEFER)
lsa->lsa_flags |= LINUX_SA_NOMASK;
}
int
linux_do_sigaction(struct thread *td, int linux_sig, l_sigaction_t *linux_nsa,
l_sigaction_t *linux_osa)
{
struct sigaction act, oact, *nsa, *osa;
int error, sig;
if (!LINUX_SIG_VALID(linux_sig))
return (EINVAL);
osa = (linux_osa != NULL) ? &oact : NULL;
if (linux_nsa != NULL) {
nsa = &act;
linux_to_bsd_sigaction(linux_nsa, nsa);
} else
nsa = NULL;
if (linux_sig <= LINUX_SIGTBLSZ)
sig = linux_to_bsd_signal[_SIG_IDX(linux_sig)];
else
sig = linux_sig;
error = kern_sigaction(td, sig, nsa, osa, 0);
if (error)
return (error);
if (linux_osa != NULL)
bsd_to_linux_sigaction(osa, linux_osa);
return (0);
}
int
linux_signal(struct thread *td, struct linux_signal_args *args)
{
l_sigaction_t nsa, osa;
int error;
#ifdef DEBUG
if (ldebug(signal))
printf(ARGS(signal, "%d, %p"),
args->sig, (void *)(uintptr_t)args->handler);
#endif
nsa.lsa_handler = args->handler;
nsa.lsa_flags = LINUX_SA_ONESHOT | LINUX_SA_NOMASK;
LINUX_SIGEMPTYSET(nsa.lsa_mask);
error = linux_do_sigaction(td, args->sig, &nsa, &osa);
td->td_retval[0] = (int)(intptr_t)osa.lsa_handler;
return (error);
}
int
linux_rt_sigaction(struct thread *td, struct linux_rt_sigaction_args *args)
{
l_sigaction_t nsa, osa;
int error;
#ifdef DEBUG
if (ldebug(rt_sigaction))
printf(ARGS(rt_sigaction, "%ld, %p, %p, %ld"),
(long)args->sig, (void *)args->act,
(void *)args->oact, (long)args->sigsetsize);
#endif
if (args->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
if (args->act != NULL) {
error = copyin(args->act, &nsa, sizeof(l_sigaction_t));
if (error)
return (error);
}
error = linux_do_sigaction(td, args->sig,
args->act ? &nsa : NULL,
args->oact ? &osa : NULL);
if (args->oact != NULL && !error) {
error = copyout(&osa, args->oact, sizeof(l_sigaction_t));
}
return (error);
}
static int
linux_do_sigprocmask(struct thread *td, int how, l_sigset_t *new,
l_sigset_t *old)
{
sigset_t omask, nmask;
sigset_t *nmaskp;
int error;
td->td_retval[0] = 0;
switch (how) {
case LINUX_SIG_BLOCK:
how = SIG_BLOCK;
break;
case LINUX_SIG_UNBLOCK:
how = SIG_UNBLOCK;
break;
case LINUX_SIG_SETMASK:
how = SIG_SETMASK;
break;
default:
return (EINVAL);
}
if (new != NULL) {
linux_to_bsd_sigset(new, &nmask);
nmaskp = &nmask;
} else
nmaskp = NULL;
error = kern_sigprocmask(td, how, nmaskp, &omask, 0);
if (error == 0 && old != NULL)
bsd_to_linux_sigset(&omask, old);
return (error);
}
int
linux_sigprocmask(struct thread *td, struct linux_sigprocmask_args *args)
{
l_osigset_t mask;
l_sigset_t set, oset;
int error;
#ifdef DEBUG
if (ldebug(sigprocmask))
printf(ARGS(sigprocmask, "%d, *, *"), args->how);
#endif
if (args->mask != NULL) {
error = copyin(args->mask, &mask, sizeof(l_osigset_t));
if (error)
return (error);
LINUX_SIGEMPTYSET(set);
set.__bits[0] = mask;
}
error = linux_do_sigprocmask(td, args->how,
args->mask ? &set : NULL,
args->omask ? &oset : NULL);
if (args->omask != NULL && !error) {
mask = oset.__bits[0];
error = copyout(&mask, args->omask, sizeof(l_osigset_t));
}
return (error);
}
int
linux_rt_sigprocmask(struct thread *td, struct linux_rt_sigprocmask_args *args)
{
l_sigset_t set, oset;
int error;
#ifdef DEBUG
if (ldebug(rt_sigprocmask))
printf(ARGS(rt_sigprocmask, "%d, %p, %p, %ld"),
args->how, (void *)args->mask,
(void *)args->omask, (long)args->sigsetsize);
#endif
if (args->sigsetsize != sizeof(l_sigset_t))
return EINVAL;
if (args->mask != NULL) {
error = copyin(args->mask, &set, sizeof(l_sigset_t));
if (error)
return (error);
}
error = linux_do_sigprocmask(td, args->how,
args->mask ? &set : NULL,
args->omask ? &oset : NULL);
if (args->omask != NULL && !error) {
error = copyout(&oset, args->omask, sizeof(l_sigset_t));
}
return (error);
}
int
linux_sgetmask(struct thread *td, struct linux_sgetmask_args *args)
{
struct proc *p = td->td_proc;
l_sigset_t mask;
#ifdef DEBUG
if (ldebug(sgetmask))
printf(ARGS(sgetmask, ""));
#endif
PROC_LOCK(p);
bsd_to_linux_sigset(&td->td_sigmask, &mask);
PROC_UNLOCK(p);
td->td_retval[0] = mask.__bits[0];
return (0);
}
int
linux_ssetmask(struct thread *td, struct linux_ssetmask_args *args)
{
struct proc *p = td->td_proc;
l_sigset_t lset;
sigset_t bset;
#ifdef DEBUG
if (ldebug(ssetmask))
printf(ARGS(ssetmask, "%08lx"), (unsigned long)args->mask);
#endif
PROC_LOCK(p);
bsd_to_linux_sigset(&td->td_sigmask, &lset);
td->td_retval[0] = lset.__bits[0];
LINUX_SIGEMPTYSET(lset);
lset.__bits[0] = args->mask;
linux_to_bsd_sigset(&lset, &bset);
td->td_sigmask = bset;
SIG_CANTMASK(td->td_sigmask);
signotify(td);
PROC_UNLOCK(p);
return (0);
}
/*
* MPSAFE
*/
int
linux_sigpending(struct thread *td, struct linux_sigpending_args *args)
{
struct proc *p = td->td_proc;
sigset_t bset;
l_sigset_t lset;
l_osigset_t mask;
#ifdef DEBUG
if (ldebug(sigpending))
printf(ARGS(sigpending, "*"));
#endif
PROC_LOCK(p);
bset = p->p_siglist;
SIGSETOR(bset, td->td_siglist);
SIGSETAND(bset, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_linux_sigset(&bset, &lset);
mask = lset.__bits[0];
return (copyout(&mask, args->mask, sizeof(mask)));
}
/*
* MPSAFE
*/
int
linux_rt_sigpending(struct thread *td, struct linux_rt_sigpending_args *args)
{
struct proc *p = td->td_proc;
sigset_t bset;
l_sigset_t lset;
if (args->sigsetsize > sizeof(lset))
return EINVAL;
/* NOT REACHED */
#ifdef DEBUG
if (ldebug(rt_sigpending))
printf(ARGS(rt_sigpending, "*"));
#endif
PROC_LOCK(p);
bset = p->p_siglist;
SIGSETOR(bset, td->td_siglist);
SIGSETAND(bset, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_linux_sigset(&bset, &lset);
return (copyout(&lset, args->set, args->sigsetsize));
}
/*
* MPSAFE
*/
int
linux_rt_sigtimedwait(struct thread *td,
struct linux_rt_sigtimedwait_args *args)
{
int error, sig;
l_timeval ltv;
struct timeval tv;
struct timespec ts, *tsa;
l_sigset_t lset;
sigset_t bset;
l_siginfo_t linfo;
ksiginfo_t info;
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(ARGS(rt_sigtimedwait, "*"));
#endif
if (args->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
if ((error = copyin(args->mask, &lset, sizeof(lset))))
return (error);
linux_to_bsd_sigset(&lset, &bset);
tsa = NULL;
if (args->timeout) {
if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
return (error);
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(LMSG("linux_rt_sigtimedwait: "
"incoming timeout (%d/%d)\n"),
ltv.tv_sec, ltv.tv_usec);
#endif
tv.tv_sec = (long)ltv.tv_sec;
tv.tv_usec = (suseconds_t)ltv.tv_usec;
if (itimerfix(&tv)) {
/*
* The timeout was invalid. Convert it to something
* valid that will act as it does under Linux.
*/
tv.tv_sec += tv.tv_usec / 1000000;
tv.tv_usec %= 1000000;
if (tv.tv_usec < 0) {
tv.tv_sec -= 1;
tv.tv_usec += 1000000;
}
if (tv.tv_sec < 0)
timevalclear(&tv);
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(LMSG("linux_rt_sigtimedwait: "
"converted timeout (%jd/%ld)\n"),
(intmax_t)tv.tv_sec, tv.tv_usec);
#endif
}
TIMEVAL_TO_TIMESPEC(&tv, &ts);
tsa = &ts;
}
error = kern_sigtimedwait(td, bset, &info, tsa);
#ifdef DEBUG
if (ldebug(rt_sigtimedwait))
printf(LMSG("linux_rt_sigtimedwait: "
"sigtimedwait returning (%d)\n"), error);
#endif
if (error)
return (error);
sig = BSD_TO_LINUX_SIGNAL(info.ksi_signo);
if (args->ptr) {
memset(&linfo, 0, sizeof(linfo));
ksiginfo_to_lsiginfo(&info, &linfo, sig);
error = copyout(&linfo, args->ptr, sizeof(linfo));
}
if (error == 0)
td->td_retval[0] = sig;
return (error);
}
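The invalid-timeout branch above is a plain normalization of the timeval; a small stand-alone sketch of the same arithmetic (illustrative only, not part of this change):

#include <stdio.h>

/* Same normalization as the invalid-timeout branch above: fold excess
 * microseconds into seconds and clamp negative results to zero. */
static void
normalize(long *sec, long *usec)
{
	*sec += *usec / 1000000;
	*usec %= 1000000;
	if (*usec < 0) {
		*sec -= 1;
		*usec += 1000000;
	}
	if (*sec < 0) {
		*sec = 0;
		*usec = 0;
	}
}

int
main(void)
{
	long sec = 1, usec = 2500000;	/* invalid: usec >= 1000000 */

	normalize(&sec, &usec);
	printf("%ld.%06ld\n", sec, usec);	/* prints 3.500000 */
	return (0);
}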
int
linux_kill(struct thread *td, struct linux_kill_args *args)
{
struct kill_args /* {
int pid;
int signum;
} */ tmp;
#ifdef DEBUG
if (ldebug(kill))
printf(ARGS(kill, "%d, %d"), args->pid, args->signum);
#endif
/*
* Allow signal 0 as a means to check for privileges
*/
if (!LINUX_SIG_VALID(args->signum) && args->signum != 0)
return (EINVAL);
if (args->signum > 0 && args->signum <= LINUX_SIGTBLSZ)
tmp.signum = linux_to_bsd_signal[_SIG_IDX(args->signum)];
else
tmp.signum = args->signum;
tmp.pid = args->pid;
- return (kill(td, &tmp));
+ return (sys_kill(td, &tmp));
}
static int
linux_do_tkill(struct thread *td, l_int tgid, l_int pid, l_int signum)
{
struct proc *proc = td->td_proc;
struct linux_emuldata *em;
struct proc *p;
ksiginfo_t ksi;
int error;
AUDIT_ARG_SIGNUM(signum);
AUDIT_ARG_PID(pid);
/*
* Allow signal 0 as a means to check for privileges
*/
if (!LINUX_SIG_VALID(signum) && signum != 0)
return (EINVAL);
if (signum > 0 && signum <= LINUX_SIGTBLSZ)
signum = linux_to_bsd_signal[_SIG_IDX(signum)];
if ((p = pfind(pid)) == NULL) {
if ((p = zpfind(pid)) == NULL)
return (ESRCH);
}
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, signum);
if (error != 0 || signum == 0)
goto out;
error = ESRCH;
em = em_find(p, EMUL_DONTLOCK);
if (em == NULL) {
#ifdef DEBUG
printf("emuldata not found in do_tkill.\n");
#endif
goto out;
}
if (tgid > 0 && em->shared->group_pid != tgid)
goto out;
ksiginfo_init(&ksi);
ksi.ksi_signo = signum;
ksi.ksi_code = LINUX_SI_TKILL;
ksi.ksi_errno = 0;
ksi.ksi_pid = proc->p_pid;
ksi.ksi_uid = proc->p_ucred->cr_ruid;
error = pksignal(p, ksi.ksi_signo, &ksi);
out:
PROC_UNLOCK(p);
return (error);
}
int
linux_tgkill(struct thread *td, struct linux_tgkill_args *args)
{
#ifdef DEBUG
if (ldebug(tgkill))
printf(ARGS(tgkill, "%d, %d, %d"), args->tgid, args->pid, args->sig);
#endif
if (args->pid <= 0 || args->tgid <= 0)
return (EINVAL);
return (linux_do_tkill(td, args->tgid, args->pid, args->sig));
}
int
linux_tkill(struct thread *td, struct linux_tkill_args *args)
{
#ifdef DEBUG
if (ldebug(tkill))
printf(ARGS(tkill, "%i, %i"), args->tid, args->sig);
#endif
if (args->tid <= 0)
return (EINVAL);
return (linux_do_tkill(td, 0, args->tid, args->sig));
}
void
ksiginfo_to_lsiginfo(ksiginfo_t *ksi, l_siginfo_t *lsi, l_int sig)
{
lsi->lsi_signo = sig;
lsi->lsi_code = ksi->ksi_code;
switch (sig) {
case LINUX_SIGPOLL:
/* XXX si_fd? */
lsi->lsi_band = ksi->ksi_band;
break;
case LINUX_SIGCHLD:
lsi->lsi_pid = ksi->ksi_pid;
lsi->lsi_uid = ksi->ksi_uid;
lsi->lsi_status = ksi->ksi_status;
break;
case LINUX_SIGBUS:
case LINUX_SIGILL:
case LINUX_SIGFPE:
case LINUX_SIGSEGV:
lsi->lsi_addr = PTROUT(ksi->ksi_addr);
break;
default:
/* XXX SI_TIMER etc... */
lsi->lsi_pid = ksi->ksi_pid;
lsi->lsi_uid = ksi->ksi_uid;
break;
}
if (sig >= LINUX_SIGRTMIN) {
lsi->lsi_int = ksi->ksi_info.si_value.sival_int;
lsi->lsi_ptr = PTROUT(ksi->ksi_info.si_value.sival_ptr);
}
}
Index: head/sys/compat/linux/linux_socket.c
===================================================================
--- head/sys/compat/linux/linux_socket.c (revision 225616)
+++ head/sys/compat/linux/linux_socket.c (revision 225617)
@@ -1,1685 +1,1685 @@
/*-
* Copyright (c) 1995 Søren Schmidt
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/* XXX we use functions that might not exist. */
#include "opt_compat.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscallsubr.h>
#include <sys/uio.h>
#include <sys/syslog.h>
#include <sys/un.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#endif
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_socket.h>
#include <compat/linux/linux_util.h>
static int do_sa_get(struct sockaddr **, const struct osockaddr *, int *,
struct malloc_type *);
static int linux_to_bsd_domain(int);
/*
* Reads a linux sockaddr and does any necessary translation.
* Linux sockaddrs don't have a length field, only a family.
*/
static int
linux_getsockaddr(struct sockaddr **sap, const struct osockaddr *osa, int len)
{
int osalen = len;
return (do_sa_get(sap, osa, &osalen, M_SONAME));
}
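A hedged illustration of the layout difference do_sa_get() below has to bridge: a Linux sockaddr starts with a 16-bit family and carries no length byte, whereas the native sockaddr starts with an 8-bit sa_len followed by an 8-bit sa_family. The struct names in this sketch are made up for illustration and are not part of this change.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct example_linux_sockaddr {		/* hypothetical name, for illustration */
	uint16_t sa_family;		/* 16-bit family, no length field */
	char	 sa_data[14];
};

struct example_bsd_sockaddr {		/* hypothetical name, for illustration */
	uint8_t	 sa_len;		/* length of the whole structure */
	uint8_t	 sa_family;		/* 8-bit family */
	char	 sa_data[14];
};

int
main(void)
{
	struct example_linux_sockaddr losa;
	struct example_bsd_sockaddr bsa;

	memset(&losa, 0, sizeof(losa));
	losa.sa_family = 2;			/* AF_INET on Linux */

	/* Rewrite the first two bytes, record the length, keep sa_data. */
	memset(&bsa, 0, sizeof(bsa));
	memcpy(bsa.sa_data, losa.sa_data, sizeof(bsa.sa_data));
	bsa.sa_family = (uint8_t)losa.sa_family;
	bsa.sa_len = sizeof(bsa);

	printf("family %u, len %u\n", bsa.sa_family, bsa.sa_len);
	return (0);
}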
/*
* Copy the osockaddr structure pointed to by osa to kernel, adjust
* family and convert to sockaddr.
*/
static int
do_sa_get(struct sockaddr **sap, const struct osockaddr *osa, int *osalen,
struct malloc_type *mtype)
{
int error=0, bdom;
struct sockaddr *sa;
struct osockaddr *kosa;
int alloclen;
#ifdef INET6
int oldv6size;
struct sockaddr_in6 *sin6;
#endif
if (*osalen < 2 || *osalen > UCHAR_MAX || !osa)
return (EINVAL);
alloclen = *osalen;
#ifdef INET6
oldv6size = 0;
/*
* Check for old (pre-RFC2553) sockaddr_in6. We may accept it
* if it's a v4-mapped address, so reserve the proper space
* for it.
*/
if (alloclen == sizeof (struct sockaddr_in6) - sizeof (u_int32_t)) {
alloclen = sizeof (struct sockaddr_in6);
oldv6size = 1;
}
#endif
kosa = malloc(alloclen, mtype, M_WAITOK);
if ((error = copyin(osa, kosa, *osalen)))
goto out;
bdom = linux_to_bsd_domain(kosa->sa_family);
if (bdom == -1) {
error = EAFNOSUPPORT;
goto out;
}
#ifdef INET6
/*
* Older Linux IPv6 code uses obsolete RFC2133 struct sockaddr_in6,
* which lacks the scope id that the RFC2553 one has. If we detect
* the situation, reject the address and write a message to the system log.
*
* Still accept addresses for which the scope id is not used.
*/
if (oldv6size && bdom == AF_INET6) {
sin6 = (struct sockaddr_in6 *)kosa;
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ||
(!IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr) &&
!IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr) &&
!IN6_IS_ADDR_V4COMPAT(&sin6->sin6_addr) &&
!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) &&
!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))) {
sin6->sin6_scope_id = 0;
} else {
log(LOG_DEBUG,
"obsolete pre-RFC2553 sockaddr_in6 rejected\n");
error = EINVAL;
goto out;
}
} else
#endif
if (bdom == AF_INET) {
alloclen = sizeof(struct sockaddr_in);
if (*osalen < alloclen) {
error = EINVAL;
goto out;
}
}
sa = (struct sockaddr *) kosa;
sa->sa_family = bdom;
sa->sa_len = alloclen;
*sap = sa;
*osalen = alloclen;
return (0);
out:
free(kosa, mtype);
return (error);
}
static int
linux_to_bsd_domain(int domain)
{
switch (domain) {
case LINUX_AF_UNSPEC:
return (AF_UNSPEC);
case LINUX_AF_UNIX:
return (AF_LOCAL);
case LINUX_AF_INET:
return (AF_INET);
case LINUX_AF_INET6:
return (AF_INET6);
case LINUX_AF_AX25:
return (AF_CCITT);
case LINUX_AF_IPX:
return (AF_IPX);
case LINUX_AF_APPLETALK:
return (AF_APPLETALK);
}
return (-1);
}
static int
bsd_to_linux_domain(int domain)
{
switch (domain) {
case AF_UNSPEC:
return (LINUX_AF_UNSPEC);
case AF_LOCAL:
return (LINUX_AF_UNIX);
case AF_INET:
return (LINUX_AF_INET);
case AF_INET6:
return (LINUX_AF_INET6);
case AF_CCITT:
return (LINUX_AF_AX25);
case AF_IPX:
return (LINUX_AF_IPX);
case AF_APPLETALK:
return (LINUX_AF_APPLETALK);
}
return (-1);
}
static int
linux_to_bsd_sockopt_level(int level)
{
switch (level) {
case LINUX_SOL_SOCKET:
return (SOL_SOCKET);
}
return (level);
}
static int
bsd_to_linux_sockopt_level(int level)
{
switch (level) {
case SOL_SOCKET:
return (LINUX_SOL_SOCKET);
}
return (level);
}
static int
linux_to_bsd_ip_sockopt(int opt)
{
switch (opt) {
case LINUX_IP_TOS:
return (IP_TOS);
case LINUX_IP_TTL:
return (IP_TTL);
case LINUX_IP_OPTIONS:
return (IP_OPTIONS);
case LINUX_IP_MULTICAST_IF:
return (IP_MULTICAST_IF);
case LINUX_IP_MULTICAST_TTL:
return (IP_MULTICAST_TTL);
case LINUX_IP_MULTICAST_LOOP:
return (IP_MULTICAST_LOOP);
case LINUX_IP_ADD_MEMBERSHIP:
return (IP_ADD_MEMBERSHIP);
case LINUX_IP_DROP_MEMBERSHIP:
return (IP_DROP_MEMBERSHIP);
case LINUX_IP_HDRINCL:
return (IP_HDRINCL);
}
return (-1);
}
static int
linux_to_bsd_so_sockopt(int opt)
{
switch (opt) {
case LINUX_SO_DEBUG:
return (SO_DEBUG);
case LINUX_SO_REUSEADDR:
return (SO_REUSEADDR);
case LINUX_SO_TYPE:
return (SO_TYPE);
case LINUX_SO_ERROR:
return (SO_ERROR);
case LINUX_SO_DONTROUTE:
return (SO_DONTROUTE);
case LINUX_SO_BROADCAST:
return (SO_BROADCAST);
case LINUX_SO_SNDBUF:
return (SO_SNDBUF);
case LINUX_SO_RCVBUF:
return (SO_RCVBUF);
case LINUX_SO_KEEPALIVE:
return (SO_KEEPALIVE);
case LINUX_SO_OOBINLINE:
return (SO_OOBINLINE);
case LINUX_SO_LINGER:
return (SO_LINGER);
case LINUX_SO_PEERCRED:
return (LOCAL_PEERCRED);
case LINUX_SO_RCVLOWAT:
return (SO_RCVLOWAT);
case LINUX_SO_SNDLOWAT:
return (SO_SNDLOWAT);
case LINUX_SO_RCVTIMEO:
return (SO_RCVTIMEO);
case LINUX_SO_SNDTIMEO:
return (SO_SNDTIMEO);
case LINUX_SO_TIMESTAMP:
return (SO_TIMESTAMP);
case LINUX_SO_ACCEPTCONN:
return (SO_ACCEPTCONN);
}
return (-1);
}
static int
linux_to_bsd_msg_flags(int flags)
{
int ret_flags = 0;
if (flags & LINUX_MSG_OOB)
ret_flags |= MSG_OOB;
if (flags & LINUX_MSG_PEEK)
ret_flags |= MSG_PEEK;
if (flags & LINUX_MSG_DONTROUTE)
ret_flags |= MSG_DONTROUTE;
if (flags & LINUX_MSG_CTRUNC)
ret_flags |= MSG_CTRUNC;
if (flags & LINUX_MSG_TRUNC)
ret_flags |= MSG_TRUNC;
if (flags & LINUX_MSG_DONTWAIT)
ret_flags |= MSG_DONTWAIT;
if (flags & LINUX_MSG_EOR)
ret_flags |= MSG_EOR;
if (flags & LINUX_MSG_WAITALL)
ret_flags |= MSG_WAITALL;
if (flags & LINUX_MSG_NOSIGNAL)
ret_flags |= MSG_NOSIGNAL;
#if 0 /* not handled */
if (flags & LINUX_MSG_PROXY)
;
if (flags & LINUX_MSG_FIN)
;
if (flags & LINUX_MSG_SYN)
;
if (flags & LINUX_MSG_CONFIRM)
;
if (flags & LINUX_MSG_RST)
;
if (flags & LINUX_MSG_ERRQUEUE)
;
#endif
return ret_flags;
}
/*
* If bsd_to_linux_sockaddr() or linux_to_bsd_sockaddr() faults, then the
* native syscall will fault. Thus, we don't really need to check the
* return values for these functions.
*/
static int
bsd_to_linux_sockaddr(struct sockaddr *arg)
{
struct sockaddr sa;
size_t sa_len = sizeof(struct sockaddr);
int error;
if ((error = copyin(arg, &sa, sa_len)))
return (error);
*(u_short *)&sa = sa.sa_family;
error = copyout(&sa, arg, sa_len);
return (error);
}
static int
linux_to_bsd_sockaddr(struct sockaddr *arg, int len)
{
struct sockaddr sa;
size_t sa_len = sizeof(struct sockaddr);
int error;
if ((error = copyin(arg, &sa, sa_len)))
return (error);
sa.sa_family = *(sa_family_t *)&sa;
sa.sa_len = len;
error = copyout(&sa, arg, sa_len);
return (error);
}
static int
linux_sa_put(struct osockaddr *osa)
{
struct osockaddr sa;
int error, bdom;
/*
* Only read/write the osockaddr family part, the rest is
* not changed.
*/
error = copyin(osa, &sa, sizeof(sa.sa_family));
if (error)
return (error);
bdom = bsd_to_linux_domain(sa.sa_family);
if (bdom == -1)
return (EINVAL);
sa.sa_family = bdom;
error = copyout(&sa, osa, sizeof(sa.sa_family));
if (error)
return (error);
return (0);
}
static int
linux_to_bsd_cmsg_type(int cmsg_type)
{
switch (cmsg_type) {
case LINUX_SCM_RIGHTS:
return (SCM_RIGHTS);
case LINUX_SCM_CREDENTIALS:
return (SCM_CREDS);
}
return (-1);
}
static int
bsd_to_linux_cmsg_type(int cmsg_type)
{
switch (cmsg_type) {
case SCM_RIGHTS:
return (LINUX_SCM_RIGHTS);
case SCM_CREDS:
return (LINUX_SCM_CREDENTIALS);
}
return (-1);
}
static int
linux_to_bsd_msghdr(struct msghdr *bhdr, const struct l_msghdr *lhdr)
{
if (lhdr->msg_controllen > INT_MAX)
return (ENOBUFS);
bhdr->msg_name = PTRIN(lhdr->msg_name);
bhdr->msg_namelen = lhdr->msg_namelen;
bhdr->msg_iov = PTRIN(lhdr->msg_iov);
bhdr->msg_iovlen = lhdr->msg_iovlen;
bhdr->msg_control = PTRIN(lhdr->msg_control);
/*
* msg_controllen is skipped since BSD and LINUX control messages
* are potentially different sizes (e.g. the cred structure used
* by SCM_CREDS differs between the two operating systems).
*
* The caller can set it (if necessary) after converting all the
* control messages.
*/
bhdr->msg_flags = linux_to_bsd_msg_flags(lhdr->msg_flags);
return (0);
}
static int
bsd_to_linux_msghdr(const struct msghdr *bhdr, struct l_msghdr *lhdr)
{
lhdr->msg_name = PTROUT(bhdr->msg_name);
lhdr->msg_namelen = bhdr->msg_namelen;
lhdr->msg_iov = PTROUT(bhdr->msg_iov);
lhdr->msg_iovlen = bhdr->msg_iovlen;
lhdr->msg_control = PTROUT(bhdr->msg_control);
/*
* msg_controllen is skipped since BSD and LINUX control messages
* are potentially different sizes (e.g. the cred structure used
* by SCM_CREDS differs between the two operating systems).
*
* The caller can set it (if necessary) after converting all the
* control messages.
*/
/* msg_flags skipped */
return (0);
}
static int
linux_set_socket_flags(struct thread *td, int s, int flags)
{
int error;
if (flags & LINUX_SOCK_NONBLOCK) {
error = kern_fcntl(td, s, F_SETFL, O_NONBLOCK);
if (error)
return (error);
}
if (flags & LINUX_SOCK_CLOEXEC) {
error = kern_fcntl(td, s, F_SETFD, FD_CLOEXEC);
if (error)
return (error);
}
return (0);
}
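For reference, the flags this helper emulates are the ones a Linux program ORs into the socket type; a hedged Linux-side sketch (illustrative only, not part of this change):

#include <sys/socket.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* On Linux the kernel applies both flags atomically at creation;
	 * the compat code above recreates the effect with F_SETFL/F_SETFD. */
	int s = socket(AF_INET, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);

	if (s == -1) {
		perror("socket");
		return (1);
	}
	close(s);
	return (0);
}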
static int
linux_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
struct mbuf *control, enum uio_seg segflg)
{
struct sockaddr *to;
int error;
if (mp->msg_name != NULL) {
error = linux_getsockaddr(&to, mp->msg_name, mp->msg_namelen);
if (error)
return (error);
mp->msg_name = to;
} else
to = NULL;
error = kern_sendit(td, s, mp, linux_to_bsd_msg_flags(flags), control,
segflg);
if (to)
free(to, M_SONAME);
return (error);
}
/* Return 0 if IP_HDRINCL is set for the given socket. */
static int
linux_check_hdrincl(struct thread *td, int s)
{
int error, optval, size_val;
size_val = sizeof(optval);
error = kern_getsockopt(td, s, IPPROTO_IP, IP_HDRINCL,
&optval, UIO_SYSSPACE, &size_val);
if (error)
return (error);
return (optval == 0);
}
struct linux_sendto_args {
int s;
l_uintptr_t msg;
int len;
int flags;
l_uintptr_t to;
int tolen;
};
/*
* Updated sendto() when IP_HDRINCL is set:
* tweak endian-dependent fields in the IP packet.
*/
static int
linux_sendto_hdrincl(struct thread *td, struct linux_sendto_args *linux_args)
{
/*
* linux_ip_copysize defines how many bytes we should copy
* from the beginning of the IP packet before we customize it for BSD.
* It should include all the fields we modify (ip_len and ip_off).
*/
#define linux_ip_copysize 8
struct ip *packet;
struct msghdr msg;
struct iovec aiov[1];
int error;
/* Check that the packet isn't too big or too small. */
if (linux_args->len < linux_ip_copysize ||
linux_args->len > IP_MAXPACKET)
return (EINVAL);
packet = (struct ip *)malloc(linux_args->len, M_TEMP, M_WAITOK);
/* Make kernel copy of the packet to be sent */
if ((error = copyin(PTRIN(linux_args->msg), packet,
linux_args->len)))
goto goout;
/* Convert fields from Linux to BSD raw IP socket format */
packet->ip_len = linux_args->len;
packet->ip_off = ntohs(packet->ip_off);
/* Prepare the msghdr and iovec structures describing the new packet */
msg.msg_name = PTRIN(linux_args->to);
msg.msg_namelen = linux_args->tolen;
msg.msg_iov = aiov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_flags = 0;
aiov[0].iov_base = (char *)packet;
aiov[0].iov_len = linux_args->len;
error = linux_sendit(td, linux_args->s, &msg, linux_args->flags,
NULL, UIO_SYSSPACE);
goout:
free(packet, M_TEMP);
return (error);
}
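A hedged note on the two field fix-ups above: a Linux raw-IP sender fills ip_len and ip_off in network byte order, while the FreeBSD raw-IP output path of this era expects them in host byte order, hence the ntohs(). A minimal stand-alone sketch of that byte-order adjustment (illustrative only, not part of this change):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t linux_ip_off = htons(0x4000);		/* IP_DF as a Linux app stores it */
	uint16_t bsd_ip_off = ntohs(linux_ip_off);	/* what this kernel expects */

	printf("wire 0x%04x -> host 0x%04x\n", linux_ip_off, bsd_ip_off);
	return (0);
}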
struct linux_socket_args {
int domain;
int type;
int protocol;
};
static int
linux_socket(struct thread *td, struct linux_socket_args *args)
{
struct socket_args /* {
int domain;
int type;
int protocol;
} */ bsd_args;
int retval_socket, socket_flags;
bsd_args.protocol = args->protocol;
socket_flags = args->type & ~LINUX_SOCK_TYPE_MASK;
if (socket_flags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK))
return (EINVAL);
bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK;
if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX)
return (EINVAL);
bsd_args.domain = linux_to_bsd_domain(args->domain);
if (bsd_args.domain == -1)
return (EAFNOSUPPORT);
- retval_socket = socket(td, &bsd_args);
+ retval_socket = sys_socket(td, &bsd_args);
if (retval_socket)
return (retval_socket);
retval_socket = linux_set_socket_flags(td, td->td_retval[0],
socket_flags);
if (retval_socket) {
(void)kern_close(td, td->td_retval[0]);
goto out;
}
if (bsd_args.type == SOCK_RAW
&& (bsd_args.protocol == IPPROTO_RAW || bsd_args.protocol == 0)
&& bsd_args.domain == PF_INET) {
/* It's a raw IP socket: set the IP_HDRINCL option. */
int hdrincl;
hdrincl = 1;
/* We ignore any error returned by kern_setsockopt() */
kern_setsockopt(td, td->td_retval[0], IPPROTO_IP, IP_HDRINCL,
&hdrincl, UIO_SYSSPACE, sizeof(hdrincl));
}
#ifdef INET6
/*
* A Linux AF_INET6 socket has the IPV6_V6ONLY option set to 0 by default
* and some apps depend on this, so set V6ONLY to 0 for Linux apps.
* For simplicity we do this regardless of the net.inet6.ip6.v6only
* sysctl value.
*/
if (bsd_args.domain == PF_INET6) {
int v6only;
v6only = 0;
/* We ignore any error returned by setsockopt() */
kern_setsockopt(td, td->td_retval[0], IPPROTO_IPV6, IPV6_V6ONLY,
&v6only, UIO_SYSSPACE, sizeof(v6only));
}
#endif
out:
return (retval_socket);
}
struct linux_bind_args {
int s;
l_uintptr_t name;
int namelen;
};
static int
linux_bind(struct thread *td, struct linux_bind_args *args)
{
struct sockaddr *sa;
int error;
error = linux_getsockaddr(&sa, PTRIN(args->name),
args->namelen);
if (error)
return (error);
error = kern_bind(td, args->s, sa);
free(sa, M_SONAME);
if (error == EADDRNOTAVAIL && args->namelen != sizeof(struct sockaddr_in))
return (EINVAL);
return (error);
}
struct linux_connect_args {
int s;
l_uintptr_t name;
int namelen;
};
int linux_connect(struct thread *, struct linux_connect_args *);
int
linux_connect(struct thread *td, struct linux_connect_args *args)
{
struct socket *so;
struct sockaddr *sa;
u_int fflag;
int error;
error = linux_getsockaddr(&sa, (struct osockaddr *)PTRIN(args->name),
args->namelen);
if (error)
return (error);
error = kern_connect(td, args->s, sa);
free(sa, M_SONAME);
if (error != EISCONN)
return (error);
/*
* Linux doesn't return EISCONN the first time it occurs
* on a non-blocking socket. Instead it returns the
* error getsockopt(SOL_SOCKET, SO_ERROR) would return on BSD.
*
* XXXRW: Instead of using fgetsock(), check that it is a
* socket and use the file descriptor reference instead of
* creating a new one.
*/
error = fgetsock(td, args->s, CAP_CONNECT, &so, &fflag);
if (error == 0) {
error = EISCONN;
if (fflag & FNONBLOCK) {
SOCK_LOCK(so);
if (so->so_emuldata == 0)
error = so->so_error;
so->so_emuldata = (void *)1;
SOCK_UNLOCK(so);
}
fputsock(so);
}
return (error);
}
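The EISCONN handling above caters to the usual Linux non-blocking connect idiom: connect(2) fails with EINPROGRESS, the caller waits for writability, then reads the deferred status with SO_ERROR instead of seeing EISCONN. A hedged Linux-side sketch of that pattern (illustrative only; the address is a documentation example, not part of this change):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	int s, err;
	socklen_t len = sizeof(err);

	s = socket(AF_INET, SOCK_STREAM, 0);
	fcntl(s, F_SETFL, O_NONBLOCK);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);
	inet_pton(AF_INET, "192.0.2.1", &sin.sin_addr);	/* example address */

	if (connect(s, (struct sockaddr *)&sin, sizeof(sin)) == -1 &&
	    errno == EINPROGRESS) {
		pfd.fd = s;
		pfd.events = POLLOUT;
		(void)poll(&pfd, 1, 5000);
		/* Linux reports the outcome here rather than via EISCONN. */
		getsockopt(s, SOL_SOCKET, SO_ERROR, &err, &len);
		printf("connect status: %s\n", strerror(err));
	}
	close(s);
	return (0);
}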
struct linux_listen_args {
int s;
int backlog;
};
static int
linux_listen(struct thread *td, struct linux_listen_args *args)
{
struct listen_args /* {
int s;
int backlog;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.backlog = args->backlog;
- return (listen(td, &bsd_args));
+ return (sys_listen(td, &bsd_args));
}
static int
linux_accept_common(struct thread *td, int s, l_uintptr_t addr,
l_uintptr_t namelen, int flags)
{
struct accept_args /* {
int s;
struct sockaddr * __restrict name;
socklen_t * __restrict anamelen;
} */ bsd_args;
int error;
if (flags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK))
return (EINVAL);
bsd_args.s = s;
/* XXX: */
bsd_args.name = (struct sockaddr * __restrict)PTRIN(addr);
bsd_args.anamelen = PTRIN(namelen);/* XXX */
- error = accept(td, &bsd_args);
+ error = sys_accept(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.name);
if (error) {
if (error == EFAULT && namelen != sizeof(struct sockaddr_in))
return (EINVAL);
return (error);
}
/*
* Linux appears not to copy flags from the parent socket to the
* accepted one, so we must clear the flags in the new descriptor
* and apply the requested flags.
*/
error = kern_fcntl(td, td->td_retval[0], F_SETFL, 0);
if (error)
goto out;
error = linux_set_socket_flags(td, td->td_retval[0], flags);
if (error)
goto out;
if (addr)
error = linux_sa_put(PTRIN(addr));
out:
if (error) {
(void)kern_close(td, td->td_retval[0]);
td->td_retval[0] = 0;
}
return (error);
}
struct linux_accept_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_accept(struct thread *td, struct linux_accept_args *args)
{
return (linux_accept_common(td, args->s, args->addr,
args->namelen, 0));
}
struct linux_accept4_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
int flags;
};
static int
linux_accept4(struct thread *td, struct linux_accept4_args *args)
{
return (linux_accept_common(td, args->s, args->addr,
args->namelen, args->flags));
}
struct linux_getsockname_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_getsockname(struct thread *td, struct linux_getsockname_args *args)
{
struct getsockname_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ bsd_args;
int error;
bsd_args.fdes = args->s;
/* XXX: */
bsd_args.asa = (struct sockaddr * __restrict)PTRIN(args->addr);
bsd_args.alen = PTRIN(args->namelen); /* XXX */
- error = getsockname(td, &bsd_args);
+ error = sys_getsockname(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
if (error)
return (error);
error = linux_sa_put(PTRIN(args->addr));
if (error)
return (error);
return (0);
}
struct linux_getpeername_args {
int s;
l_uintptr_t addr;
l_uintptr_t namelen;
};
static int
linux_getpeername(struct thread *td, struct linux_getpeername_args *args)
{
struct getpeername_args /* {
int fdes;
caddr_t asa;
int *alen;
} */ bsd_args;
int error;
bsd_args.fdes = args->s;
bsd_args.asa = (struct sockaddr *)PTRIN(args->addr);
bsd_args.alen = (int *)PTRIN(args->namelen);
- error = getpeername(td, &bsd_args);
+ error = sys_getpeername(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.asa);
if (error)
return (error);
error = linux_sa_put(PTRIN(args->addr));
if (error)
return (error);
return (0);
}
struct linux_socketpair_args {
int domain;
int type;
int protocol;
l_uintptr_t rsv;
};
static int
linux_socketpair(struct thread *td, struct linux_socketpair_args *args)
{
struct socketpair_args /* {
int domain;
int type;
int protocol;
int *rsv;
} */ bsd_args;
int error, socket_flags;
int sv[2];
bsd_args.domain = linux_to_bsd_domain(args->domain);
if (bsd_args.domain != PF_LOCAL)
return (EAFNOSUPPORT);
socket_flags = args->type & ~LINUX_SOCK_TYPE_MASK;
if (socket_flags & ~(LINUX_SOCK_CLOEXEC | LINUX_SOCK_NONBLOCK))
return (EINVAL);
bsd_args.type = args->type & LINUX_SOCK_TYPE_MASK;
if (bsd_args.type < 0 || bsd_args.type > LINUX_SOCK_MAX)
return (EINVAL);
if (args->protocol != 0 && args->protocol != PF_UNIX)
/*
* Using PF_UNIX as the protocol argument is not right,
* but Linux does it.
* Do not map PF_UNIX as its Linux value is identical
* to the FreeBSD one.
*/
return (EPROTONOSUPPORT);
else
bsd_args.protocol = 0;
bsd_args.rsv = (int *)PTRIN(args->rsv);
error = kern_socketpair(td, bsd_args.domain, bsd_args.type,
bsd_args.protocol, sv);
if (error)
return (error);
error = linux_set_socket_flags(td, sv[0], socket_flags);
if (error)
goto out;
error = linux_set_socket_flags(td, sv[1], socket_flags);
if (error)
goto out;
error = copyout(sv, bsd_args.rsv, 2 * sizeof(int));
out:
if (error) {
(void)kern_close(td, sv[0]);
(void)kern_close(td, sv[1]);
}
return (error);
}
struct linux_send_args {
int s;
l_uintptr_t msg;
int len;
int flags;
};
static int
linux_send(struct thread *td, struct linux_send_args *args)
{
struct sendto_args /* {
int s;
caddr_t buf;
int len;
int flags;
caddr_t to;
int tolen;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.buf = (caddr_t)PTRIN(args->msg);
bsd_args.len = args->len;
bsd_args.flags = args->flags;
bsd_args.to = NULL;
bsd_args.tolen = 0;
- return sendto(td, &bsd_args);
+ return sys_sendto(td, &bsd_args);
}
struct linux_recv_args {
int s;
l_uintptr_t msg;
int len;
int flags;
};
static int
linux_recv(struct thread *td, struct linux_recv_args *args)
{
struct recvfrom_args /* {
int s;
caddr_t buf;
int len;
int flags;
struct sockaddr *from;
socklen_t fromlenaddr;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.buf = (caddr_t)PTRIN(args->msg);
bsd_args.len = args->len;
bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
bsd_args.from = NULL;
bsd_args.fromlenaddr = 0;
- return (recvfrom(td, &bsd_args));
+ return (sys_recvfrom(td, &bsd_args));
}
static int
linux_sendto(struct thread *td, struct linux_sendto_args *args)
{
struct msghdr msg;
struct iovec aiov;
int error;
if (linux_check_hdrincl(td, args->s) == 0)
/* IP_HDRINCL set, tweak the packet before sending */
return (linux_sendto_hdrincl(td, args));
msg.msg_name = PTRIN(args->to);
msg.msg_namelen = args->tolen;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = NULL;
msg.msg_flags = 0;
aiov.iov_base = PTRIN(args->msg);
aiov.iov_len = args->len;
error = linux_sendit(td, args->s, &msg, args->flags, NULL,
UIO_USERSPACE);
return (error);
}
struct linux_recvfrom_args {
int s;
l_uintptr_t buf;
int len;
int flags;
l_uintptr_t from;
l_uintptr_t fromlen;
};
static int
linux_recvfrom(struct thread *td, struct linux_recvfrom_args *args)
{
struct recvfrom_args /* {
int s;
caddr_t buf;
size_t len;
int flags;
struct sockaddr * __restrict from;
socklen_t * __restrict fromlenaddr;
} */ bsd_args;
size_t len;
int error;
if ((error = copyin(PTRIN(args->fromlen), &len, sizeof(size_t))))
return (error);
bsd_args.s = args->s;
bsd_args.buf = PTRIN(args->buf);
bsd_args.len = args->len;
bsd_args.flags = linux_to_bsd_msg_flags(args->flags);
/* XXX: */
bsd_args.from = (struct sockaddr * __restrict)PTRIN(args->from);
bsd_args.fromlenaddr = PTRIN(args->fromlen);/* XXX */
linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.from, len);
- error = recvfrom(td, &bsd_args);
+ error = sys_recvfrom(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.from);
if (error)
return (error);
if (args->from) {
error = linux_sa_put((struct osockaddr *)
PTRIN(args->from));
if (error)
return (error);
}
return (0);
}
struct linux_sendmsg_args {
int s;
l_uintptr_t msg;
int flags;
};
static int
linux_sendmsg(struct thread *td, struct linux_sendmsg_args *args)
{
struct cmsghdr *cmsg;
struct cmsgcred cmcred;
struct mbuf *control;
struct msghdr msg;
struct l_cmsghdr linux_cmsg;
struct l_cmsghdr *ptr_cmsg;
struct l_msghdr linux_msg;
struct iovec *iov;
socklen_t datalen;
struct sockaddr *sa;
sa_family_t sa_family;
void *data;
int error;
error = copyin(PTRIN(args->msg), &linux_msg, sizeof(linux_msg));
if (error)
return (error);
/*
* Some Linux applications (ping) define a non-NULL control data
* pointer, but a msg_controllen of 0, which is not allowed in the
* FreeBSD system call interface. NULL the msg_control pointer in
* order to handle this case. This should be checked, but allows the
* Linux ping to work.
*/
if (PTRIN(linux_msg.msg_control) != NULL && linux_msg.msg_controllen == 0)
linux_msg.msg_control = PTROUT(NULL);
error = linux_to_bsd_msghdr(&msg, &linux_msg);
if (error)
return (error);
#ifdef COMPAT_LINUX32
error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen,
&iov, EMSGSIZE);
#else
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
#endif
if (error)
return (error);
control = NULL;
cmsg = NULL;
if ((ptr_cmsg = LINUX_CMSG_FIRSTHDR(&linux_msg)) != NULL) {
error = kern_getsockname(td, args->s, &sa, &datalen);
if (error)
goto bad;
sa_family = sa->sa_family;
free(sa, M_SONAME);
error = ENOBUFS;
cmsg = malloc(CMSG_HDRSZ, M_TEMP, M_WAITOK | M_ZERO);
control = m_get(M_WAIT, MT_CONTROL);
if (control == NULL)
goto bad;
do {
error = copyin(ptr_cmsg, &linux_cmsg,
sizeof(struct l_cmsghdr));
if (error)
goto bad;
error = EINVAL;
if (linux_cmsg.cmsg_len < sizeof(struct l_cmsghdr))
goto bad;
/*
* For now we support only SCM_RIGHTS and SCM_CREDS,
* so return EINVAL for any other cmsg_type
*/
cmsg->cmsg_type =
linux_to_bsd_cmsg_type(linux_cmsg.cmsg_type);
cmsg->cmsg_level =
linux_to_bsd_sockopt_level(linux_cmsg.cmsg_level);
if (cmsg->cmsg_type == -1
|| cmsg->cmsg_level != SOL_SOCKET)
goto bad;
/*
* Some applications (e.g. pulseaudio) attempt to
* send ancillary data even if the underlying protocol
* doesn't support it, which is not allowed in the
* FreeBSD system call interface.
*/
if (sa_family != AF_UNIX)
continue;
data = LINUX_CMSG_DATA(ptr_cmsg);
datalen = linux_cmsg.cmsg_len - L_CMSG_HDRSZ;
switch (cmsg->cmsg_type)
{
case SCM_RIGHTS:
break;
case SCM_CREDS:
data = &cmcred;
datalen = sizeof(cmcred);
/*
* The lower levels will fill in the structure
*/
bzero(data, datalen);
break;
}
cmsg->cmsg_len = CMSG_LEN(datalen);
error = ENOBUFS;
if (!m_append(control, CMSG_HDRSZ, (c_caddr_t) cmsg))
goto bad;
if (!m_append(control, datalen, (c_caddr_t) data))
goto bad;
} while ((ptr_cmsg = LINUX_CMSG_NXTHDR(&linux_msg, ptr_cmsg)));
if (m_length(control, NULL) == 0) {
m_freem(control);
control = NULL;
}
}
msg.msg_iov = iov;
msg.msg_flags = 0;
error = linux_sendit(td, args->s, &msg, args->flags, control,
UIO_USERSPACE);
bad:
free(iov, M_IOV);
if (cmsg)
free(cmsg, M_TEMP);
return (error);
}
struct linux_recvmsg_args {
int s;
l_uintptr_t msg;
int flags;
};
static int
linux_recvmsg(struct thread *td, struct linux_recvmsg_args *args)
{
struct cmsghdr *cm;
struct cmsgcred *cmcred;
struct msghdr msg;
struct l_cmsghdr *linux_cmsg = NULL;
struct l_ucred linux_ucred;
socklen_t datalen, outlen;
struct l_msghdr linux_msg;
struct iovec *iov, *uiov;
struct mbuf *control = NULL;
struct mbuf **controlp;
caddr_t outbuf;
void *data;
int error, i, fd, fds, *fdp;
error = copyin(PTRIN(args->msg), &linux_msg, sizeof(linux_msg));
if (error)
return (error);
error = linux_to_bsd_msghdr(&msg, &linux_msg);
if (error)
return (error);
#ifdef COMPAT_LINUX32
error = linux32_copyiniov(PTRIN(msg.msg_iov), msg.msg_iovlen,
&iov, EMSGSIZE);
#else
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
#endif
if (error)
return (error);
if (msg.msg_name) {
error = linux_to_bsd_sockaddr((struct sockaddr *)msg.msg_name,
msg.msg_namelen);
if (error)
goto bad;
}
uiov = msg.msg_iov;
msg.msg_iov = iov;
controlp = (msg.msg_control != NULL) ? &control : NULL;
error = kern_recvit(td, args->s, &msg, UIO_USERSPACE, controlp);
msg.msg_iov = uiov;
if (error)
goto bad;
error = bsd_to_linux_msghdr(&msg, &linux_msg);
if (error)
goto bad;
if (linux_msg.msg_name) {
error = bsd_to_linux_sockaddr((struct sockaddr *)
PTRIN(linux_msg.msg_name));
if (error)
goto bad;
}
if (linux_msg.msg_name && linux_msg.msg_namelen > 2) {
error = linux_sa_put(PTRIN(linux_msg.msg_name));
if (error)
goto bad;
}
outbuf = PTRIN(linux_msg.msg_control);
outlen = 0;
if (control) {
linux_cmsg = malloc(L_CMSG_HDRSZ, M_TEMP, M_WAITOK | M_ZERO);
msg.msg_control = mtod(control, struct cmsghdr *);
msg.msg_controllen = control->m_len;
cm = CMSG_FIRSTHDR(&msg);
while (cm != NULL) {
linux_cmsg->cmsg_type =
bsd_to_linux_cmsg_type(cm->cmsg_type);
linux_cmsg->cmsg_level =
bsd_to_linux_sockopt_level(cm->cmsg_level);
if (linux_cmsg->cmsg_type == -1
|| cm->cmsg_level != SOL_SOCKET)
{
error = EINVAL;
goto bad;
}
data = CMSG_DATA(cm);
datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
switch (cm->cmsg_type)
{
case SCM_RIGHTS:
if (args->flags & LINUX_MSG_CMSG_CLOEXEC) {
fds = datalen / sizeof(int);
fdp = data;
for (i = 0; i < fds; i++) {
fd = *fdp++;
(void)kern_fcntl(td, fd,
F_SETFD, FD_CLOEXEC);
}
}
break;
case SCM_CREDS:
/*
* Currently LOCAL_CREDS is never in
* effect for Linux, so there is no need to worry
* about sockcred
*/
if (datalen != sizeof (*cmcred)) {
error = EMSGSIZE;
goto bad;
}
cmcred = (struct cmsgcred *)data;
bzero(&linux_ucred, sizeof(linux_ucred));
linux_ucred.pid = cmcred->cmcred_pid;
linux_ucred.uid = cmcred->cmcred_uid;
linux_ucred.gid = cmcred->cmcred_gid;
data = &linux_ucred;
datalen = sizeof(linux_ucred);
break;
}
if (outlen + LINUX_CMSG_LEN(datalen) >
linux_msg.msg_controllen) {
if (outlen == 0) {
error = EMSGSIZE;
goto bad;
} else {
linux_msg.msg_flags |=
LINUX_MSG_CTRUNC;
goto out;
}
}
linux_cmsg->cmsg_len = LINUX_CMSG_LEN(datalen);
error = copyout(linux_cmsg, outbuf, L_CMSG_HDRSZ);
if (error)
goto bad;
outbuf += L_CMSG_HDRSZ;
error = copyout(data, outbuf, datalen);
if (error)
goto bad;
outbuf += LINUX_CMSG_ALIGN(datalen);
outlen += LINUX_CMSG_LEN(datalen);
cm = CMSG_NXTHDR(&msg, cm);
}
}
out:
linux_msg.msg_controllen = outlen;
error = copyout(&linux_msg, PTRIN(args->msg), sizeof(linux_msg));
bad:
free(iov, M_IOV);
if (control != NULL)
m_freem(control);
if (linux_cmsg != NULL)
free(linux_cmsg, M_TEMP);
return (error);
}
struct linux_shutdown_args {
int s;
int how;
};
static int
linux_shutdown(struct thread *td, struct linux_shutdown_args *args)
{
struct shutdown_args /* {
int s;
int how;
} */ bsd_args;
bsd_args.s = args->s;
bsd_args.how = args->how;
- return (shutdown(td, &bsd_args));
+ return (sys_shutdown(td, &bsd_args));
}
struct linux_setsockopt_args {
int s;
int level;
int optname;
l_uintptr_t optval;
int optlen;
};
static int
linux_setsockopt(struct thread *td, struct linux_setsockopt_args *args)
{
struct setsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int valsize;
} */ bsd_args;
l_timeval linux_tv;
struct timeval tv;
int error, name;
bsd_args.s = args->s;
bsd_args.level = linux_to_bsd_sockopt_level(args->level);
switch (bsd_args.level) {
case SOL_SOCKET:
name = linux_to_bsd_so_sockopt(args->optname);
switch (name) {
case SO_RCVTIMEO:
/* FALLTHROUGH */
case SO_SNDTIMEO:
error = copyin(PTRIN(args->optval), &linux_tv,
sizeof(linux_tv));
if (error)
return (error);
tv.tv_sec = linux_tv.tv_sec;
tv.tv_usec = linux_tv.tv_usec;
return (kern_setsockopt(td, args->s, bsd_args.level,
name, &tv, UIO_SYSSPACE, sizeof(tv)));
/* NOTREACHED */
break;
default:
break;
}
break;
case IPPROTO_IP:
name = linux_to_bsd_ip_sockopt(args->optname);
break;
case IPPROTO_TCP:
/* Linux TCP option values match BSD's */
name = args->optname;
break;
default:
name = -1;
break;
}
if (name == -1)
return (ENOPROTOOPT);
bsd_args.name = name;
bsd_args.val = PTRIN(args->optval);
bsd_args.valsize = args->optlen;
if (name == IPV6_NEXTHOP) {
linux_to_bsd_sockaddr((struct sockaddr *)bsd_args.val,
bsd_args.valsize);
- error = setsockopt(td, &bsd_args);
+ error = sys_setsockopt(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
} else
- error = setsockopt(td, &bsd_args);
+ error = sys_setsockopt(td, &bsd_args);
return (error);
}
struct linux_getsockopt_args {
int s;
int level;
int optname;
l_uintptr_t optval;
l_uintptr_t optlen;
};
static int
linux_getsockopt(struct thread *td, struct linux_getsockopt_args *args)
{
struct getsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int *avalsize;
} */ bsd_args;
l_timeval linux_tv;
struct timeval tv;
socklen_t tv_len, xulen;
struct xucred xu;
struct l_ucred lxu;
int error, name;
bsd_args.s = args->s;
bsd_args.level = linux_to_bsd_sockopt_level(args->level);
switch (bsd_args.level) {
case SOL_SOCKET:
name = linux_to_bsd_so_sockopt(args->optname);
switch (name) {
case SO_RCVTIMEO:
/* FALLTHROUGH */
case SO_SNDTIMEO:
tv_len = sizeof(tv);
error = kern_getsockopt(td, args->s, bsd_args.level,
name, &tv, UIO_SYSSPACE, &tv_len);
if (error)
return (error);
linux_tv.tv_sec = tv.tv_sec;
linux_tv.tv_usec = tv.tv_usec;
return (copyout(&linux_tv, PTRIN(args->optval),
sizeof(linux_tv)));
/* NOTREACHED */
break;
case LOCAL_PEERCRED:
if (args->optlen != sizeof(lxu))
return (EINVAL);
xulen = sizeof(xu);
error = kern_getsockopt(td, args->s, bsd_args.level,
name, &xu, UIO_SYSSPACE, &xulen);
if (error)
return (error);
/*
* XXX Use 0 for pid as FreeBSD does not cache the peer pid.
*/
lxu.pid = 0;
lxu.uid = xu.cr_uid;
lxu.gid = xu.cr_gid;
return (copyout(&lxu, PTRIN(args->optval), sizeof(lxu)));
/* NOTREACHED */
break;
default:
break;
}
break;
case IPPROTO_IP:
name = linux_to_bsd_ip_sockopt(args->optname);
break;
case IPPROTO_TCP:
/* Linux TCP option values match BSD's */
name = args->optname;
break;
default:
name = -1;
break;
}
if (name == -1)
return (EINVAL);
bsd_args.name = name;
bsd_args.val = PTRIN(args->optval);
bsd_args.avalsize = PTRIN(args->optlen);
if (name == IPV6_NEXTHOP) {
- error = getsockopt(td, &bsd_args);
+ error = sys_getsockopt(td, &bsd_args);
bsd_to_linux_sockaddr((struct sockaddr *)bsd_args.val);
} else
- error = getsockopt(td, &bsd_args);
+ error = sys_getsockopt(td, &bsd_args);
return (error);
}
/* Argument list sizes for linux_socketcall */
#define LINUX_AL(x) ((x) * sizeof(l_ulong))
static const unsigned char lxs_args[] = {
LINUX_AL(0) /* unused */, LINUX_AL(3) /* socket */,
LINUX_AL(3) /* bind */, LINUX_AL(3) /* connect */,
LINUX_AL(2) /* listen */, LINUX_AL(3) /* accept */,
LINUX_AL(3) /* getsockname */, LINUX_AL(3) /* getpeername */,
LINUX_AL(4) /* socketpair */, LINUX_AL(4) /* send */,
LINUX_AL(4) /* recv */, LINUX_AL(6) /* sendto */,
LINUX_AL(6) /* recvfrom */, LINUX_AL(2) /* shutdown */,
LINUX_AL(5) /* setsockopt */, LINUX_AL(5) /* getsockopt */,
LINUX_AL(3) /* sendmsg */, LINUX_AL(3) /* recvmsg */,
LINUX_AL(4) /* accept4 */
};
#define LINUX_AL_SIZE (sizeof(lxs_args) / sizeof(lxs_args[0]) - 1)
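/*
 * Worked example of the table above: LINUX_SENDTO is entry 11, so
 * lxs_args[LINUX_SENDTO] == LINUX_AL(6) and six l_ulong slots are
 * copied in from the user-supplied argument block before dispatching.
 * (The LINUX_* selector values themselves come from the Linux ABI
 * headers; the indices here are simply read off the table.)
 */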
int
linux_socketcall(struct thread *td, struct linux_socketcall_args *args)
{
l_ulong a[6];
void *arg;
int error;
if (args->what < LINUX_SOCKET || args->what > LINUX_AL_SIZE)
return (EINVAL);
error = copyin(PTRIN(args->args), a, lxs_args[args->what]);
if (error)
return (error);
arg = a;
switch (args->what) {
case LINUX_SOCKET:
return (linux_socket(td, arg));
case LINUX_BIND:
return (linux_bind(td, arg));
case LINUX_CONNECT:
return (linux_connect(td, arg));
case LINUX_LISTEN:
return (linux_listen(td, arg));
case LINUX_ACCEPT:
return (linux_accept(td, arg));
case LINUX_GETSOCKNAME:
return (linux_getsockname(td, arg));
case LINUX_GETPEERNAME:
return (linux_getpeername(td, arg));
case LINUX_SOCKETPAIR:
return (linux_socketpair(td, arg));
case LINUX_SEND:
return (linux_send(td, arg));
case LINUX_RECV:
return (linux_recv(td, arg));
case LINUX_SENDTO:
return (linux_sendto(td, arg));
case LINUX_RECVFROM:
return (linux_recvfrom(td, arg));
case LINUX_SHUTDOWN:
return (linux_shutdown(td, arg));
case LINUX_SETSOCKOPT:
return (linux_setsockopt(td, arg));
case LINUX_GETSOCKOPT:
return (linux_getsockopt(td, arg));
case LINUX_SENDMSG:
return (linux_sendmsg(td, arg));
case LINUX_RECVMSG:
return (linux_recvmsg(td, arg));
case LINUX_ACCEPT4:
return (linux_accept4(td, arg));
}
uprintf("LINUX: 'socket' typ=%d not implemented\n", args->what);
return (ENOSYS);
}
Index: head/sys/compat/linux/linux_uid16.c
===================================================================
--- head/sys/compat/linux/linux_uid16.c (revision 225616)
+++ head/sys/compat/linux/linux_uid16.c (revision 225617)
@@ -1,306 +1,306 @@
/*-
* Copyright (c) 2001 The FreeBSD Project
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/fcntl.h>
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#ifdef COMPAT_LINUX32
#include <machine/../linux32/linux.h>
#include <machine/../linux32/linux32_proto.h>
#else
#include <machine/../linux/linux.h>
#include <machine/../linux/linux_proto.h>
#endif
#include <compat/linux/linux_util.h>
DUMMY(setfsuid16);
DUMMY(setfsgid16);
DUMMY(getresuid16);
DUMMY(getresgid16);
#define CAST_NOCHG(x) (((x) == 0xFFFF) ? -1 : (x))
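/*
 * Linux's 16-bit credential calls pass 0xFFFF (-1 in 16 bits) to mean
 * "leave this ID unchanged"; CAST_NOCHG() widens that back into the -1
 * the native set*uid()/set*gid() interfaces expect.  For example, a
 * Linux setreuid16(-1, 1001) reaches the native layer below as
 * setreuid(-1, 1001).
 */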
int
linux_chown16(struct thread *td, struct linux_chown16_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(chown16))
printf(ARGS(chown16, "%s, %d, %d"), path, args->uid, args->gid);
#endif
error = kern_chown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid),
CAST_NOCHG(args->gid));
LFREEPATH(path);
return (error);
}
int
linux_lchown16(struct thread *td, struct linux_lchown16_args *args)
{
char *path;
int error;
LCONVPATHEXIST(td, args->path, &path);
#ifdef DEBUG
if (ldebug(lchown16))
printf(ARGS(lchown16, "%s, %d, %d"), path, args->uid,
args->gid);
#endif
error = kern_lchown(td, path, UIO_SYSSPACE, CAST_NOCHG(args->uid),
CAST_NOCHG(args->gid));
LFREEPATH(path);
return (error);
}
int
linux_setgroups16(struct thread *td, struct linux_setgroups16_args *args)
{
struct ucred *newcred, *oldcred;
l_gid16_t *linux_gidset;
gid_t *bsd_gidset;
int ngrp, error;
struct proc *p;
#ifdef DEBUG
if (ldebug(setgroups16))
printf(ARGS(setgroups16, "%d, *"), args->gidsetsize);
#endif
ngrp = args->gidsetsize;
if (ngrp < 0 || ngrp >= ngroups_max + 1)
return (EINVAL);
linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_TEMP, M_WAITOK);
error = copyin(args->gidset, linux_gidset, ngrp * sizeof(l_gid16_t));
if (error) {
free(linux_gidset, M_TEMP);
return (error);
}
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
/*
* cr_groups[0] holds egid. Setting the whole set from
* the supplied set will cause egid to be changed too.
* Keep cr_groups[0] unchanged to prevent that.
*/
if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0)) != 0) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
if (ngrp > 0) {
newcred->cr_ngroups = ngrp + 1;
bsd_gidset = newcred->cr_groups;
ngrp--;
while (ngrp >= 0) {
bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
ngrp--;
}
}
else
newcred->cr_ngroups = 1;
setsugid(td->td_proc);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
error = 0;
out:
free(linux_gidset, M_TEMP);
return (error);
}
int
linux_getgroups16(struct thread *td, struct linux_getgroups16_args *args)
{
struct ucred *cred;
l_gid16_t *linux_gidset;
gid_t *bsd_gidset;
int bsd_gidsetsz, ngrp, error;
#ifdef DEBUG
if (ldebug(getgroups16))
printf(ARGS(getgroups16, "%d, *"), args->gidsetsize);
#endif
cred = td->td_ucred;
bsd_gidset = cred->cr_groups;
bsd_gidsetsz = cred->cr_ngroups - 1;
/*
* cr_groups[0] holds egid. Returning the whole set
* here will cause a duplicate. Exclude cr_groups[0]
* to prevent that.
*/
if ((ngrp = args->gidsetsize) == 0) {
td->td_retval[0] = bsd_gidsetsz;
return (0);
}
if (ngrp < bsd_gidsetsz)
return (EINVAL);
ngrp = 0;
linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
M_TEMP, M_WAITOK);
while (ngrp < bsd_gidsetsz) {
linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
ngrp++;
}
error = copyout(linux_gidset, args->gidset, ngrp * sizeof(l_gid16_t));
free(linux_gidset, M_TEMP);
if (error)
return (error);
td->td_retval[0] = ngrp;
return (0);
}
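/*
 * Credential layout assumed by the two routines above, matching their
 * inline comments: cr_groups[0] holds the effective GID and
 * cr_groups[1..cr_ngroups-1] the supplementary groups, so a credential
 * with cr_ngroups == 4 presents exactly three groups through the Linux
 * getgroups16()/setgroups16() view.
 */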
/*
* The FreeBSD native getgid(2) and getuid(2) also modify td->td_retval[1]
* when COMPAT_43 is defined. This clobbers registers that are assumed to
* be preserved. The following lightweight syscalls fix this. See also
* linux_getpid(2), linux_getgid(2) and linux_getuid(2) in linux_misc.c
*
* linux_getgid16() - MP SAFE
* linux_getuid16() - MP SAFE
*/
int
linux_getgid16(struct thread *td, struct linux_getgid16_args *args)
{
td->td_retval[0] = td->td_ucred->cr_rgid;
return (0);
}
int
linux_getuid16(struct thread *td, struct linux_getuid16_args *args)
{
td->td_retval[0] = td->td_ucred->cr_ruid;
return (0);
}
int
linux_getegid16(struct thread *td, struct linux_getegid16_args *args)
{
struct getegid_args bsd;
- return (getegid(td, &bsd));
+ return (sys_getegid(td, &bsd));
}
int
linux_geteuid16(struct thread *td, struct linux_geteuid16_args *args)
{
struct geteuid_args bsd;
- return (geteuid(td, &bsd));
+ return (sys_geteuid(td, &bsd));
}
int
linux_setgid16(struct thread *td, struct linux_setgid16_args *args)
{
struct setgid_args bsd;
bsd.gid = args->gid;
- return (setgid(td, &bsd));
+ return (sys_setgid(td, &bsd));
}
int
linux_setuid16(struct thread *td, struct linux_setuid16_args *args)
{
struct setuid_args bsd;
bsd.uid = args->uid;
- return (setuid(td, &bsd));
+ return (sys_setuid(td, &bsd));
}
int
linux_setregid16(struct thread *td, struct linux_setregid16_args *args)
{
struct setregid_args bsd;
bsd.rgid = CAST_NOCHG(args->rgid);
bsd.egid = CAST_NOCHG(args->egid);
- return (setregid(td, &bsd));
+ return (sys_setregid(td, &bsd));
}
int
linux_setreuid16(struct thread *td, struct linux_setreuid16_args *args)
{
struct setreuid_args bsd;
bsd.ruid = CAST_NOCHG(args->ruid);
bsd.euid = CAST_NOCHG(args->euid);
- return (setreuid(td, &bsd));
+ return (sys_setreuid(td, &bsd));
}
int
linux_setresgid16(struct thread *td, struct linux_setresgid16_args *args)
{
struct setresgid_args bsd;
bsd.rgid = CAST_NOCHG(args->rgid);
bsd.egid = CAST_NOCHG(args->egid);
bsd.sgid = CAST_NOCHG(args->sgid);
- return (setresgid(td, &bsd));
+ return (sys_setresgid(td, &bsd));
}
int
linux_setresuid16(struct thread *td, struct linux_setresuid16_args *args)
{
struct setresuid_args bsd;
bsd.ruid = CAST_NOCHG(args->ruid);
bsd.euid = CAST_NOCHG(args->euid);
bsd.suid = CAST_NOCHG(args->suid);
- return (setresuid(td, &bsd));
+ return (sys_setresuid(td, &bsd));
}
Index: head/sys/compat/svr4/svr4_fcntl.c
===================================================================
--- head/sys/compat/svr4/svr4_fcntl.c (revision 225616)
+++ head/sys/compat/svr4/svr4_fcntl.c (revision 225617)
@@ -1,724 +1,724 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994, 1997 Christos Zoulas.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christos Zoulas.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
/*#include <sys/ioctl.h>*/
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/sysproto.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_fcntl.h>
#include <security/mac/mac_framework.h>
static int svr4_to_bsd_flags(int);
static u_long svr4_to_bsd_cmd(u_long);
static int fd_revoke(struct thread *, int);
static int fd_truncate(struct thread *, int, struct flock *);
static int bsd_to_svr4_flags(int);
static void bsd_to_svr4_flock(struct flock *, struct svr4_flock *);
static void svr4_to_bsd_flock(struct svr4_flock *, struct flock *);
static void bsd_to_svr4_flock64(struct flock *, struct svr4_flock64 *);
static void svr4_to_bsd_flock64(struct svr4_flock64 *, struct flock *);
static u_long
svr4_to_bsd_cmd(cmd)
u_long cmd;
{
switch (cmd) {
case SVR4_F_DUPFD:
return F_DUPFD;
case SVR4_F_DUP2FD:
return F_DUP2FD;
case SVR4_F_GETFD:
return F_GETFD;
case SVR4_F_SETFD:
return F_SETFD;
case SVR4_F_GETFL:
return F_GETFL;
case SVR4_F_SETFL:
return F_SETFL;
case SVR4_F_GETLK:
return F_GETLK;
case SVR4_F_SETLK:
return F_SETLK;
case SVR4_F_SETLKW:
return F_SETLKW;
default:
return -1;
}
}
static int
svr4_to_bsd_flags(l)
int l;
{
int r = 0;
r |= (l & SVR4_O_RDONLY) ? O_RDONLY : 0;
r |= (l & SVR4_O_WRONLY) ? O_WRONLY : 0;
r |= (l & SVR4_O_RDWR) ? O_RDWR : 0;
r |= (l & SVR4_O_NDELAY) ? O_NONBLOCK : 0;
r |= (l & SVR4_O_APPEND) ? O_APPEND : 0;
r |= (l & SVR4_O_SYNC) ? O_FSYNC : 0;
r |= (l & SVR4_O_NONBLOCK) ? O_NONBLOCK : 0;
r |= (l & SVR4_O_PRIV) ? O_EXLOCK : 0;
r |= (l & SVR4_O_CREAT) ? O_CREAT : 0;
r |= (l & SVR4_O_TRUNC) ? O_TRUNC : 0;
r |= (l & SVR4_O_EXCL) ? O_EXCL : 0;
r |= (l & SVR4_O_NOCTTY) ? O_NOCTTY : 0;
return r;
}
static int
bsd_to_svr4_flags(l)
int l;
{
int r = 0;
r |= (l & O_RDONLY) ? SVR4_O_RDONLY : 0;
r |= (l & O_WRONLY) ? SVR4_O_WRONLY : 0;
r |= (l & O_RDWR) ? SVR4_O_RDWR : 0;
r |= (l & O_NDELAY) ? SVR4_O_NONBLOCK : 0;
r |= (l & O_APPEND) ? SVR4_O_APPEND : 0;
r |= (l & O_FSYNC) ? SVR4_O_SYNC : 0;
r |= (l & O_NONBLOCK) ? SVR4_O_NONBLOCK : 0;
r |= (l & O_EXLOCK) ? SVR4_O_PRIV : 0;
r |= (l & O_CREAT) ? SVR4_O_CREAT : 0;
r |= (l & O_TRUNC) ? SVR4_O_TRUNC : 0;
r |= (l & O_EXCL) ? SVR4_O_EXCL : 0;
r |= (l & O_NOCTTY) ? SVR4_O_NOCTTY : 0;
return r;
}
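/*
 * Example of the two flag translations above: an SVR4 open with
 * (SVR4_O_RDWR | SVR4_O_CREAT | SVR4_O_NDELAY) becomes the native
 * (O_RDWR | O_CREAT | O_NONBLOCK), and F_GETFL results are run through
 * bsd_to_svr4_flags() before being handed back to the emulated program.
 */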
static void
bsd_to_svr4_flock(iflp, oflp)
struct flock *iflp;
struct svr4_flock *oflp;
{
switch (iflp->l_type) {
case F_RDLCK:
oflp->l_type = SVR4_F_RDLCK;
break;
case F_WRLCK:
oflp->l_type = SVR4_F_WRLCK;
break;
case F_UNLCK:
oflp->l_type = SVR4_F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = (short) iflp->l_whence;
oflp->l_start = (svr4_off_t) iflp->l_start;
oflp->l_len = (svr4_off_t) iflp->l_len;
oflp->l_sysid = 0;
oflp->l_pid = (svr4_pid_t) iflp->l_pid;
}
static void
svr4_to_bsd_flock(iflp, oflp)
struct svr4_flock *iflp;
struct flock *oflp;
{
switch (iflp->l_type) {
case SVR4_F_RDLCK:
oflp->l_type = F_RDLCK;
break;
case SVR4_F_WRLCK:
oflp->l_type = F_WRLCK;
break;
case SVR4_F_UNLCK:
oflp->l_type = F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = iflp->l_whence;
oflp->l_start = (off_t) iflp->l_start;
oflp->l_len = (off_t) iflp->l_len;
oflp->l_pid = (pid_t) iflp->l_pid;
oflp->l_sysid = iflp->l_sysid;
}
static void
bsd_to_svr4_flock64(iflp, oflp)
struct flock *iflp;
struct svr4_flock64 *oflp;
{
switch (iflp->l_type) {
case F_RDLCK:
oflp->l_type = SVR4_F_RDLCK;
break;
case F_WRLCK:
oflp->l_type = SVR4_F_WRLCK;
break;
case F_UNLCK:
oflp->l_type = SVR4_F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = (short) iflp->l_whence;
oflp->l_start = (svr4_off64_t) iflp->l_start;
oflp->l_len = (svr4_off64_t) iflp->l_len;
oflp->l_sysid = iflp->l_sysid;
oflp->l_pid = (svr4_pid_t) iflp->l_pid;
}
static void
svr4_to_bsd_flock64(iflp, oflp)
struct svr4_flock64 *iflp;
struct flock *oflp;
{
switch (iflp->l_type) {
case SVR4_F_RDLCK:
oflp->l_type = F_RDLCK;
break;
case SVR4_F_WRLCK:
oflp->l_type = F_WRLCK;
break;
case SVR4_F_UNLCK:
oflp->l_type = F_UNLCK;
break;
default:
oflp->l_type = -1;
break;
}
oflp->l_whence = iflp->l_whence;
oflp->l_start = (off_t) iflp->l_start;
oflp->l_len = (off_t) iflp->l_len;
oflp->l_pid = (pid_t) iflp->l_pid;
}
static int
fd_revoke(td, fd)
struct thread *td;
int fd;
{
struct vnode *vp;
struct mount *mp;
struct vattr vattr;
int error, *retval;
retval = td->td_retval;
/*
* If we ever want to support Capsicum on SVR4 processes (unlikely)
* or FreeBSD grows a native frevoke() (more likely), we will need a
* CAP_REVOKE here.
*
* In the meantime, use CAP_MASK_VALID: if a SVR4 process wants to
* do an frevoke(), it needs to do it on either a regular file
* descriptor or a fully-privileged capability (which is effectively
* the same as a non-capability-restricted file descriptor).
*/
if ((error = fgetvp(td, fd, CAP_MASK_VALID, &vp)) != 0)
return (error);
if (vp->v_type != VCHR && vp->v_type != VBLK) {
error = EINVAL;
goto out;
}
#ifdef MAC
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_vnode_check_revoke(td->td_ucred, vp);
VOP_UNLOCK(vp, 0);
if (error)
goto out;
#endif
if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred)) != 0)
goto out;
if (td->td_ucred->cr_uid != vattr.va_uid &&
(error = priv_check(td, PRIV_VFS_ADMIN)) != 0)
goto out;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto out;
if (vcount(vp) > 1)
VOP_REVOKE(vp, REVOKEALL);
vn_finished_write(mp);
out:
vrele(vp);
return error;
}
static int
fd_truncate(td, fd, flp)
struct thread *td;
int fd;
struct flock *flp;
{
off_t start, length;
struct file *fp;
struct vnode *vp;
struct vattr vattr;
int error, *retval;
struct ftruncate_args ft;
retval = td->td_retval;
/*
* We only support truncating the file.
*/
if ((error = fget(td, fd, CAP_FTRUNCATE, &fp)) != 0)
return (error);
vp = fp->f_vnode;
if (fp->f_type != DTYPE_VNODE || vp->v_type == VFIFO) {
fdrop(fp, td);
return ESPIPE;
}
if ((error = VOP_GETATTR(vp, &vattr, td->td_ucred)) != 0) {
fdrop(fp, td);
return error;
}
length = vattr.va_size;
switch (flp->l_whence) {
case SEEK_CUR:
start = fp->f_offset + flp->l_start;
break;
case SEEK_END:
start = flp->l_start + length;
break;
case SEEK_SET:
start = flp->l_start;
break;
default:
fdrop(fp, td);
return EINVAL;
}
if (start + flp->l_len < length) {
/* We don't support freeing space in the middle of the file */
fdrop(fp, td);
return EINVAL;
}
ft.fd = fd;
ft.length = start;
- error = ftruncate(td, &ft);
+ error = sys_ftruncate(td, &ft);
fdrop(fp, td);
return (error);
}
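/*
 * Worked example of fd_truncate(), which backs the SVR4_F_FREESP
 * fcntls below: with l_whence == SEEK_SET, l_start == 4096 and
 * l_len == 6000 on a 10000-byte file, start + l_len (10096) reaches
 * end of file, so the request is accepted and implemented as
 * sys_ftruncate(fd, 4096).  A request that would leave data beyond the
 * freed range is rejected with EINVAL, as the comment above notes.
 */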
int
svr4_sys_open(td, uap)
struct thread *td;
struct svr4_sys_open_args *uap;
{
struct proc *p = td->td_proc;
char *newpath;
int bsd_flags, error, retval;
CHECKALTEXIST(td, uap->path, &newpath);
bsd_flags = svr4_to_bsd_flags(uap->flags);
error = kern_open(td, newpath, UIO_SYSSPACE, bsd_flags, uap->mode);
free(newpath, M_TEMP);
if (error) {
/* uprintf("svr4_open(%s, 0x%0x, 0%o): %d\n", uap->path,
uap->flags, uap->mode, error);*/
return error;
}
retval = td->td_retval[0];
PROC_LOCK(p);
if (!(bsd_flags & O_NOCTTY) && SESS_LEADER(p) &&
!(p->p_flag & P_CONTROLT)) {
#if defined(NOTYET)
struct file *fp;
error = fget(td, retval, CAP_IOCTL, &fp);
PROC_UNLOCK(p);
/*
* we may have lost a race between the above open() and
* another thread issuing a close()
*/
if (error)
return (EBADF); /* XXX: correct errno? */
/* ignore any error, just give it a try */
if (fp->f_type == DTYPE_VNODE)
fo_ioctl(fp, TIOCSCTTY, (caddr_t) 0, td->td_ucred,
td);
fdrop(fp, td);
} else {
PROC_UNLOCK(p);
}
#else
}
PROC_UNLOCK(p);
#endif
return error;
}
int
svr4_sys_open64(td, uap)
struct thread *td;
struct svr4_sys_open64_args *uap;
{
return svr4_sys_open(td, (struct svr4_sys_open_args *)uap);
}
int
svr4_sys_creat(td, uap)
struct thread *td;
struct svr4_sys_creat_args *uap;
{
char *newpath;
int error;
CHECKALTEXIST(td, uap->path, &newpath);
error = kern_open(td, newpath, UIO_SYSSPACE, O_WRONLY | O_CREAT |
O_TRUNC, uap->mode);
free(newpath, M_TEMP);
return (error);
}
int
svr4_sys_creat64(td, uap)
struct thread *td;
struct svr4_sys_creat64_args *uap;
{
return svr4_sys_creat(td, (struct svr4_sys_creat_args *)uap);
}
int
svr4_sys_llseek(td, uap)
struct thread *td;
struct svr4_sys_llseek_args *uap;
{
struct lseek_args ap;
ap.fd = uap->fd;
#if BYTE_ORDER == BIG_ENDIAN
ap.offset = (((u_int64_t) uap->offset1) << 32) |
uap->offset2;
#else
ap.offset = (((u_int64_t) uap->offset2) << 32) |
uap->offset1;
#endif
ap.whence = uap->whence;
- return lseek(td, &ap);
+ return sys_lseek(td, &ap);
}
int
svr4_sys_access(td, uap)
struct thread *td;
struct svr4_sys_access_args *uap;
{
char *newpath;
int error;
CHECKALTEXIST(td, uap->path, &newpath);
error = kern_access(td, newpath, UIO_SYSSPACE, uap->flags);
free(newpath, M_TEMP);
return (error);
}
#if defined(NOTYET)
int
svr4_sys_pread(td, uap)
struct thread *td;
struct svr4_sys_pread_args *uap;
{
struct pread_args pra;
/*
* Just translate the args structure and call the NetBSD
* pread(2) system call (offset type is 64-bit in NetBSD).
*/
pra.fd = uap->fd;
pra.buf = uap->buf;
pra.nbyte = uap->nbyte;
pra.offset = uap->off;
return pread(td, &pra);
}
#endif
#if defined(NOTYET)
int
svr4_sys_pread64(td, v, retval)
struct thread *td;
void *v;
register_t *retval;
{
struct svr4_sys_pread64_args *uap = v;
struct sys_pread_args pra;
/*
* Just translate the args structure and call the NetBSD
* pread(2) system call (offset type is 64-bit in NetBSD).
*/
pra.fd = uap->fd;
pra.buf = uap->buf;
pra.nbyte = uap->nbyte;
pra.offset = uap->off;
return (sys_pread(td, &pra, retval));
}
#endif /* NOTYET */
#if defined(NOTYET)
int
svr4_sys_pwrite(td, uap)
struct thread *td;
struct svr4_sys_pwrite_args *uap;
{
struct pwrite_args pwa;
/*
* Just translate the args structure and call the NetBSD
* pwrite(2) system call (offset type is 64-bit in NetBSD).
*/
pwa.fd = uap->fd;
pwa.buf = uap->buf;
pwa.nbyte = uap->nbyte;
pwa.offset = uap->off;
return pwrite(td, &pwa);
}
#endif
#if defined(NOTYET)
int
svr4_sys_pwrite64(td, v, retval)
struct thread *td;
void *v;
register_t *retval;
{
struct svr4_sys_pwrite64_args *uap = v;
struct sys_pwrite_args pwa;
/*
* Just translate the args structure and call the NetBSD
* pwrite(2) system call (offset type is 64-bit in NetBSD).
*/
pwa.fd = uap->fd;
pwa.buf = uap->buf;
pwa.nbyte = uap->nbyte;
pwa.offset = uap->off;
return (sys_pwrite(td, &pwa, retval));
}
#endif /* NOTYET */
int
svr4_sys_fcntl(td, uap)
struct thread *td;
struct svr4_sys_fcntl_args *uap;
{
int cmd, error, *retval;
retval = td->td_retval;
cmd = svr4_to_bsd_cmd(uap->cmd);
switch (cmd) {
case F_DUPFD:
case F_DUP2FD:
case F_GETFD:
case F_SETFD:
return (kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg));
case F_GETFL:
error = kern_fcntl(td, uap->fd, cmd, (intptr_t)uap->arg);
if (error)
return (error);
*retval = bsd_to_svr4_flags(*retval);
return (error);
case F_SETFL:
{
/*
* we must save the O_ASYNC flag, as that is
* handled by ioctl(_, I_SETSIG, _) emulation.
*/
int flags;
DPRINTF(("Setting flags %p\n", uap->arg));
error = kern_fcntl(td, uap->fd, F_GETFL, 0);
if (error)
return (error);
flags = *retval;
flags &= O_ASYNC;
flags |= svr4_to_bsd_flags((u_long) uap->arg);
return (kern_fcntl(td, uap->fd, F_SETFL, flags));
}
case F_GETLK:
case F_SETLK:
case F_SETLKW:
{
struct svr4_flock ifl;
struct flock fl;
error = copyin(uap->arg, &ifl, sizeof (ifl));
if (error)
return (error);
svr4_to_bsd_flock(&ifl, &fl);
error = kern_fcntl(td, uap->fd, cmd, (intptr_t)&fl);
if (error || cmd != F_GETLK)
return (error);
bsd_to_svr4_flock(&fl, &ifl);
return (copyout(&ifl, uap->arg, sizeof (ifl)));
}
case -1:
switch (uap->cmd) {
case SVR4_F_FREESP:
{
struct svr4_flock ifl;
struct flock fl;
error = copyin(uap->arg, &ifl,
sizeof ifl);
if (error)
return error;
svr4_to_bsd_flock(&ifl, &fl);
return fd_truncate(td, uap->fd, &fl);
}
case SVR4_F_GETLK64:
case SVR4_F_SETLK64:
case SVR4_F_SETLKW64:
{
struct svr4_flock64 ifl;
struct flock fl;
switch (uap->cmd) {
case SVR4_F_GETLK64:
cmd = F_GETLK;
break;
case SVR4_F_SETLK64:
cmd = F_SETLK;
break;
case SVR4_F_SETLKW64:
cmd = F_SETLKW;
break;
}
error = copyin(uap->arg, &ifl,
sizeof (ifl));
if (error)
return (error);
svr4_to_bsd_flock64(&ifl, &fl);
error = kern_fcntl(td, uap->fd, cmd,
(intptr_t)&fl);
if (error || cmd != F_GETLK)
return (error);
bsd_to_svr4_flock64(&fl, &ifl);
return (copyout(&ifl, uap->arg,
sizeof (ifl)));
}
case SVR4_F_FREESP64:
{
struct svr4_flock64 ifl;
struct flock fl;
error = copyin(uap->arg, &ifl,
sizeof ifl);
if (error)
return error;
svr4_to_bsd_flock64(&ifl, &fl);
return fd_truncate(td, uap->fd, &fl);
}
case SVR4_F_REVOKE:
return fd_revoke(td, uap->fd);
default:
return ENOSYS;
}
default:
return ENOSYS;
}
}
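/*
 * Note on the F_SETFL case above: the emulated program drives O_ASYNC
 * through the ioctl(_, I_SETSIG, _) emulation rather than fcntl(), so
 * the current O_ASYNC state is read back with F_GETFL and OR-ed into
 * the translated flags before F_SETFL is applied, preserving it across
 * the call.
 */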
Index: head/sys/compat/svr4/svr4_filio.c
===================================================================
--- head/sys/compat/svr4/svr4_filio.c (revision 225616)
+++ head/sys/compat/svr4/svr4_filio.c (revision 225617)
@@ -1,249 +1,249 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/signal.h>
#include <sys/filedesc.h>
#include <sys/poll.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_ioctl.h>
#include <compat/svr4/svr4_filio.h>
/*#define GROTTY_READ_HACK*/
int
svr4_sys_poll(td, uap)
struct thread *td;
struct svr4_sys_poll_args *uap;
{
int error;
struct poll_args pa;
struct pollfd *pfd;
int idx = 0, cerr;
u_long siz;
if (uap->nfds > maxfilesperproc && uap->nfds > FD_SETSIZE)
return (EINVAL);
pa.fds = uap->fds;
pa.nfds = uap->nfds;
pa.timeout = uap->timeout;
siz = uap->nfds * sizeof(struct pollfd);
pfd = (struct pollfd *)malloc(siz, M_TEMP, M_WAITOK);
- error = poll(td, (struct poll_args *)uap);
+ error = sys_poll(td, (struct poll_args *)uap);
if ((cerr = copyin(uap->fds, pfd, siz)) != 0) {
error = cerr;
goto done;
}
for (idx = 0; idx < uap->nfds; idx++) {
/* POLLWRNORM already equals POLLOUT, so we don't worry about that */
if (pfd[idx].revents & (POLLOUT | POLLWRNORM | POLLWRBAND))
pfd[idx].revents |= (POLLOUT | POLLWRNORM | POLLWRBAND);
}
if ((cerr = copyout(pfd, uap->fds, siz)) != 0) {
error = cerr;
goto done; /* yeah, I know it's the next line, but this way I won't
forget to update it if I add more code */
}
done:
free(pfd, M_TEMP);
return error;
}
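/*
 * The revents fix-up loop above reflects the SVR4 expectation that
 * POLLOUT, POLLWRNORM and POLLWRBAND travel together: if the native
 * poll reported any of the three for a descriptor, all three are set
 * before the array is copied back out to the emulated program.
 */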
#if defined(READ_TEST)
int
svr4_sys_read(td, uap)
struct thread *td;
struct svr4_sys_read_args *uap;
{
struct read_args ra;
struct file *fp;
struct socket *so = NULL;
int so_state;
sigset_t sigmask;
int rv;
ra.fd = uap->fd;
ra.buf = uap->buf;
ra.nbyte = uap->nbyte;
if (fget(td, uap->fd, CAP_READ, &fp) != 0) {
DPRINTF(("Something fishy with the user-supplied file descriptor...\n"));
return EBADF;
}
if (fp->f_type == DTYPE_SOCKET) {
so = fp->f_data;
DPRINTF(("fd %d is a socket\n", uap->fd));
if (so->so_state & SS_ASYNC) {
DPRINTF(("fd %d is an ASYNC socket!\n", uap->fd));
}
DPRINTF(("Here are its flags: 0x%x\n", so->so_state));
#if defined(GROTTY_READ_HACK)
so_state = so->so_state;
so->so_state &= ~SS_NBIO;
#endif
}
rv = read(td, &ra);
DPRINTF(("svr4_read(%d, 0x%0x, %d) = %d\n",
uap->fd, uap->buf, uap->nbyte, rv));
if (rv == EAGAIN) {
#ifdef DEBUG_SVR4
struct sigacts *ps;
PROC_LOCK(td->td_proc);
ps = td->td_proc->p_sigacts;
mtx_lock(&ps->ps_mtx);
#endif
DPRINTF(("sigmask = 0x%x\n", td->td_sigmask));
DPRINTF(("sigignore = 0x%x\n", ps->ps_sigignore));
DPRINTF(("sigcaught = 0x%x\n", ps->ps_sigcatch));
DPRINTF(("siglist = 0x%x\n", td->td_siglist));
#ifdef DEBUG_SVR4
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(td->td_proc);
#endif
}
#if defined(GROTTY_READ_HACK)
if (so) { /* We've already checked to see if this is a socket */
so->so_state = so_state;
}
#endif
fdrop(fp, td);
return(rv);
}
#endif /* READ_TEST */
#if defined(BOGUS)
int
svr4_sys_write(td, uap)
struct thread *td;
struct svr4_sys_write_args *uap;
{
struct write_args wa;
struct file *fp;
int rv;
wa.fd = uap->fd;
wa.buf = uap->buf;
wa.nbyte = uap->nbyte;
rv = write(td, &wa);
DPRINTF(("svr4_write(%d, 0x%0x, %d) = %d\n",
uap->fd, uap->buf, uap->nbyte, rv));
return(rv);
}
#endif /* BOGUS */
int
svr4_fil_ioctl(fp, td, retval, fd, cmd, data)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t data;
{
int error;
int num;
struct filedesc *fdp = td->td_proc->p_fd;
*retval = 0;
switch (cmd) {
case SVR4_FIOCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
return 0;
case SVR4_FIONCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
return 0;
case SVR4_FIOGETOWN:
case SVR4_FIOSETOWN:
case SVR4_FIOASYNC:
case SVR4_FIONBIO:
case SVR4_FIONREAD:
if ((error = copyin(data, &num, sizeof(num))) != 0)
return error;
switch (cmd) {
case SVR4_FIOGETOWN: cmd = FIOGETOWN; break;
case SVR4_FIOSETOWN: cmd = FIOSETOWN; break;
case SVR4_FIOASYNC: cmd = FIOASYNC; break;
case SVR4_FIONBIO: cmd = FIONBIO; break;
case SVR4_FIONREAD: cmd = FIONREAD; break;
}
#ifdef SVR4_DEBUG
if (cmd == FIOASYNC) DPRINTF(("FIOASYNC\n"));
#endif
error = fo_ioctl(fp, cmd, (caddr_t) &num, td->td_ucred, td);
if (error)
return error;
return copyout(&num, data, sizeof(num));
default:
DPRINTF(("Unknown svr4 filio %lx\n", cmd));
return 0; /* ENOSYS really */
}
}
Index: head/sys/compat/svr4/svr4_ipc.c
===================================================================
--- head/sys/compat/svr4/svr4_ipc.c (revision 225616)
+++ head/sys/compat/svr4/svr4_ipc.c (revision 225617)
@@ -1,707 +1,707 @@
/*-
* Copyright (c) 1995 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Portions of this code have been derived from software contributed
* to the FreeBSD Project by Mark Newton.
*
* Copyright (c) 1999 Mark Newton
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* XXX- This code is presently a no-op on FreeBSD (and isn't compiled due
* to preprocessor conditionals). A nice project for a kernel hacking
* novice might be to MakeItGo, but I have more important fish to fry
* at present.
*
* Derived from: $NetBSD: svr4_ipc.c,v 1.7 1998/10/19 22:43:00 tron Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <sys/proc.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_ipc.h>
#if defined(SYSVMSG) || defined(SYSVSHM) || defined(SYSVSEM)
static void svr4_to_bsd_ipc_perm(const struct svr4_ipc_perm *,
struct ipc_perm *);
static void bsd_to_svr4_ipc_perm(const struct ipc_perm *,
struct svr4_ipc_perm *);
#endif
#ifdef SYSVSEM
static void bsd_to_svr4_semid_ds(const struct semid_ds *,
struct svr4_semid_ds *);
static void svr4_to_bsd_semid_ds(const struct svr4_semid_ds *,
struct semid_ds *);
static int svr4_semop(struct thread *, void *);
static int svr4_semget(struct thread *, void *);
static int svr4_semctl(struct thread *, void *);
#endif
#ifdef SYSVMSG
static void bsd_to_svr4_msqid_ds(const struct msqid_ds *,
struct svr4_msqid_ds *);
static void svr4_to_bsd_msqid_ds(const struct svr4_msqid_ds *,
struct msqid_ds *);
static int svr4_msgsnd(struct thread *, void *);
static int svr4_msgrcv(struct thread *, void *);
static int svr4_msgget(struct thread *, void *);
static int svr4_msgctl(struct thread *, void *);
#endif
#ifdef SYSVSHM
static void bsd_to_svr4_shmid_ds(const struct shmid_ds *,
struct svr4_shmid_ds *);
static void svr4_to_bsd_shmid_ds(const struct svr4_shmid_ds *,
struct shmid_ds *);
static int svr4_shmat(struct thread *, void *);
static int svr4_shmdt(struct thread *, void *);
static int svr4_shmget(struct thread *, void *);
static int svr4_shmctl(struct thread *, void *);
#endif
#if defined(SYSVMSG) || defined(SYSVSHM) || defined(SYSVSEM)
static void
svr4_to_bsd_ipc_perm(spp, bpp)
const struct svr4_ipc_perm *spp;
struct ipc_perm *bpp;
{
bpp->key = spp->key;
bpp->uid = spp->uid;
bpp->gid = spp->gid;
bpp->cuid = spp->cuid;
bpp->cgid = spp->cgid;
bpp->mode = spp->mode;
bpp->seq = spp->seq;
}
static void
bsd_to_svr4_ipc_perm(bpp, spp)
const struct ipc_perm *bpp;
struct svr4_ipc_perm *spp;
{
spp->key = bpp->key;
spp->uid = bpp->uid;
spp->gid = bpp->gid;
spp->cuid = bpp->cuid;
spp->cgid = bpp->cgid;
spp->mode = bpp->mode;
spp->seq = bpp->seq;
}
#endif
#ifdef SYSVSEM
static void
bsd_to_svr4_semid_ds(bds, sds)
const struct semid_ds *bds;
struct svr4_semid_ds *sds;
{
bzero(sds, sizeof(*sds));
bsd_to_svr4_ipc_perm(&bds->sem_perm, &sds->sem_perm);
sds->sem_base = (struct svr4_sem *) bds->sem_base;
sds->sem_nsems = bds->sem_nsems;
sds->sem_otime = bds->sem_otime;
sds->sem_ctime = bds->sem_ctime;
}
static void
svr4_to_bsd_semid_ds(sds, bds)
const struct svr4_semid_ds *sds;
struct semid_ds *bds;
{
svr4_to_bsd_ipc_perm(&sds->sem_perm, &bds->sem_perm);
bds->sem_base = (struct sem *) sds->sem_base;
bds->sem_nsems = sds->sem_nsems;
bds->sem_otime = sds->sem_otime;
bds->sem_ctime = sds->sem_ctime;
}
struct svr4_sys_semctl_args {
int what;
int semid;
int semnum;
int cmd;
union semun arg;
};
static int
svr4_semctl(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_semctl_args *uap = v;
struct svr4_semid_ds ss;
struct semid_ds bs;
union semun semun;
register_t rval;
int cmd, error;
switch (uap->cmd) {
case SVR4_SEM_GETZCNT:
cmd = GETZCNT;
break;
case SVR4_SEM_GETNCNT:
cmd = GETNCNT;
break;
case SVR4_SEM_GETPID:
cmd = GETPID;
break;
case SVR4_SEM_GETVAL:
cmd = GETVAL;
break;
case SVR4_SEM_SETVAL:
cmd = SETVAL;
break;
case SVR4_SEM_GETALL:
cmd = GETVAL;
break;
case SVR4_SEM_SETALL:
cmd = SETVAL;
break;
case SVR4_IPC_STAT:
cmd = IPC_STAT;
semun.buf = &bs;
error = kern_semctl(td, uap->semid, uap->semnum, cmd, &semun,
&rval);
if (error)
return (error);
bsd_to_svr4_semid_ds(&bs, &ss);
error = copyout(&ss, uap->arg.buf, sizeof(ss));
if (error == 0)
td->td_retval[0] = rval;
return (error);
case SVR4_IPC_SET:
cmd = IPC_SET;
error = copyin(uap->arg.buf, (caddr_t) &ss, sizeof ss);
if (error)
return (error);
svr4_to_bsd_semid_ds(&ss, &bs);
semun.buf = &bs;
return (kern_semctl(td, uap->semid, uap->semnum, cmd, &semun,
td->td_retval));
case SVR4_IPC_RMID:
cmd = IPC_RMID;
break;
default:
return EINVAL;
}
return (kern_semctl(td, uap->semid, uap->semnum, cmd, &uap->arg,
td->td_retval));
}
struct svr4_sys_semget_args {
int what;
svr4_key_t key;
int nsems;
int semflg;
};
static int
svr4_semget(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_semget_args *uap = v;
struct semget_args ap;
ap.key = uap->key;
ap.nsems = uap->nsems;
ap.semflg = uap->semflg;
- return semget(td, &ap);
+ return sys_semget(td, &ap);
}
struct svr4_sys_semop_args {
int what;
int semid;
struct svr4_sembuf * sops;
u_int nsops;
};
static int
svr4_semop(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_semop_args *uap = v;
struct semop_args ap;
ap.semid = uap->semid;
/* These are the same */
ap.sops = (struct sembuf *) uap->sops;
ap.nsops = uap->nsops;
- return semop(td, &ap);
+ return sys_semop(td, &ap);
}
int
svr4_sys_semsys(td, uap)
struct thread *td;
struct svr4_sys_semsys_args *uap;
{
DPRINTF(("svr4_semsys(%d)\n", uap->what));
switch (uap->what) {
case SVR4_semctl:
return svr4_semctl(td, uap);
case SVR4_semget:
return svr4_semget(td, uap);
case SVR4_semop:
return svr4_semop(td, uap);
default:
return EINVAL;
}
}
MODULE_DEPEND(svr4elf, sysvsem, 1, 1, 1);
#endif
#ifdef SYSVMSG
static void
bsd_to_svr4_msqid_ds(bds, sds)
const struct msqid_ds *bds;
struct svr4_msqid_ds *sds;
{
bzero(sds, sizeof(*sds));
bsd_to_svr4_ipc_perm(&bds->msg_perm, &sds->msg_perm);
sds->msg_first = (struct svr4_msg *) bds->msg_first;
sds->msg_last = (struct svr4_msg *) bds->msg_last;
sds->msg_cbytes = bds->msg_cbytes;
sds->msg_qnum = bds->msg_qnum;
sds->msg_qbytes = bds->msg_qbytes;
sds->msg_lspid = bds->msg_lspid;
sds->msg_lrpid = bds->msg_lrpid;
sds->msg_stime = bds->msg_stime;
sds->msg_rtime = bds->msg_rtime;
sds->msg_ctime = bds->msg_ctime;
}
static void
svr4_to_bsd_msqid_ds(sds, bds)
const struct svr4_msqid_ds *sds;
struct msqid_ds *bds;
{
svr4_to_bsd_ipc_perm(&sds->msg_perm, &bds->msg_perm);
bds->msg_first = (struct msg *) sds->msg_first;
bds->msg_last = (struct msg *) sds->msg_last;
bds->msg_cbytes = sds->msg_cbytes;
bds->msg_qnum = sds->msg_qnum;
bds->msg_qbytes = sds->msg_qbytes;
bds->msg_lspid = sds->msg_lspid;
bds->msg_lrpid = sds->msg_lrpid;
bds->msg_stime = sds->msg_stime;
bds->msg_rtime = sds->msg_rtime;
bds->msg_ctime = sds->msg_ctime;
}
struct svr4_sys_msgsnd_args {
int what;
int msqid;
void * msgp;
size_t msgsz;
int msgflg;
};
static int
svr4_msgsnd(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgsnd_args *uap = v;
struct msgsnd_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgflg = uap->msgflg;
- return msgsnd(td, &ap);
+ return sys_msgsnd(td, &ap);
}
struct svr4_sys_msgrcv_args {
int what;
int msqid;
void * msgp;
size_t msgsz;
long msgtyp;
int msgflg;
};
static int
svr4_msgrcv(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgrcv_args *uap = v;
struct msgrcv_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgtyp = uap->msgtyp;
ap.msgflg = uap->msgflg;
- return msgrcv(td, &ap);
+ return sys_msgrcv(td, &ap);
}
struct svr4_sys_msgget_args {
int what;
svr4_key_t key;
int msgflg;
};
static int
svr4_msgget(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgget_args *uap = v;
struct msgget_args ap;
ap.key = uap->key;
ap.msgflg = uap->msgflg;
- return msgget(td, &ap);
+ return sys_msgget(td, &ap);
}
struct svr4_sys_msgctl_args {
int what;
int msqid;
int cmd;
struct svr4_msqid_ds * buf;
};
static int
svr4_msgctl(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_msgctl_args *uap = v;
struct svr4_msqid_ds ss;
struct msqid_ds bs;
int error;
switch (uap->cmd) {
case SVR4_IPC_STAT:
error = kern_msgctl(td, uap->msqid, IPC_STAT, &bs);
if (error)
return error;
bsd_to_svr4_msqid_ds(&bs, &ss);
return copyout(&ss, uap->buf, sizeof ss);
case SVR4_IPC_SET:
error = copyin(uap->buf, &ss, sizeof ss);
if (error)
return error;
svr4_to_bsd_msqid_ds(&ss, &bs);
return (kern_msgctl(td, uap->msqid, IPC_SET, &bs));
case SVR4_IPC_RMID:
return (kern_msgctl(td, uap->msqid, IPC_RMID, NULL));
default:
return EINVAL;
}
}
int
svr4_sys_msgsys(td, uap)
struct thread *td;
struct svr4_sys_msgsys_args *uap;
{
DPRINTF(("svr4_msgsys(%d)\n", uap->what));
switch (uap->what) {
case SVR4_msgsnd:
return svr4_msgsnd(td, uap);
case SVR4_msgrcv:
return svr4_msgrcv(td, uap);
case SVR4_msgget:
return svr4_msgget(td, uap);
case SVR4_msgctl:
return svr4_msgctl(td, uap);
default:
return EINVAL;
}
}
MODULE_DEPEND(svr4elf, sysvmsg, 1, 1, 1);
#endif
#ifdef SYSVSHM
static void
bsd_to_svr4_shmid_ds(bds, sds)
const struct shmid_ds *bds;
struct svr4_shmid_ds *sds;
{
bzero(sds, sizeof(*sds));
bsd_to_svr4_ipc_perm(&bds->shm_perm, &sds->shm_perm);
sds->shm_segsz = bds->shm_segsz;
sds->shm_lkcnt = 0;
sds->shm_lpid = bds->shm_lpid;
sds->shm_cpid = bds->shm_cpid;
sds->shm_amp = 0;
sds->shm_nattch = bds->shm_nattch;
sds->shm_cnattch = 0;
sds->shm_atime = bds->shm_atime;
sds->shm_dtime = bds->shm_dtime;
sds->shm_ctime = bds->shm_ctime;
}
static void
svr4_to_bsd_shmid_ds(sds, bds)
const struct svr4_shmid_ds *sds;
struct shmid_ds *bds;
{
svr4_to_bsd_ipc_perm(&sds->shm_perm, &bds->shm_perm);
bds->shm_segsz = sds->shm_segsz;
bds->shm_lpid = sds->shm_lpid;
bds->shm_cpid = sds->shm_cpid;
bds->shm_nattch = sds->shm_nattch;
bds->shm_atime = sds->shm_atime;
bds->shm_dtime = sds->shm_dtime;
bds->shm_ctime = sds->shm_ctime;
}
struct svr4_sys_shmat_args {
int what;
int shmid;
void * shmaddr;
int shmflg;
};
static int
svr4_shmat(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmat_args *uap = v;
struct shmat_args ap;
ap.shmid = uap->shmid;
ap.shmaddr = uap->shmaddr;
ap.shmflg = uap->shmflg;
- return shmat(td, &ap);
+ return sys_shmat(td, &ap);
}
struct svr4_sys_shmdt_args {
int what;
void * shmaddr;
};
static int
svr4_shmdt(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmdt_args *uap = v;
struct shmdt_args ap;
ap.shmaddr = uap->shmaddr;
- return shmdt(td, &ap);
+ return sys_shmdt(td, &ap);
}
struct svr4_sys_shmget_args {
int what;
key_t key;
int size;
int shmflg;
};
static int
svr4_shmget(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmget_args *uap = v;
struct shmget_args ap;
ap.key = uap->key;
ap.size = uap->size;
ap.shmflg = uap->shmflg;
- return shmget(td, &ap);
+ return sys_shmget(td, &ap);
}
struct svr4_sys_shmctl_args {
int what;
int shmid;
int cmd;
struct svr4_shmid_ds * buf;
};
int
svr4_shmctl(td, v)
struct thread *td;
void *v;
{
struct svr4_sys_shmctl_args *uap = v;
struct shmid_ds bs;
struct svr4_shmid_ds ss;
size_t bufsize;
int cmd, error;
if (uap->buf != NULL) {
switch (uap->cmd) {
case SVR4_IPC_SET:
case SVR4_SHM_LOCK:
case SVR4_SHM_UNLOCK:
error = copyin(uap->buf, &ss, sizeof(ss));
if (error)
return (error);
svr4_to_bsd_shmid_ds(&ss, &bs);
break;
default:
return (EINVAL);
}
}
switch (uap->cmd) {
case SVR4_IPC_STAT:
cmd = IPC_STAT;
break;
case SVR4_IPC_SET:
cmd = IPC_SET;
break;
case SVR4_IPC_RMID:
cmd = IPC_RMID;
break;
case SVR4_SHM_LOCK:
cmd = SHM_LOCK;
break;
case SVR4_SHM_UNLOCK:
cmd = SHM_UNLOCK;
break;
default:
return (EINVAL);
}
error = kern_shmctl(td, uap->shmid, cmd, &bs, &bufsize);
if (error)
return (error);
switch (uap->cmd) {
case SVR4_IPC_STAT:
if (uap->buf != NULL) {
bsd_to_svr4_shmid_ds(&bs, &ss);
error = copyout(&ss, uap->buf, sizeof(ss));
}
break;
}
return (error);
}
int
svr4_sys_shmsys(td, uap)
struct thread *td;
struct svr4_sys_shmsys_args *uap;
{
DPRINTF(("svr4_shmsys(%d)\n", uap->what));
switch (uap->what) {
case SVR4_shmat:
return svr4_shmat(td, uap);
case SVR4_shmdt:
return svr4_shmdt(td, uap);
case SVR4_shmget:
return svr4_shmget(td, uap);
case SVR4_shmctl:
return svr4_shmctl(td, uap);
default:
return ENOSYS;
}
}
MODULE_DEPEND(svr4elf, sysvshm, 1, 1, 1);
#endif /* SYSVSHM */
Index: head/sys/compat/svr4/svr4_misc.c
===================================================================
--- head/sys/compat/svr4/svr4_misc.c (revision 225616)
+++ head/sys/compat/svr4/svr4_misc.c (revision 225617)
@@ -1,1671 +1,1671 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* SVR4 compatibility module.
*
* SVR4 system calls that are implemented differently in BSD are
* handled here.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/msg.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sem.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/time.h>
#include <sys/times.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_sysconfig.h>
#include <compat/svr4/svr4_dirent.h>
#include <compat/svr4/svr4_acl.h>
#include <compat/svr4/svr4_ulimit.h>
#include <compat/svr4/svr4_statvfs.h>
#include <compat/svr4/svr4_hrt.h>
#include <compat/svr4/svr4_mman.h>
#include <compat/svr4/svr4_wait.h>
#include <security/mac/mac_framework.h>
#include <machine/vmparam.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_map.h>
#if defined(__FreeBSD__)
#include <vm/uma.h>
#include <vm/vm_extern.h>
#endif
#if defined(NetBSD)
# if defined(UVM)
# include <uvm/uvm_extern.h>
# endif
#endif
#define BSD_DIRENT(cp) ((struct dirent *)(cp))
static int svr4_mknod(struct thread *, register_t *, char *,
svr4_mode_t, svr4_dev_t);
static __inline clock_t timeval_to_clock_t(struct timeval *);
static int svr4_setinfo (pid_t , struct rusage *, int, svr4_siginfo_t *);
struct svr4_hrtcntl_args;
static int svr4_hrtcntl (struct thread *, struct svr4_hrtcntl_args *,
register_t *);
static void bsd_statfs_to_svr4_statvfs(const struct statfs *,
struct svr4_statvfs *);
static void bsd_statfs_to_svr4_statvfs64(const struct statfs *,
struct svr4_statvfs64 *);
static struct proc *svr4_pfind(pid_t pid);
/* BOGUS noop */
#if defined(BOGUS)
int
svr4_sys_setitimer(td, uap)
struct thread *td;
struct svr4_sys_setitimer_args *uap;
{
td->td_retval[0] = 0;
return 0;
}
#endif
int
svr4_sys_wait(td, uap)
struct thread *td;
struct svr4_sys_wait_args *uap;
{
int error, st, sig;
error = kern_wait(td, WAIT_ANY, &st, 0, NULL);
if (error)
return (error);
if (WIFSIGNALED(st)) {
sig = WTERMSIG(st);
if (sig >= 0 && sig < NSIG)
st = (st & ~0177) | SVR4_BSD2SVR4_SIG(sig);
} else if (WIFSTOPPED(st)) {
sig = WSTOPSIG(st);
if (sig >= 0 && sig < NSIG)
st = (st & ~0xff00) | (SVR4_BSD2SVR4_SIG(sig) << 8);
}
/*
* It looks like wait(2) on svr4/solaris/2.4 returns
* the status in retval[1], and the pid on retval[0].
*/
td->td_retval[1] = st;
if (uap->status)
error = copyout(&st, uap->status, sizeof(st));
return (error);
}
int
svr4_sys_execv(td, uap)
struct thread *td;
struct svr4_sys_execv_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
svr4_sys_execve(td, uap)
struct thread *td;
struct svr4_sys_execve_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp,
uap->envp);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
svr4_sys_time(td, v)
struct thread *td;
struct svr4_sys_time_args *v;
{
struct svr4_sys_time_args *uap = v;
int error = 0;
struct timeval tv;
microtime(&tv);
if (uap->t)
error = copyout(&tv.tv_sec, uap->t,
sizeof(*(uap->t)));
td->td_retval[0] = (int) tv.tv_sec;
return error;
}
/*
* Read SVR4-style directory entries. We suck them into kernel space so
* that they can be massaged before being copied out to user code.
*
* This code is ported from the Linux emulator: Changes to the VFS interface
* between FreeBSD and NetBSD have made it simpler to port it from there than
* to adapt the NetBSD version.
*/
int
svr4_sys_getdents64(td, uap)
struct thread *td;
struct svr4_sys_getdents64_args *uap;
{
struct dirent *bdp;
struct vnode *vp;
caddr_t inp, buf; /* BSD-format */
int len, reclen; /* BSD-format */
caddr_t outp; /* SVR4-format */
int resid, svr4reclen=0; /* SVR4-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
off_t off;
struct svr4_dirent64 svr4_dirent;
int buflen, error, eofflag, nbytes, justone, vfslocked;
u_long *cookies = NULL, *cookiep;
int ncookies;
DPRINTF(("svr4_sys_getdents64(%d, *, %d)\n",
uap->fd, uap->nbytes));
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0) {
return (error);
}
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
nbytes = uap->nbytes;
if (nbytes == 1) {
nbytes = sizeof (struct svr4_dirent64);
justone = 1;
}
else
justone = 0;
off = fp->f_offset;
#define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */
buflen = max(DIRBLKSIZ, nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
&ncookies, &cookies);
if (error) {
goto out;
}
inp = buf;
outp = (caddr_t) uap->dp;
resid = nbytes;
if ((len = buflen - auio.uio_resid) <= 0) {
goto eof;
}
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
bdp = (struct dirent *) inp;
len -= bdp->d_reclen;
inp += bdp->d_reclen;
cookiep++;
ncookies--;
}
}
while (len > 0) {
if (cookiep && ncookies == 0)
break;
bdp = (struct dirent *) inp;
reclen = bdp->d_reclen;
if (reclen & 3) {
DPRINTF(("svr4_readdir: reclen=%d\n", reclen));
error = EFAULT;
goto out;
}
if (bdp->d_fileno == 0) {
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
len -= reclen;
continue;
}
svr4reclen = SVR4_RECLEN(&svr4_dirent, bdp->d_namlen);
if (reclen > len || resid < svr4reclen) {
outp++;
break;
}
svr4_dirent.d_ino = (long) bdp->d_fileno;
if (justone) {
/*
* old svr4-style readdir usage.
*/
svr4_dirent.d_off = (svr4_off_t) svr4reclen;
svr4_dirent.d_reclen = (u_short) bdp->d_namlen;
} else {
svr4_dirent.d_off = (svr4_off_t)(off + reclen);
svr4_dirent.d_reclen = (u_short) svr4reclen;
}
strlcpy(svr4_dirent.d_name, bdp->d_name, sizeof(svr4_dirent.d_name));
if ((error = copyout((caddr_t)&svr4_dirent, outp, svr4reclen)))
goto out;
inp += reclen;
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
outp += svr4reclen;
resid -= svr4reclen;
len -= reclen;
if (justone)
break;
}
if (outp == (caddr_t) uap->dp)
goto again;
fp->f_offset = off;
if (justone)
nbytes = resid + svr4reclen;
eof:
td->td_retval[0] = nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookies)
free(cookies, M_TEMP);
free(buf, M_TEMP);
return error;
}
int
svr4_sys_getdents(td, uap)
struct thread *td;
struct svr4_sys_getdents_args *uap;
{
struct dirent *bdp;
struct vnode *vp;
caddr_t inp, buf; /* BSD-format */
int len, reclen; /* BSD-format */
caddr_t outp; /* SVR4-format */
int resid, svr4_reclen; /* SVR4-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
struct svr4_dirent idb;
off_t off; /* true file offset */
int buflen, error, eofflag, vfslocked;
u_long *cookiebuf = NULL, *cookie;
int ncookies = 0, *retval = td->td_retval;
if (uap->nbytes < 0)
return (EINVAL);
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
buflen = min(MAXBSIZE, uap->nbytes);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
off = fp->f_offset;
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
/*
* First we read into the malloc'ed buffer, then
* we massage it into user space, one record at a time.
*/
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies,
&cookiebuf);
if (error) {
goto out;
}
inp = buf;
outp = uap->buf;
resid = uap->nbytes;
if ((len = buflen - auio.uio_resid) == 0)
goto eof;
for (cookie = cookiebuf; len > 0; len -= reclen) {
bdp = (struct dirent *)inp;
reclen = bdp->d_reclen;
if (reclen & 3)
panic("svr4_sys_getdents64: bad reclen");
if (cookie)
off = *cookie++; /* each entry points to the next */
else
off += reclen;
if ((off >> 32) != 0) {
uprintf("svr4_sys_getdents64: dir offset too large for emulated program");
error = EINVAL;
goto out;
}
if (bdp->d_fileno == 0) {
inp += reclen; /* it is a hole; squish it out */
continue;
}
svr4_reclen = SVR4_RECLEN(&idb, bdp->d_namlen);
if (reclen > len || resid < svr4_reclen) {
/* entry too big for buffer, so just stop */
outp++;
break;
}
/*
* Massage in place to make a SVR4-shaped dirent (otherwise
* we have to worry about touching user memory outside of
* the copyout() call).
*/
idb.d_ino = (svr4_ino_t)bdp->d_fileno;
idb.d_off = (svr4_off_t)off;
idb.d_reclen = (u_short)svr4_reclen;
strlcpy(idb.d_name, bdp->d_name, sizeof(idb.d_name));
if ((error = copyout((caddr_t)&idb, outp, svr4_reclen)))
goto out;
/* advance past this real entry */
inp += reclen;
/* advance output past SVR4-shaped entry */
outp += svr4_reclen;
resid -= svr4_reclen;
}
/* if we squished out the whole block, try again */
if (outp == uap->buf)
goto again;
fp->f_offset = off; /* update the vnode offset */
eof:
*retval = uap->nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookiebuf)
free(cookiebuf, M_TEMP);
free(buf, M_TEMP);
return error;
}
int
svr4_sys_mmap(td, uap)
struct thread *td;
struct svr4_sys_mmap_args *uap;
{
struct mmap_args mm;
int *retval;
retval = td->td_retval;
#define _MAP_NEW 0x80000000
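/*
 * _MAP_NEW appears to be the old SunOS/SVR4 flag requesting the "new"
 * mmap() interface that returns the mapped address; FreeBSD always
 * behaves that way, so the flag is simply stripped below.
 */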
/*
* Verify the arguments.
*/
if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
return EINVAL; /* XXX still needed? */
if (uap->len == 0)
return EINVAL;
mm.prot = uap->prot;
mm.len = uap->len;
mm.flags = uap->flags & ~_MAP_NEW;
mm.fd = uap->fd;
mm.addr = uap->addr;
mm.pos = uap->pos;
- return mmap(td, &mm);
+ return sys_mmap(td, &mm);
}
int
svr4_sys_mmap64(td, uap)
struct thread *td;
struct svr4_sys_mmap64_args *uap;
{
struct mmap_args mm;
void *rp;
#define _MAP_NEW 0x80000000
/*
* Verify the arguments.
*/
if (uap->prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC))
return EINVAL; /* XXX still needed? */
if (uap->len == 0)
return EINVAL;
mm.prot = uap->prot;
mm.len = uap->len;
mm.flags = uap->flags & ~_MAP_NEW;
mm.fd = uap->fd;
mm.addr = uap->addr;
mm.pos = uap->pos;
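/*
 * For non-MAP_FIXED requests whose hint falls below the top of the
 * data segment, move the hint above vm_daddr + maxdsiz so the mapping
 * cannot collide with future brk() growth.
 */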
rp = (void *) round_page((vm_offset_t)(td->td_proc->p_vmspace->vm_daddr + maxdsiz));
if ((mm.flags & MAP_FIXED) == 0 &&
mm.addr != 0 && (void *)mm.addr < rp)
mm.addr = rp;
- return mmap(td, &mm);
+ return sys_mmap(td, &mm);
}
int
svr4_sys_fchroot(td, uap)
struct thread *td;
struct svr4_sys_fchroot_args *uap;
{
struct filedesc *fdp = td->td_proc->p_fd;
struct vnode *vp;
struct file *fp;
int error, vfslocked;
if ((error = priv_check(td, PRIV_VFS_FCHROOT)) != 0)
return error;
/* XXX: we have the chroot priv... what cap might we need? all? */
if ((error = getvnode(fdp, uap->fd, 0, &fp)) != 0)
return error;
vp = fp->f_vnode;
VREF(vp);
fdrop(fp, td);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = change_dir(vp, td);
if (error)
goto fail;
#ifdef MAC
error = mac_vnode_check_chroot(td->td_ucred, vp);
if (error)
goto fail;
#endif
VOP_UNLOCK(vp, 0);
error = change_root(vp, td);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
fail:
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
static int
svr4_mknod(td, retval, path, mode, dev)
struct thread *td;
register_t *retval;
char *path;
svr4_mode_t mode;
svr4_dev_t dev;
{
char *newpath;
int error;
CHECKALTEXIST(td, path, &newpath);
if (S_ISFIFO(mode))
error = kern_mkfifo(td, newpath, UIO_SYSSPACE, mode);
else
error = kern_mknod(td, newpath, UIO_SYSSPACE, mode, dev);
free(newpath, M_TEMP);
return (error);
}
int
svr4_sys_mknod(td, uap)
struct thread *td;
struct svr4_sys_mknod_args *uap;
{
int *retval = td->td_retval;
return svr4_mknod(td, retval,
uap->path, uap->mode,
(svr4_dev_t)svr4_to_bsd_odev_t(uap->dev));
}
int
svr4_sys_xmknod(td, uap)
struct thread *td;
struct svr4_sys_xmknod_args *uap;
{
int *retval = td->td_retval;
return svr4_mknod(td, retval,
uap->path, uap->mode,
(svr4_dev_t)svr4_to_bsd_dev_t(uap->dev));
}
int
svr4_sys_vhangup(td, uap)
struct thread *td;
struct svr4_sys_vhangup_args *uap;
{
return 0;
}
int
svr4_sys_sysconfig(td, uap)
struct thread *td;
struct svr4_sys_sysconfig_args *uap;
{
int *retval;
retval = &(td->td_retval[0]);
switch (uap->name) {
case SVR4_CONFIG_NGROUPS:
*retval = ngroups_max;
break;
case SVR4_CONFIG_CHILD_MAX:
*retval = maxproc;
break;
case SVR4_CONFIG_OPEN_FILES:
*retval = maxfiles;
break;
case SVR4_CONFIG_POSIX_VER:
*retval = 198808;
break;
case SVR4_CONFIG_PAGESIZE:
*retval = PAGE_SIZE;
break;
case SVR4_CONFIG_CLK_TCK:
*retval = 60; /* should this be `hz', ie. 100? */
break;
case SVR4_CONFIG_XOPEN_VER:
*retval = 2; /* XXX: What should that be? */
break;
case SVR4_CONFIG_PROF_TCK:
*retval = 60; /* XXX: What should that be? */
break;
case SVR4_CONFIG_NPROC_CONF:
*retval = 1; /* Only one processor for now */
break;
case SVR4_CONFIG_NPROC_ONLN:
*retval = 1; /* And it better be online */
break;
case SVR4_CONFIG_AIO_LISTIO_MAX:
case SVR4_CONFIG_AIO_MAX:
case SVR4_CONFIG_AIO_PRIO_DELTA_MAX:
*retval = 0; /* No aio support */
break;
case SVR4_CONFIG_DELAYTIMER_MAX:
*retval = 0; /* No delaytimer support */
break;
case SVR4_CONFIG_MQ_OPEN_MAX:
*retval = msginfo.msgmni;
break;
case SVR4_CONFIG_MQ_PRIO_MAX:
*retval = 0; /* XXX: Don't know */
break;
case SVR4_CONFIG_RTSIG_MAX:
*retval = 0;
break;
case SVR4_CONFIG_SEM_NSEMS_MAX:
*retval = seminfo.semmni;
break;
case SVR4_CONFIG_SEM_VALUE_MAX:
*retval = seminfo.semvmx;
break;
case SVR4_CONFIG_SIGQUEUE_MAX:
*retval = 0; /* XXX: Don't know */
break;
case SVR4_CONFIG_SIGRT_MIN:
case SVR4_CONFIG_SIGRT_MAX:
*retval = 0; /* No real time signals */
break;
case SVR4_CONFIG_TIMER_MAX:
*retval = 3; /* XXX: real, virtual, profiling */
break;
#if defined(NOTYET)
case SVR4_CONFIG_PHYS_PAGES:
#if defined(UVM)
*retval = uvmexp.free; /* XXX: free instead of total */
#else
*retval = cnt.v_free_count; /* XXX: free instead of total */
#endif
break;
case SVR4_CONFIG_AVPHYS_PAGES:
#if defined(UVM)
*retval = uvmexp.active; /* XXX: active instead of avg */
#else
*retval = cnt.v_active_count; /* XXX: active instead of avg */
#endif
break;
#endif /* NOTYET */
case SVR4_CONFIG_COHERENCY:
*retval = 0; /* XXX */
break;
case SVR4_CONFIG_SPLIT_CACHE:
*retval = 0; /* XXX */
break;
case SVR4_CONFIG_ICACHESZ:
*retval = 256; /* XXX */
break;
case SVR4_CONFIG_DCACHESZ:
*retval = 256; /* XXX */
break;
case SVR4_CONFIG_ICACHELINESZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_DCACHELINESZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_ICACHEBLKSZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_DCACHEBLKSZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_DCACHETBLKSZ:
*retval = 64; /* XXX */
break;
case SVR4_CONFIG_ICACHE_ASSOC:
*retval = 1; /* XXX */
break;
case SVR4_CONFIG_DCACHE_ASSOC:
*retval = 1; /* XXX */
break;
case SVR4_CONFIG_MAXPID:
*retval = PID_MAX;
break;
case SVR4_CONFIG_STACK_PROT:
*retval = PROT_READ|PROT_WRITE|PROT_EXEC;
break;
default:
return EINVAL;
}
return 0;
}
/* ARGSUSED */
int
svr4_sys_break(td, uap)
struct thread *td;
struct svr4_sys_break_args *uap;
{
struct obreak_args ap;
ap.nsize = uap->nsize;
- return (obreak(td, &ap));
+ return (sys_obreak(td, &ap));
}
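/*
 * Convert a timeval to SVR4 clock ticks: whole seconds scale by hz and
 * the microseconds are divided by the tick length in microseconds.
 * For example, with hz = 100 a timeval of { 2, 500000 } becomes
 * 2 * 100 + 500000 / (1000000 / 100) = 250 ticks.
 */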
static __inline clock_t
timeval_to_clock_t(tv)
struct timeval *tv;
{
return tv->tv_sec * hz + tv->tv_usec / (1000000 / hz);
}
int
svr4_sys_times(td, uap)
struct thread *td;
struct svr4_sys_times_args *uap;
{
struct timeval tv, utime, stime, cutime, cstime;
struct tms tms;
struct proc *p;
int error;
p = td->td_proc;
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &utime, &stime);
PROC_SUNLOCK(p);
calccru(p, &cutime, &cstime);
PROC_UNLOCK(p);
tms.tms_utime = timeval_to_clock_t(&utime);
tms.tms_stime = timeval_to_clock_t(&stime);
tms.tms_cutime = timeval_to_clock_t(&cutime);
tms.tms_cstime = timeval_to_clock_t(&cstime);
error = copyout(&tms, uap->tp, sizeof(tms));
if (error)
return (error);
microtime(&tv);
td->td_retval[0] = (int)timeval_to_clock_t(&tv);
return (0);
}
int
svr4_sys_ulimit(td, uap)
struct thread *td;
struct svr4_sys_ulimit_args *uap;
{
int *retval = td->td_retval;
int error;
switch (uap->cmd) {
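/*
 * SVR4 ulimit(2) reports and sets the file size limit in 512-byte
 * blocks, hence the scaling of RLIMIT_FSIZE by 512 in the GFILLIM and
 * SFILLIM cases below.
 */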
case SVR4_GFILLIM:
PROC_LOCK(td->td_proc);
*retval = lim_cur(td->td_proc, RLIMIT_FSIZE) / 512;
PROC_UNLOCK(td->td_proc);
if (*retval == -1)
*retval = 0x7fffffff;
return 0;
case SVR4_SFILLIM:
{
struct rlimit krl;
krl.rlim_cur = uap->newlimit * 512;
PROC_LOCK(td->td_proc);
krl.rlim_max = lim_max(td->td_proc, RLIMIT_FSIZE);
PROC_UNLOCK(td->td_proc);
error = kern_setrlimit(td, RLIMIT_FSIZE, &krl);
if (error)
return error;
PROC_LOCK(td->td_proc);
*retval = lim_cur(td->td_proc, RLIMIT_FSIZE);
PROC_UNLOCK(td->td_proc);
if (*retval == -1)
*retval = 0x7fffffff;
return 0;
}
case SVR4_GMEMLIM:
{
struct vmspace *vm = td->td_proc->p_vmspace;
register_t r;
PROC_LOCK(td->td_proc);
r = lim_cur(td->td_proc, RLIMIT_DATA);
PROC_UNLOCK(td->td_proc);
if (r == -1)
r = 0x7fffffff;
r += (long) vm->vm_daddr;
if (r < 0)
r = 0x7fffffff;
*retval = r;
return 0;
}
case SVR4_GDESLIM:
PROC_LOCK(td->td_proc);
*retval = lim_cur(td->td_proc, RLIMIT_NOFILE);
PROC_UNLOCK(td->td_proc);
if (*retval == -1)
*retval = 0x7fffffff;
return 0;
default:
return EINVAL;
}
}
static struct proc *
svr4_pfind(pid)
pid_t pid;
{
struct proc *p;
/* look in the live processes */
if ((p = pfind(pid)) == NULL)
/* look in the zombies */
p = zpfind(pid);
return p;
}
int
svr4_sys_pgrpsys(td, uap)
struct thread *td;
struct svr4_sys_pgrpsys_args *uap;
{
int *retval = td->td_retval;
struct proc *p = td->td_proc;
switch (uap->cmd) {
case 1: /* setpgrp() */
/*
* SVR4 setpgrp() (which takes no arguments) has the
* semantics that the session ID is also created anew, so
* in almost every sense, setpgrp() is identical to
* setsid() for SVR4. (Under BSD, the difference is that
* a setpgid(0,0) will not create a new session.)
*/
- setsid(td, NULL);
+ sys_setsid(td, NULL);
/*FALLTHROUGH*/
case 0: /* getpgrp() */
PROC_LOCK(p);
*retval = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
case 2: /* getsid(pid) */
if (uap->pid == 0)
PROC_LOCK(p);
else if ((p = svr4_pfind(uap->pid)) == NULL)
return ESRCH;
/*
* This has already been initialized to the pid of
* the session leader.
*/
*retval = (register_t) p->p_session->s_sid;
PROC_UNLOCK(p);
return 0;
case 3: /* setsid() */
- return setsid(td, NULL);
+ return sys_setsid(td, NULL);
case 4: /* getpgid(pid) */
if (uap->pid == 0)
PROC_LOCK(p);
else if ((p = svr4_pfind(uap->pid)) == NULL)
return ESRCH;
*retval = (int) p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
case 5: /* setpgid(pid, pgid); */
{
struct setpgid_args sa;
sa.pid = uap->pid;
sa.pgid = uap->pgid;
- return setpgid(td, &sa);
+ return sys_setpgid(td, &sa);
}
default:
return EINVAL;
}
}
struct svr4_hrtcntl_args {
int cmd;
int fun;
int clk;
svr4_hrt_interval_t * iv;
svr4_hrt_time_t * ti;
};
static int
svr4_hrtcntl(td, uap, retval)
struct thread *td;
struct svr4_hrtcntl_args *uap;
register_t *retval;
{
switch (uap->fun) {
case SVR4_HRT_CNTL_RES:
DPRINTF(("htrcntl(RES)\n"));
*retval = SVR4_HRT_USEC;
return 0;
case SVR4_HRT_CNTL_TOFD:
DPRINTF(("htrcntl(TOFD)\n"));
{
struct timeval tv;
svr4_hrt_time_t t;
if (uap->clk != SVR4_HRT_CLK_STD) {
DPRINTF(("clk == %d\n", uap->clk));
return EINVAL;
}
if (uap->ti == NULL) {
DPRINTF(("ti NULL\n"));
return EINVAL;
}
microtime(&tv);
t.h_sec = tv.tv_sec;
t.h_rem = tv.tv_usec;
t.h_res = SVR4_HRT_USEC;
return copyout(&t, uap->ti, sizeof(t));
}
case SVR4_HRT_CNTL_START:
DPRINTF(("htrcntl(START)\n"));
return ENOSYS;
case SVR4_HRT_CNTL_GET:
DPRINTF(("htrcntl(GET)\n"));
return ENOSYS;
default:
DPRINTF(("Bad htrcntl command %d\n", uap->fun));
return ENOSYS;
}
}
int
svr4_sys_hrtsys(td, uap)
struct thread *td;
struct svr4_sys_hrtsys_args *uap;
{
int *retval = td->td_retval;
switch (uap->cmd) {
case SVR4_HRT_CNTL:
return svr4_hrtcntl(td, (struct svr4_hrtcntl_args *) uap,
retval);
case SVR4_HRT_ALRM:
DPRINTF(("hrtalarm\n"));
return ENOSYS;
case SVR4_HRT_SLP:
DPRINTF(("hrtsleep\n"));
return ENOSYS;
case SVR4_HRT_CAN:
DPRINTF(("hrtcancel\n"));
return ENOSYS;
default:
DPRINTF(("Bad hrtsys command %d\n", uap->cmd));
return EINVAL;
}
}
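/*
 * Build an SVR4 SIGCHLD siginfo from a BSD wait status and optional
 * resource usage, then copy it out to the user-supplied buffer.
 */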
static int
svr4_setinfo(pid, ru, st, s)
pid_t pid;
struct rusage *ru;
int st;
svr4_siginfo_t *s;
{
svr4_siginfo_t i;
int sig;
memset(&i, 0, sizeof(i));
i.svr4_si_signo = SVR4_SIGCHLD;
i.svr4_si_errno = 0; /* XXX? */
i.svr4_si_pid = pid;
if (ru) {
i.svr4_si_stime = ru->ru_stime.tv_sec;
i.svr4_si_utime = ru->ru_utime.tv_sec;
}
if (WIFEXITED(st)) {
i.svr4_si_status = WEXITSTATUS(st);
i.svr4_si_code = SVR4_CLD_EXITED;
} else if (WIFSTOPPED(st)) {
sig = WSTOPSIG(st);
if (sig >= 0 && sig < NSIG)
i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig);
if (i.svr4_si_status == SVR4_SIGCONT)
i.svr4_si_code = SVR4_CLD_CONTINUED;
else
i.svr4_si_code = SVR4_CLD_STOPPED;
} else {
sig = WTERMSIG(st);
if (sig >= 0 && sig < NSIG)
i.svr4_si_status = SVR4_BSD2SVR4_SIG(sig);
if (WCOREDUMP(st))
i.svr4_si_code = SVR4_CLD_DUMPED;
else
i.svr4_si_code = SVR4_CLD_KILLED;
}
DPRINTF(("siginfo [pid %ld signo %d code %d errno %d status %d]\n",
i.svr4_si_pid, i.svr4_si_signo, i.svr4_si_code, i.svr4_si_errno,
i.svr4_si_status));
return copyout(&i, s, sizeof(i));
}
int
svr4_sys_waitsys(td, uap)
struct thread *td;
struct svr4_sys_waitsys_args *uap;
{
struct rusage ru;
pid_t pid;
int nfound, status;
int error, *retval = td->td_retval;
struct proc *p, *q;
DPRINTF(("waitsys(%d, %d, %p, %x)\n",
uap->grp, uap->id,
uap->info, uap->options));
q = td->td_proc;
switch (uap->grp) {
case SVR4_P_PID:
pid = uap->id;
break;
case SVR4_P_PGID:
PROC_LOCK(q);
pid = -q->p_pgid;
PROC_UNLOCK(q);
break;
case SVR4_P_ALL:
pid = WAIT_ANY;
break;
default:
return EINVAL;
}
/* Hand off the easy cases to kern_wait(). */
if (!(uap->options & (SVR4_WNOWAIT)) &&
(uap->options & (SVR4_WEXITED | SVR4_WTRAPPED))) {
int options;
options = 0;
if (uap->options & SVR4_WSTOPPED)
options |= WUNTRACED;
if (uap->options & SVR4_WCONTINUED)
options |= WCONTINUED;
if (uap->options & SVR4_WNOHANG)
options |= WNOHANG;
error = kern_wait(td, pid, &status, options, &ru);
if (error)
return (error);
if (uap->options & SVR4_WNOHANG && *retval == 0)
error = svr4_setinfo(*retval, NULL, 0, uap->info);
else
error = svr4_setinfo(*retval, &ru, status, uap->info);
*retval = 0;
return (error);
}
/*
* Ok, handle the weird cases. Either WNOWAIT is set (meaning we
* just want to see if there is a process to harvest, we don't
* want to actually harvest it), or WEXITED and WTRAPPED are clear
* meaning we want to ignore zombies. Either way, we don't have
* to handle harvesting zombies here. We do have to duplicate the
* other portions of kern_wait() though, especially for WCONTINUED
* and WSTOPPED.
*/
loop:
nfound = 0;
sx_slock(&proctree_lock);
LIST_FOREACH(p, &q->p_children, p_sibling) {
PROC_LOCK(p);
if (pid != WAIT_ANY &&
p->p_pid != pid && p->p_pgid != -pid) {
PROC_UNLOCK(p);
DPRINTF(("pid %d pgid %d != %d\n", p->p_pid,
p->p_pgid, pid));
continue;
}
if (p_canwait(td, p)) {
PROC_UNLOCK(p);
continue;
}
nfound++;
PROC_SLOCK(p);
/*
* See if we have a zombie. If so, WNOWAIT should be set,
* as otherwise we should have called kern_wait() up above.
*/
if ((p->p_state == PRS_ZOMBIE) &&
((uap->options & (SVR4_WEXITED|SVR4_WTRAPPED)))) {
PROC_SUNLOCK(p);
KASSERT(uap->options & SVR4_WNOWAIT,
("WNOWAIT is clear"));
/* Found a zombie, so cache info in local variables. */
pid = p->p_pid;
status = p->p_xstat;
ru = p->p_ru;
PROC_SLOCK(p);
calcru(p, &ru.ru_utime, &ru.ru_stime);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
sx_sunlock(&proctree_lock);
/* Copy the info out to userland. */
*retval = 0;
DPRINTF(("found %d\n", pid));
return (svr4_setinfo(pid, &ru, status, uap->info));
}
/*
* See if we have a stopped or continued process.
* XXX: This duplicates the same code in kern_wait().
*/
if ((p->p_flag & P_STOPPED_SIG) &&
(p->p_suspcount == p->p_numthreads) &&
(p->p_flag & P_WAITED) == 0 &&
(p->p_flag & P_TRACED || uap->options & SVR4_WSTOPPED)) {
PROC_SUNLOCK(p);
if (((uap->options & SVR4_WNOWAIT)) == 0)
p->p_flag |= P_WAITED;
sx_sunlock(&proctree_lock);
pid = p->p_pid;
status = W_STOPCODE(p->p_xstat);
ru = p->p_ru;
PROC_SLOCK(p);
calcru(p, &ru.ru_utime, &ru.ru_stime);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
if (((uap->options & SVR4_WNOWAIT)) == 0) {
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
}
*retval = 0;
DPRINTF(("jobcontrol %d\n", pid));
return (svr4_setinfo(pid, &ru, status, uap->info));
}
PROC_SUNLOCK(p);
if (uap->options & SVR4_WCONTINUED &&
(p->p_flag & P_CONTINUED)) {
sx_sunlock(&proctree_lock);
if (((uap->options & SVR4_WNOWAIT)) == 0)
p->p_flag &= ~P_CONTINUED;
pid = p->p_pid;
ru = p->p_ru;
status = SIGCONT;
PROC_SLOCK(p);
calcru(p, &ru.ru_utime, &ru.ru_stime);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
if (((uap->options & SVR4_WNOWAIT)) == 0) {
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
}
*retval = 0;
DPRINTF(("jobcontrol %d\n", pid));
return (svr4_setinfo(pid, &ru, status, uap->info));
}
PROC_UNLOCK(p);
}
if (nfound == 0) {
sx_sunlock(&proctree_lock);
return (ECHILD);
}
if (uap->options & SVR4_WNOHANG) {
sx_sunlock(&proctree_lock);
*retval = 0;
return (svr4_setinfo(0, NULL, 0, uap->info));
}
PROC_LOCK(q);
sx_sunlock(&proctree_lock);
if (q->p_flag & P_STATCHILD) {
q->p_flag &= ~P_STATCHILD;
error = 0;
} else
error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "svr4_wait", 0);
PROC_UNLOCK(q);
if (error)
return error;
goto loop;
}
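/*
 * Translate a BSD statfs into an SVR4 statvfs.  The block sizes are
 * approximate: statvfs f_bsize (preferred I/O size) is taken from the
 * BSD f_iosize and f_frsize (allocation unit) from the BSD f_bsize,
 * and f_favail is faked with f_ffree since BSD keeps no separate
 * count of inodes available to unprivileged users.
 */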
static void
bsd_statfs_to_svr4_statvfs(bfs, sfs)
const struct statfs *bfs;
struct svr4_statvfs *sfs;
{
sfs->f_bsize = bfs->f_iosize; /* XXX */
sfs->f_frsize = bfs->f_bsize;
sfs->f_blocks = bfs->f_blocks;
sfs->f_bfree = bfs->f_bfree;
sfs->f_bavail = bfs->f_bavail;
sfs->f_files = bfs->f_files;
sfs->f_ffree = bfs->f_ffree;
sfs->f_favail = bfs->f_ffree;
sfs->f_fsid = bfs->f_fsid.val[0];
memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype));
sfs->f_flag = 0;
if (bfs->f_flags & MNT_RDONLY)
sfs->f_flag |= SVR4_ST_RDONLY;
if (bfs->f_flags & MNT_NOSUID)
sfs->f_flag |= SVR4_ST_NOSUID;
sfs->f_namemax = MAXNAMLEN;
memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */
memset(sfs->f_filler, 0, sizeof(sfs->f_filler));
}
static void
bsd_statfs_to_svr4_statvfs64(bfs, sfs)
const struct statfs *bfs;
struct svr4_statvfs64 *sfs;
{
sfs->f_bsize = bfs->f_iosize; /* XXX */
sfs->f_frsize = bfs->f_bsize;
sfs->f_blocks = bfs->f_blocks;
sfs->f_bfree = bfs->f_bfree;
sfs->f_bavail = bfs->f_bavail;
sfs->f_files = bfs->f_files;
sfs->f_ffree = bfs->f_ffree;
sfs->f_favail = bfs->f_ffree;
sfs->f_fsid = bfs->f_fsid.val[0];
memcpy(sfs->f_basetype, bfs->f_fstypename, sizeof(sfs->f_basetype));
sfs->f_flag = 0;
if (bfs->f_flags & MNT_RDONLY)
sfs->f_flag |= SVR4_ST_RDONLY;
if (bfs->f_flags & MNT_NOSUID)
sfs->f_flag |= SVR4_ST_NOSUID;
sfs->f_namemax = MAXNAMLEN;
memcpy(sfs->f_fstr, bfs->f_fstypename, sizeof(sfs->f_fstr)); /* XXX */
memset(sfs->f_filler, 0, sizeof(sfs->f_filler));
}
int
svr4_sys_statvfs(td, uap)
struct thread *td;
struct svr4_sys_statvfs_args *uap;
{
struct svr4_statvfs sfs;
struct statfs bfs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_statfs(td, path, UIO_SYSSPACE, &bfs);
free(path, M_TEMP);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_fstatvfs(td, uap)
struct thread *td;
struct svr4_sys_fstatvfs_args *uap;
{
struct svr4_statvfs sfs;
struct statfs bfs;
int error;
error = kern_fstatfs(td, uap->fd, &bfs);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_statvfs64(td, uap)
struct thread *td;
struct svr4_sys_statvfs64_args *uap;
{
struct svr4_statvfs64 sfs;
struct statfs bfs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_statfs(td, path, UIO_SYSSPACE, &bfs);
free(path, M_TEMP);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_fstatvfs64(td, uap)
struct thread *td;
struct svr4_sys_fstatvfs64_args *uap;
{
struct svr4_statvfs64 sfs;
struct statfs bfs;
int error;
error = kern_fstatfs(td, uap->fd, &bfs);
if (error)
return (error);
bsd_statfs_to_svr4_statvfs64(&bfs, &sfs);
return copyout(&sfs, uap->fs, sizeof(sfs));
}
int
svr4_sys_alarm(td, uap)
struct thread *td;
struct svr4_sys_alarm_args *uap;
{
struct itimerval itv, oitv;
int error;
timevalclear(&itv.it_interval);
itv.it_value.tv_sec = uap->sec;
itv.it_value.tv_usec = 0;
error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv);
if (error)
return (error);
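/*
 * Round a fractional second up so the time remaining on the previous
 * timer is never under-reported to the caller.
 */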
if (oitv.it_value.tv_usec != 0)
oitv.it_value.tv_sec++;
td->td_retval[0] = oitv.it_value.tv_sec;
return (0);
}
int
svr4_sys_gettimeofday(td, uap)
struct thread *td;
struct svr4_sys_gettimeofday_args *uap;
{
if (uap->tp) {
struct timeval atv;
microtime(&atv);
return copyout(&atv, uap->tp, sizeof (atv));
}
return 0;
}
int
svr4_sys_facl(td, uap)
struct thread *td;
struct svr4_sys_facl_args *uap;
{
int *retval;
retval = td->td_retval;
*retval = 0;
switch (uap->cmd) {
case SVR4_SYS_SETACL:
/* We don't support acls on any filesystem */
return ENOSYS;
case SVR4_SYS_GETACL:
return copyout(retval, &uap->num,
sizeof(uap->num));
case SVR4_SYS_GETACLCNT:
return 0;
default:
return EINVAL;
}
}
int
svr4_sys_acl(td, uap)
struct thread *td;
struct svr4_sys_acl_args *uap;
{
/* XXX: for now the same */
return svr4_sys_facl(td, (struct svr4_sys_facl_args *)uap);
}
int
svr4_sys_auditsys(td, uap)
struct thread *td;
struct svr4_sys_auditsys_args *uap;
{
/*
* XXX: Big brother is *not* watching.
*/
return 0;
}
int
svr4_sys_memcntl(td, uap)
struct thread *td;
struct svr4_sys_memcntl_args *uap;
{
switch (uap->cmd) {
case SVR4_MC_SYNC:
{
struct msync_args msa;
msa.addr = uap->addr;
msa.len = uap->len;
msa.flags = (int)uap->arg;
- return msync(td, &msa);
+ return sys_msync(td, &msa);
}
case SVR4_MC_ADVISE:
{
struct madvise_args maa;
maa.addr = uap->addr;
maa.len = uap->len;
maa.behav = (int)uap->arg;
- return madvise(td, &maa);
+ return sys_madvise(td, &maa);
}
case SVR4_MC_LOCK:
case SVR4_MC_UNLOCK:
case SVR4_MC_LOCKAS:
case SVR4_MC_UNLOCKAS:
return EOPNOTSUPP;
default:
return ENOSYS;
}
}
int
svr4_sys_nice(td, uap)
struct thread *td;
struct svr4_sys_nice_args *uap;
{
struct setpriority_args ap;
int error;
ap.which = PRIO_PROCESS;
ap.who = 0;
ap.prio = uap->prio;
- if ((error = setpriority(td, &ap)) != 0)
+ if ((error = sys_setpriority(td, &ap)) != 0)
return error;
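/*
 * SVR4 nice() returns the new nice value, so read it back with
 * getpriority(); the result is left in td_retval by the call below.
 */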
/* the cast is stupid, but the structures are the same */
- if ((error = getpriority(td, (struct getpriority_args *)&ap)) != 0)
+ if ((error = sys_getpriority(td, (struct getpriority_args *)&ap)) != 0)
return error;
return 0;
}
int
svr4_sys_resolvepath(td, uap)
struct thread *td;
struct svr4_sys_resolvepath_args *uap;
{
struct nameidata nd;
int error, *retval = td->td_retval;
unsigned int ncopy;
NDINIT(&nd, LOOKUP, NOFOLLOW | SAVENAME | MPSAFE, UIO_USERSPACE,
uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_NO_FREE_PNBUF);
VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
ncopy = min(uap->bufsiz, strlen(nd.ni_cnd.cn_pnbuf) + 1);
if ((error = copyout(nd.ni_cnd.cn_pnbuf, uap->buf, ncopy)) != 0)
goto bad;
*retval = ncopy;
bad:
NDFREE(&nd, NDF_ONLY_PNBUF);
return error;
}
Index: head/sys/compat/svr4/svr4_signal.c
===================================================================
--- head/sys/compat/svr4/svr4_signal.c (revision 225616)
+++ head/sys/compat/svr4/svr4_signal.c (revision 225617)
@@ -1,577 +1,577 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <machine/cpu.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_ucontext.h>
#define svr4_sigmask(n) (1 << (((n) - 1) & 31))
#define svr4_sigword(n) (((n) - 1) >> 5)
#define svr4_sigemptyset(s) memset((s), 0, sizeof(*(s)))
#define svr4_sigismember(s, n) ((s)->bits[svr4_sigword(n)] & svr4_sigmask(n))
#define svr4_sigaddset(s, n) ((s)->bits[svr4_sigword(n)] |= svr4_sigmask(n))
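/*
 * The set is an array of 32-bit words indexed from signal 1: for
 * example svr4_sigword(1) == 0 with svr4_sigmask(1) == 0x1, and
 * svr4_sigword(32) == 0 with svr4_sigmask(32) == 1 << 31.
 */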
void svr4_to_bsd_sigaction(const struct svr4_sigaction *, struct sigaction *);
void bsd_to_svr4_sigaction(const struct sigaction *, struct svr4_sigaction *);
void svr4_sigfillset(svr4_sigset_t *);
int bsd_to_svr4_sig[SVR4_NSIG] = {
0,
SVR4_SIGHUP,
SVR4_SIGINT,
SVR4_SIGQUIT,
SVR4_SIGILL,
SVR4_SIGTRAP,
SVR4_SIGABRT,
SVR4_SIGEMT,
SVR4_SIGFPE,
SVR4_SIGKILL,
SVR4_SIGBUS,
SVR4_SIGSEGV,
SVR4_SIGSYS,
SVR4_SIGPIPE,
SVR4_SIGALRM,
SVR4_SIGTERM,
SVR4_SIGURG,
SVR4_SIGSTOP,
SVR4_SIGTSTP,
SVR4_SIGCONT,
SVR4_SIGCHLD,
SVR4_SIGTTIN,
SVR4_SIGTTOU,
SVR4_SIGIO,
SVR4_SIGXCPU,
SVR4_SIGXFSZ,
SVR4_SIGVTALRM,
SVR4_SIGPROF,
SVR4_SIGWINCH,
0, /* SIGINFO */
SVR4_SIGUSR1,
SVR4_SIGUSR2,
};
int svr4_to_bsd_sig[SVR4_NSIG] = {
0,
SIGHUP,
SIGINT,
SIGQUIT,
SIGILL,
SIGTRAP,
SIGABRT,
SIGEMT,
SIGFPE,
SIGKILL,
SIGBUS,
SIGSEGV,
SIGSYS,
SIGPIPE,
SIGALRM,
SIGTERM,
SIGUSR1,
SIGUSR2,
SIGCHLD,
0, /* XXX NetBSD uses SIGPWR here, but we don't seem to have one */
SIGWINCH,
SIGURG,
SIGIO,
SIGSTOP,
SIGTSTP,
SIGCONT,
SIGTTIN,
SIGTTOU,
SIGVTALRM,
SIGPROF,
SIGXCPU,
SIGXFSZ,
};
void
svr4_sigfillset(s)
svr4_sigset_t *s;
{
int i;
svr4_sigemptyset(s);
for (i = 1; i < SVR4_NSIG; i++)
if (svr4_to_bsd_sig[i] != 0)
svr4_sigaddset(s, i);
}
void
svr4_to_bsd_sigset(sss, bss)
const svr4_sigset_t *sss;
sigset_t *bss;
{
int i, newsig;
SIGEMPTYSET(*bss);
for (i = 1; i < SVR4_NSIG; i++)
if (svr4_sigismember(sss, i)) {
newsig = svr4_to_bsd_sig[i];
if (newsig)
SIGADDSET(*bss, newsig);
}
}
void
bsd_to_svr4_sigset(bss, sss)
const sigset_t *bss;
svr4_sigset_t *sss;
{
int i, newsig;
svr4_sigemptyset(sss);
for (i = 1; i < SVR4_NSIG; i++) {
if (SIGISMEMBER(*bss, i)) {
newsig = bsd_to_svr4_sig[i];
if (newsig)
svr4_sigaddset(sss, newsig);
}
}
}
/*
* XXX: Only a subset of the flags is currently implemented.
*/
void
svr4_to_bsd_sigaction(ssa, bsa)
const struct svr4_sigaction *ssa;
struct sigaction *bsa;
{
bsa->sa_handler = (sig_t) ssa->ssa_handler;
svr4_to_bsd_sigset(&ssa->ssa_mask, &bsa->sa_mask);
bsa->sa_flags = 0;
if ((ssa->ssa_flags & SVR4_SA_ONSTACK) != 0)
bsa->sa_flags |= SA_ONSTACK;
if ((ssa->ssa_flags & SVR4_SA_RESETHAND) != 0)
bsa->sa_flags |= SA_RESETHAND;
if ((ssa->ssa_flags & SVR4_SA_RESTART) != 0)
bsa->sa_flags |= SA_RESTART;
if ((ssa->ssa_flags & SVR4_SA_SIGINFO) != 0)
DPRINTF(("svr4_to_bsd_sigaction: SA_SIGINFO ignored\n"));
if ((ssa->ssa_flags & SVR4_SA_NOCLDSTOP) != 0)
bsa->sa_flags |= SA_NOCLDSTOP;
if ((ssa->ssa_flags & SVR4_SA_NODEFER) != 0)
bsa->sa_flags |= SA_NODEFER;
if ((ssa->ssa_flags & SVR4_SA_NOCLDWAIT) != 0)
bsa->sa_flags |= SA_NOCLDWAIT;
if ((ssa->ssa_flags & ~SVR4_SA_ALLBITS) != 0)
DPRINTF(("svr4_to_bsd_sigaction: extra bits ignored\n"));
}
void
bsd_to_svr4_sigaction(bsa, ssa)
const struct sigaction *bsa;
struct svr4_sigaction *ssa;
{
ssa->ssa_handler = (svr4_sig_t) bsa->sa_handler;
bsd_to_svr4_sigset(&bsa->sa_mask, &ssa->ssa_mask);
ssa->ssa_flags = 0;
if ((bsa->sa_flags & SA_ONSTACK) != 0)
ssa->ssa_flags |= SVR4_SA_ONSTACK;
if ((bsa->sa_flags & SA_RESETHAND) != 0)
ssa->ssa_flags |= SVR4_SA_RESETHAND;
if ((bsa->sa_flags & SA_RESTART) != 0)
ssa->ssa_flags |= SVR4_SA_RESTART;
if ((bsa->sa_flags & SA_NODEFER) != 0)
ssa->ssa_flags |= SVR4_SA_NODEFER;
if ((bsa->sa_flags & SA_NOCLDSTOP) != 0)
ssa->ssa_flags |= SVR4_SA_NOCLDSTOP;
}
void
svr4_to_bsd_sigaltstack(sss, bss)
const struct svr4_sigaltstack *sss;
struct sigaltstack *bss;
{
bss->ss_sp = sss->ss_sp;
bss->ss_size = sss->ss_size;
bss->ss_flags = 0;
if ((sss->ss_flags & SVR4_SS_DISABLE) != 0)
bss->ss_flags |= SS_DISABLE;
if ((sss->ss_flags & SVR4_SS_ONSTACK) != 0)
bss->ss_flags |= SS_ONSTACK;
if ((sss->ss_flags & ~SVR4_SS_ALLBITS) != 0)
/*XXX*/ uprintf("svr4_to_bsd_sigaltstack: extra bits ignored\n");
}
void
bsd_to_svr4_sigaltstack(bss, sss)
const struct sigaltstack *bss;
struct svr4_sigaltstack *sss;
{
sss->ss_sp = bss->ss_sp;
sss->ss_size = bss->ss_size;
sss->ss_flags = 0;
if ((bss->ss_flags & SS_DISABLE) != 0)
sss->ss_flags |= SVR4_SS_DISABLE;
if ((bss->ss_flags & SS_ONSTACK) != 0)
sss->ss_flags |= SVR4_SS_ONSTACK;
}
int
svr4_sys_sigaction(td, uap)
struct thread *td;
struct svr4_sys_sigaction_args *uap;
{
struct svr4_sigaction isa;
struct sigaction nbsa, obsa;
struct sigaction *nbsap;
int error;
if (uap->signum < 0 || uap->signum >= SVR4_NSIG)
return (EINVAL);
DPRINTF(("@@@ svr4_sys_sigaction(%d, %d, %d)\n", td->td_proc->p_pid,
uap->signum,
SVR4_SVR42BSD_SIG(uap->signum)));
if (uap->nsa != NULL) {
if ((error = copyin(uap->nsa, &isa, sizeof(isa))) != 0)
return (error);
svr4_to_bsd_sigaction(&isa, &nbsa);
nbsap = &nbsa;
} else
nbsap = NULL;
#if defined(DEBUG_SVR4)
{
int i;
for (i = 0; i < 4; i++)
DPRINTF(("\tssa_mask[%d] = %lx\n", i,
isa.ssa_mask.bits[i]));
DPRINTF(("\tssa_handler = %p\n", isa.ssa_handler));
}
#endif
error = kern_sigaction(td, SVR4_SVR42BSD_SIG(uap->signum), nbsap, &obsa,
0);
if (error == 0 && uap->osa != NULL) {
bsd_to_svr4_sigaction(&obsa, &isa);
error = copyout(&isa, uap->osa, sizeof(isa));
}
return (error);
}
int
svr4_sys_sigaltstack(td, uap)
struct thread *td;
struct svr4_sys_sigaltstack_args *uap;
{
struct svr4_sigaltstack sss;
struct sigaltstack nbss, obss, *nbssp;
int error;
if (uap->nss != NULL) {
if ((error = copyin(uap->nss, &sss, sizeof(sss))) != 0)
return (error);
svr4_to_bsd_sigaltstack(&sss, &nbss);
nbssp = &nbss;
} else
nbssp = NULL;
error = kern_sigaltstack(td, nbssp, &obss);
if (error == 0 && uap->oss != NULL) {
bsd_to_svr4_sigaltstack(&obss, &sss);
error = copyout(&sss, uap->oss, sizeof(sss));
}
return (error);
}
/*
* Stolen from the ibcs2 one
*/
int
svr4_sys_signal(td, uap)
struct thread *td;
struct svr4_sys_signal_args *uap;
{
struct proc *p;
int signum;
int error;
p = td->td_proc;
DPRINTF(("@@@ svr4_sys_signal(%d)\n", p->p_pid));
signum = SVR4_SIGNO(uap->signum);
if (signum < 0 || signum >= SVR4_NSIG) {
if (SVR4_SIGCALL(uap->signum) == SVR4_SIGNAL_MASK ||
SVR4_SIGCALL(uap->signum) == SVR4_SIGDEFER_MASK)
td->td_retval[0] = (int)SVR4_SIG_ERR;
return (EINVAL);
}
signum = SVR4_SVR42BSD_SIG(signum);
switch (SVR4_SIGCALL(uap->signum)) {
case SVR4_SIGDEFER_MASK:
if (uap->handler == SVR4_SIG_HOLD)
goto sighold;
/* FALLTHROUGH */
case SVR4_SIGNAL_MASK:
{
struct sigaction nbsa, obsa;
nbsa.sa_handler = (sig_t) uap->handler;
SIGEMPTYSET(nbsa.sa_mask);
nbsa.sa_flags = 0;
if (signum != SIGALRM)
nbsa.sa_flags = SA_RESTART;
error = kern_sigaction(td, signum, &nbsa, &obsa, 0);
if (error != 0) {
DPRINTF(("signal: sigaction failed: %d\n",
error));
td->td_retval[0] = (int)SVR4_SIG_ERR;
return (error);
}
td->td_retval[0] = (int)obsa.sa_handler;
return (0);
}
case SVR4_SIGHOLD_MASK:
sighold:
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signum);
return (kern_sigprocmask(td, SIG_BLOCK, &set, NULL, 0));
}
case SVR4_SIGRELSE_MASK:
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signum);
return (kern_sigprocmask(td, SIG_UNBLOCK, &set, NULL,
0));
}
case SVR4_SIGIGNORE_MASK:
{
struct sigaction sa;
sa.sa_handler = SIG_IGN;
SIGEMPTYSET(sa.sa_mask);
sa.sa_flags = 0;
error = kern_sigaction(td, signum, &sa, NULL, 0);
if (error != 0)
DPRINTF(("sigignore: sigaction failed\n"));
return (error);
}
case SVR4_SIGPAUSE_MASK:
{
sigset_t mask;
PROC_LOCK(p);
mask = td->td_sigmask;
PROC_UNLOCK(p);
SIGDELSET(mask, signum);
return kern_sigsuspend(td, mask);
}
default:
return (ENOSYS);
}
}
int
svr4_sys_sigprocmask(td, uap)
struct thread *td;
struct svr4_sys_sigprocmask_args *uap;
{
svr4_sigset_t sss;
sigset_t oss, nss;
sigset_t *nssp;
int error;
if (uap->set != NULL) {
if ((error = copyin(uap->set, &sss, sizeof(sss))) != 0)
return error;
svr4_to_bsd_sigset(&sss, &nss);
nssp = &nss;
} else
nssp = NULL;
/* SVR/4 sigprocmask flag values are the same as the FreeBSD values. */
error = kern_sigprocmask(td, uap->how, nssp, &oss, 0);
if (error == 0 && uap->oset != NULL) {
bsd_to_svr4_sigset(&oss, &sss);
error = copyout(&sss, uap->oset, sizeof(sss));
}
return (error);
}
int
svr4_sys_sigpending(td, uap)
struct thread *td;
struct svr4_sys_sigpending_args *uap;
{
struct proc *p;
sigset_t bss;
svr4_sigset_t sss;
p = td->td_proc;
DPRINTF(("@@@ svr4_sys_sigpending(%d)\n", p->p_pid));
switch (uap->what) {
case 1: /* sigpending */
if (uap->mask == NULL)
return 0;
PROC_LOCK(p);
bss = p->p_siglist;
SIGSETOR(bss, td->td_siglist);
SIGSETAND(bss, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_svr4_sigset(&bss, &sss);
break;
case 2: /* sigfillset */
svr4_sigfillset(&sss);
#if defined(DEBUG_SVR4)
{
int i;
for (i = 0; i < 4; i++)
DPRINTF(("new sigset[%d] = %lx\n", i, (long)sss.bits[i]));
}
#endif
break;
default:
return EINVAL;
}
return copyout(&sss, uap->mask, sizeof(sss));
}
int
svr4_sys_sigsuspend(td, uap)
struct thread *td;
struct svr4_sys_sigsuspend_args *uap;
{
svr4_sigset_t sss;
sigset_t bss;
int error;
if ((error = copyin(uap->ss, &sss, sizeof(sss))) != 0)
return error;
svr4_to_bsd_sigset(&sss, &bss);
return kern_sigsuspend(td, bss);
}
int
svr4_sys_kill(td, uap)
struct thread *td;
struct svr4_sys_kill_args *uap;
{
struct kill_args ka;
if (uap->signum < 0 || uap->signum >= SVR4_NSIG)
return (EINVAL);
ka.pid = uap->pid;
ka.signum = SVR4_SVR42BSD_SIG(uap->signum);
- return kill(td, &ka);
+ return sys_kill(td, &ka);
}
int
svr4_sys_context(td, uap)
struct thread *td;
struct svr4_sys_context_args *uap;
{
struct svr4_ucontext uc;
int error, onstack;
switch (uap->func) {
case 0:
DPRINTF(("getcontext(%p)\n", uap->uc));
PROC_LOCK(td->td_proc);
onstack = sigonstack(cpu_getstack(td));
PROC_UNLOCK(td->td_proc);
svr4_getcontext(td, &uc, &td->td_sigmask, onstack);
return copyout(&uc, uap->uc, sizeof(uc));
case 1:
DPRINTF(("setcontext(%p)\n", uap->uc));
if ((error = copyin(uap->uc, &uc, sizeof(uc))) != 0)
return error;
DPRINTF(("uc_flags = %lx\n", uc.uc_flags));
#if defined(DEBUG_SVR4)
{
int i;
for (i = 0; i < 4; i++)
DPRINTF(("uc_sigmask[%d] = %lx\n", i,
uc.uc_sigmask.bits[i]));
}
#endif
return svr4_setcontext(td, &uc);
default:
DPRINTF(("context(%d, %p)\n", uap->func,
uap->uc));
return ENOSYS;
}
return 0;
}
int
svr4_sys_pause(td, uap)
struct thread *td;
struct svr4_sys_pause_args *uap;
{
sigset_t mask;
PROC_LOCK(td->td_proc);
mask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
return kern_sigsuspend(td, mask);
}
Index: head/sys/compat/svr4/svr4_socket.c
===================================================================
--- head/sys/compat/svr4/svr4_socket.c (revision 225616)
+++ head/sys/compat/svr4/svr4_socket.c (revision 225617)
@@ -1,242 +1,242 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1996 Christos Zoulas.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christos Zoulas.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* In SVR4 unix domain sockets are referenced sometimes
* (in putmsg(2) for example) as a [device, inode] pair instead of a pathname.
* Since there is no iname() routine in the kernel, and we need access to
* a mapping from inode to pathname, we keep our own table. This is a simple
* linked list that contains the pathname, the [device, inode] pair, the
* file corresponding to that socket and the process. When the
* socket gets closed we remove the item from the list. The list gets loaded
* every time a stat(2) call finds a socket.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/eventhandler.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysproto.h>
#include <sys/un.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_socket.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_sockmod.h>
#include <compat/svr4/svr4_proto.h>
struct svr4_sockcache_entry {
struct proc *p; /* Process for the socket */
void *cookie; /* Internal cookie used for matching */
struct sockaddr_un sock;/* Pathname for the socket */
dev_t dev; /* Device where the socket lives on */
ino_t ino; /* Inode where the socket lives on */
TAILQ_ENTRY(svr4_sockcache_entry) entries;
};
static TAILQ_HEAD(, svr4_sockcache_entry) svr4_head;
static struct mtx svr4_sockcache_lock;
static eventhandler_tag svr4_sockcache_exit_tag, svr4_sockcache_exec_tag;
static void svr4_purge_sockcache(void *arg, struct proc *p);
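/*
 * Look up the cached pathname for a [device, inode] pair belonging to
 * the calling process; on a hit the entry is (re)bound to the
 * socket's so_emuldata cookie and the cached sockaddr_un is returned
 * in *saun.
 */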
int
svr4_find_socket(td, fp, dev, ino, saun)
struct thread *td;
struct file *fp;
dev_t dev;
ino_t ino;
struct sockaddr_un *saun;
{
struct svr4_sockcache_entry *e;
void *cookie = ((struct socket *)fp->f_data)->so_emuldata;
DPRINTF(("svr4_find_socket: [%p,%d,%d]: ", td, dev, ino));
mtx_lock(&svr4_sockcache_lock);
TAILQ_FOREACH(e, &svr4_head, entries)
if (e->p == td->td_proc && e->dev == dev && e->ino == ino) {
#ifdef DIAGNOSTIC
if (e->cookie != NULL && e->cookie != cookie)
panic("svr4 socket cookie mismatch");
#endif
e->cookie = cookie;
DPRINTF(("%s\n", e->sock.sun_path));
*saun = e->sock;
mtx_unlock(&svr4_sockcache_lock);
return (0);
}
mtx_unlock(&svr4_sockcache_lock);
DPRINTF(("not found\n"));
return (ENOENT);
}
int
svr4_add_socket(td, path, st)
struct thread *td;
const char *path;
struct stat *st;
{
struct svr4_sockcache_entry *e;
size_t len;
int error;
e = malloc(sizeof(*e), M_TEMP, M_WAITOK);
e->cookie = NULL;
e->dev = st->st_dev;
e->ino = st->st_ino;
e->p = td->td_proc;
if ((error = copyinstr(path, e->sock.sun_path,
sizeof(e->sock.sun_path), &len)) != 0) {
DPRINTF(("svr4_add_socket: copyinstr failed %d\n", error));
free(e, M_TEMP);
return error;
}
e->sock.sun_family = AF_LOCAL;
e->sock.sun_len = len;
mtx_lock(&svr4_sockcache_lock);
TAILQ_INSERT_HEAD(&svr4_head, e, entries);
mtx_unlock(&svr4_sockcache_lock);
DPRINTF(("svr4_add_socket: %s [%p,%d,%d]\n", e->sock.sun_path,
td->td_proc, e->dev, e->ino));
return 0;
}
void
svr4_delete_socket(p, fp)
struct proc *p;
struct file *fp;
{
struct svr4_sockcache_entry *e;
void *cookie = ((struct socket *)fp->f_data)->so_emuldata;
mtx_lock(&svr4_sockcache_lock);
TAILQ_FOREACH(e, &svr4_head, entries)
if (e->p == p && e->cookie == cookie) {
TAILQ_REMOVE(&svr4_head, e, entries);
mtx_unlock(&svr4_sockcache_lock);
DPRINTF(("svr4_delete_socket: %s [%p,%d,%d]\n",
e->sock.sun_path, p, (int)e->dev, e->ino));
free(e, M_TEMP);
return;
}
mtx_unlock(&svr4_sockcache_lock);
}
void
svr4_purge_sockcache(arg, p)
void *arg;
struct proc *p;
{
struct svr4_sockcache_entry *e, *ne;
mtx_lock(&svr4_sockcache_lock);
TAILQ_FOREACH_SAFE(e, &svr4_head, entries, ne) {
if (e->p == p) {
TAILQ_REMOVE(&svr4_head, e, entries);
DPRINTF(("svr4_purge_sockcache: %s [%p,%d,%d]\n",
e->sock.sun_path, p, (int)e->dev, e->ino));
free(e, M_TEMP);
}
}
mtx_unlock(&svr4_sockcache_lock);
}
void
svr4_sockcache_init(void)
{
TAILQ_INIT(&svr4_head);
mtx_init(&svr4_sockcache_lock, "svr4 socket cache", NULL, MTX_DEF);
svr4_sockcache_exit_tag = EVENTHANDLER_REGISTER(process_exit,
svr4_purge_sockcache, NULL, EVENTHANDLER_PRI_ANY);
svr4_sockcache_exec_tag = EVENTHANDLER_REGISTER(process_exec,
svr4_purge_sockcache, NULL, EVENTHANDLER_PRI_ANY);
}
void
svr4_sockcache_destroy(void)
{
KASSERT(TAILQ_EMPTY(&svr4_head),
("%s: sockcache entries still around", __func__));
EVENTHANDLER_DEREGISTER(process_exec, svr4_sockcache_exec_tag);
EVENTHANDLER_DEREGISTER(process_exit, svr4_sockcache_exit_tag);
mtx_destroy(&svr4_sockcache_lock);
}
int
svr4_sys_socket(td, uap)
struct thread *td;
struct svr4_sys_socket_args *uap;
{
switch (uap->type) {
case SVR4_SOCK_DGRAM:
uap->type = SOCK_DGRAM;
break;
case SVR4_SOCK_STREAM:
uap->type = SOCK_STREAM;
break;
case SVR4_SOCK_RAW:
uap->type = SOCK_RAW;
break;
case SVR4_SOCK_RDM:
uap->type = SOCK_RDM;
break;
case SVR4_SOCK_SEQPACKET:
uap->type = SOCK_SEQPACKET;
break;
default:
return EINVAL;
}
- return socket(td, (struct socket_args *)uap);
+ return sys_socket(td, (struct socket_args *)uap);
}
Index: head/sys/compat/svr4/svr4_stat.c
===================================================================
--- head/sys/compat/svr4/svr4_stat.c (revision 225616)
+++ head/sys/compat/svr4/svr4_stat.c (revision 225617)
@@ -1,699 +1,699 @@
/*-
* Copyright (c) 1998 Mark Newton
* Copyright (c) 1994 Christos Zoulas
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/namei.h>
#include <sys/unistd.h>
#include <sys/time.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/un.h>
#include <vm/vm.h>
#include <netinet/in.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_stat.h>
#include <compat/svr4/svr4_ustat.h>
#include <compat/svr4/svr4_utsname.h>
#include <compat/svr4/svr4_systeminfo.h>
#include <compat/svr4/svr4_socket.h>
#include <compat/svr4/svr4_time.h>
#if defined(NOTYET)
#include "svr4_fuser.h"
#endif
#ifdef sparc
/*
* Solaris-2.4 on the sparc has the old stat call using the new
* stat data structure...
*/
# define SVR4_NO_OSTAT
#endif
struct svr4_ustat_args {
svr4_dev_t dev;
struct svr4_ustat * name;
};
static void bsd_to_svr4_xstat(struct stat *, struct svr4_xstat *);
static void bsd_to_svr4_stat64(struct stat *, struct svr4_stat64 *);
int svr4_ustat(struct thread *, struct svr4_ustat_args *);
static int svr4_to_bsd_pathconf(int);
/*
* SVR4 uses named pipes as named sockets, so we tell programs
* that sockets are named pipes with mode 0
*/
#define BSD_TO_SVR4_MODE(mode) (S_ISSOCK(mode) ? S_IFIFO : (mode))
#ifndef SVR4_NO_OSTAT
static void bsd_to_svr4_stat(struct stat *, struct svr4_stat *);
static void
bsd_to_svr4_stat(st, st4)
struct stat *st;
struct svr4_stat *st4;
{
memset(st4, 0, sizeof(*st4));
st4->st_dev = bsd_to_svr4_odev_t(st->st_dev);
st4->st_ino = st->st_ino;
st4->st_mode = BSD_TO_SVR4_MODE(st->st_mode);
st4->st_nlink = st->st_nlink;
st4->st_uid = st->st_uid;
st4->st_gid = st->st_gid;
st4->st_rdev = bsd_to_svr4_odev_t(st->st_rdev);
st4->st_size = st->st_size;
st4->st_atim = st->st_atim.tv_sec;
st4->st_mtim = st->st_mtim.tv_sec;
st4->st_ctim = st->st_ctim.tv_sec;
}
#endif
static void
bsd_to_svr4_xstat(st, st4)
struct stat *st;
struct svr4_xstat *st4;
{
memset(st4, 0, sizeof(*st4));
st4->st_dev = bsd_to_svr4_dev_t(st->st_dev);
st4->st_ino = st->st_ino;
st4->st_mode = BSD_TO_SVR4_MODE(st->st_mode);
st4->st_nlink = st->st_nlink;
st4->st_uid = st->st_uid;
st4->st_gid = st->st_gid;
st4->st_rdev = bsd_to_svr4_dev_t(st->st_rdev);
st4->st_size = st->st_size;
st4->st_atim = st->st_atim;
st4->st_mtim = st->st_mtim;
st4->st_ctim = st->st_ctim;
st4->st_blksize = st->st_blksize;
st4->st_blocks = st->st_blocks;
strcpy(st4->st_fstype, "unknown");
}
static void
bsd_to_svr4_stat64(st, st4)
struct stat *st;
struct svr4_stat64 *st4;
{
memset(st4, 0, sizeof(*st4));
st4->st_dev = bsd_to_svr4_dev_t(st->st_dev);
st4->st_ino = st->st_ino;
st4->st_mode = BSD_TO_SVR4_MODE(st->st_mode);
st4->st_nlink = st->st_nlink;
st4->st_uid = st->st_uid;
st4->st_gid = st->st_gid;
st4->st_rdev = bsd_to_svr4_dev_t(st->st_rdev);
st4->st_size = st->st_size;
st4->st_atim = st->st_atim;
st4->st_mtim = st->st_mtim;
st4->st_ctim = st->st_ctim;
st4->st_blksize = st->st_blksize;
st4->st_blocks = st->st_blocks;
strcpy(st4->st_fstype, "unknown");
}
int
svr4_sys_stat(td, uap)
struct thread *td;
struct svr4_sys_stat_args *uap;
{
struct svr4_stat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_stat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_lstat(td, uap)
struct thread *td;
struct svr4_sys_lstat_args *uap;
{
struct svr4_stat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_fstat(td, uap)
struct thread *td;
struct svr4_sys_fstat_args *uap;
{
struct svr4_stat svr4_st;
struct stat st;
int error;
error = kern_fstat(td, uap->fd, &st);
if (error)
return (error);
bsd_to_svr4_stat(&st, &svr4_st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_xstat(td, uap)
struct thread *td;
struct svr4_sys_xstat_args *uap;
{
struct svr4_xstat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_stat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_xstat(&st, &svr4_st);
#if defined(SOCKET_NOTYET)
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
#endif
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_lxstat(td, uap)
struct thread *td;
struct svr4_sys_lxstat_args *uap;
{
struct svr4_xstat svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_xstat(&st, &svr4_st);
#if defined(SOCKET_NOTYET)
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
#endif
return (copyout(&svr4_st, uap->ub, sizeof svr4_st));
}
int
svr4_sys_fxstat(td, uap)
struct thread *td;
struct svr4_sys_fxstat_args *uap;
{
struct svr4_xstat svr4_st;
struct stat st;
int error;
error = kern_fstat(td, uap->fd, &st);
if (error)
return (error);
bsd_to_svr4_xstat(&st, &svr4_st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_stat64(td, uap)
struct thread *td;
struct svr4_sys_stat64_args *uap;
{
struct svr4_stat64 svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_stat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat64(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_lstat64(td, uap)
struct thread *td;
struct svr4_sys_lstat64_args *uap;
{
struct svr4_stat64 svr4_st;
struct stat st;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
free(path, M_TEMP);
if (error)
return (error);
bsd_to_svr4_stat64(&st, &svr4_st);
if (S_ISSOCK(st.st_mode))
(void) svr4_add_socket(td, uap->path, &st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_sys_fstat64(td, uap)
struct thread *td;
struct svr4_sys_fstat64_args *uap;
{
struct svr4_stat64 svr4_st;
struct stat st;
int error;
error = kern_fstat(td, uap->fd, &st);
if (error)
return (error);
bsd_to_svr4_stat64(&st, &svr4_st);
return (copyout(&svr4_st, uap->sb, sizeof svr4_st));
}
int
svr4_ustat(td, uap)
struct thread *td;
struct svr4_ustat_args *uap;
{
struct svr4_ustat us;
int error;
memset(&us, 0, sizeof us);
/*
* XXX: should set f_tfree and f_tinode at least
* How do we translate dev -> fstat? (and then to svr4_ustat)
*/
if ((error = copyout(&us, uap->name, sizeof us)) != 0)
return (error);
return 0;
}
/*extern char ostype[], osrelease[], version[], machine[];*/
int
svr4_sys_uname(td, uap)
struct thread *td;
struct svr4_sys_uname_args *uap;
{
struct svr4_utsname sut;
memset(&sut, 0, sizeof(sut));
strlcpy(sut.sysname, ostype, sizeof(sut.sysname));
getcredhostname(td->td_ucred, sut.nodename, sizeof(sut.nodename));
strlcpy(sut.release, osrelease, sizeof(sut.release));
strlcpy(sut.version, version, sizeof(sut.version));
strlcpy(sut.machine, machine, sizeof(sut.machine));
return copyout((caddr_t) &sut, (caddr_t) uap->name,
sizeof(struct svr4_utsname));
}
int
svr4_sys_systeminfo(td, uap)
struct thread *td;
struct svr4_sys_systeminfo_args *uap;
{
char *str = NULL;
int error = 0;
register_t *retval = td->td_retval;
u_long hostid;
size_t len = 0;
char buf[MAXHOSTNAMELEN];
u_int rlen = uap->len;
switch (uap->what) {
case SVR4_SI_SYSNAME:
str = ostype;
break;
case SVR4_SI_HOSTNAME:
getcredhostname(td->td_ucred, buf, sizeof(buf));
str = buf;
break;
case SVR4_SI_RELEASE:
str = osrelease;
break;
case SVR4_SI_VERSION:
str = version;
break;
case SVR4_SI_MACHINE:
str = machine;
break;
case SVR4_SI_ARCHITECTURE:
str = machine;
break;
case SVR4_SI_ISALIST:
#if defined(__sparc__)
str = "sparcv9 sparcv9-fsmuld sparcv8 sparcv8-fsmuld sparcv7 sparc";
#elif defined(__i386__)
str = "i386";
#elif defined(__amd64__)
str = "amd64";
#else
str = "unknown";
#endif
break;
case SVR4_SI_HW_SERIAL:
getcredhostid(td->td_ucred, &hostid);
snprintf(buf, sizeof(buf), "%lu", hostid);
str = buf;
break;
case SVR4_SI_HW_PROVIDER:
str = ostype;
break;
case SVR4_SI_SRPC_DOMAIN:
getcreddomainname(td->td_ucred, buf, sizeof(buf));
str = buf;
break;
case SVR4_SI_PLATFORM:
#if defined(__i386__)
str = "i86pc";
#else
str = "unknown";
#endif
break;
case SVR4_SI_KERB_REALM:
str = "unsupported";
break;
#if defined(WHY_DOES_AN_EMULATOR_WANT_TO_SET_HOSTNAMES)
case SVR4_SI_SET_HOSTNAME:
name = KERN_HOSTNAME;
return kern_sysctl(&name, 1, 0, 0, uap->buf, rlen, td);
case SVR4_SI_SET_SRPC_DOMAIN:
name = KERN_NISDOMAINNAME;
return kern_sysctl(&name, 1, 0, 0, uap->buf, rlen, td);
#else
case SVR4_SI_SET_HOSTNAME:
case SVR4_SI_SET_SRPC_DOMAIN:
/* FALLTHROUGH */
#endif
case SVR4_SI_SET_KERB_REALM:
return 0;
default:
DPRINTF(("Bad systeminfo command %d\n", uap->what));
return ENOSYS;
}
if (str) {
len = strlen(str) + 1;
if (len > rlen)
len = rlen;
if (uap->buf) {
error = copyout(str, uap->buf, len);
if (error)
return error;
/* make sure we are NULL terminated */
buf[0] = '\0';
error = copyout(buf, &(uap->buf[len - 1]), 1);
}
else
error = 0;
}
/* XXX NetBSD has hostname setting stuff here. Why would an emulator
want to do that? */
*retval = len;
return error;
}
int
svr4_sys_utssys(td, uap)
struct thread *td;
struct svr4_sys_utssys_args *uap;
{
switch (uap->sel) {
case 0: /* uname(2) */
{
struct svr4_sys_uname_args ua;
ua.name = uap->a1;
return svr4_sys_uname(td, &ua);
}
case 2: /* ustat(2) */
{
struct svr4_ustat_args ua;
ua.dev = (svr4_dev_t) uap->a2;
ua.name = uap->a1;
return svr4_ustat(td, &ua);
}
case 3: /* fusers(2) */
return ENOSYS;
default:
return ENOSYS;
}
return ENOSYS;
}
int
svr4_sys_utime(td, uap)
struct thread *td;
struct svr4_sys_utime_args *uap;
{
struct svr4_utimbuf ub;
struct timeval tbuf[2], *tp;
char *path;
int error;
if (uap->ubuf != NULL) {
error = copyin(uap->ubuf, &ub, sizeof(ub));
if (error)
return (error);
tbuf[0].tv_sec = ub.actime;
tbuf[0].tv_usec = 0;
tbuf[1].tv_sec = ub.modtime;
tbuf[1].tv_usec = 0;
tp = tbuf;
} else
tp = NULL;
CHECKALTEXIST(td, uap->path, &path);
error = kern_utimes(td, path, UIO_SYSSPACE, tp, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
svr4_sys_utimes(td, uap)
struct thread *td;
struct svr4_sys_utimes_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_utimes(td, path, UIO_SYSSPACE, uap->tptr, UIO_USERSPACE);
free(path, M_TEMP);
return (error);
}
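/*
 * Map an SVR4 pathconf(2) name onto the corresponding BSD _PC_* value.
 * Names we accept but do not support map to 0; invalid names map to -1.
 */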
static int
svr4_to_bsd_pathconf(name)
int name;
{
switch (name) {
case SVR4_PC_LINK_MAX:
return _PC_LINK_MAX;
case SVR4_PC_MAX_CANON:
return _PC_MAX_CANON;
case SVR4_PC_MAX_INPUT:
return _PC_MAX_INPUT;
case SVR4_PC_NAME_MAX:
return _PC_NAME_MAX;
case SVR4_PC_PATH_MAX:
return _PC_PATH_MAX;
case SVR4_PC_PIPE_BUF:
return _PC_PIPE_BUF;
case SVR4_PC_NO_TRUNC:
return _PC_NO_TRUNC;
case SVR4_PC_VDISABLE:
return _PC_VDISABLE;
case SVR4_PC_CHOWN_RESTRICTED:
return _PC_CHOWN_RESTRICTED;
case SVR4_PC_SYNC_IO:
#if defined(_PC_SYNC_IO)
return _PC_SYNC_IO;
#else
return 0;
#endif
case SVR4_PC_ASYNC_IO:
case SVR4_PC_PRIO_IO:
/* Not supported */
return 0;
default:
/* Invalid */
return -1;
}
}
int
svr4_sys_pathconf(td, uap)
struct thread *td;
struct svr4_sys_pathconf_args *uap;
{
char *path;
int error, name;
name = svr4_to_bsd_pathconf(uap->name);
switch (name) {
case -1:
td->td_retval[0] = -1;
return (EINVAL);
case 0:
td->td_retval[0] = 0;
return (0);
default:
CHECKALTEXIST(td, uap->path, &path);
error = kern_pathconf(td, path, UIO_SYSSPACE, name, FOLLOW);
free(path, M_TEMP);
return (error);
}
}
int
svr4_sys_fpathconf(td, uap)
struct thread *td;
struct svr4_sys_fpathconf_args *uap;
{
register_t *retval = td->td_retval;
uap->name = svr4_to_bsd_pathconf(uap->name);
switch (uap->name) {
case -1:
*retval = -1;
return EINVAL;
case 0:
*retval = 0;
return 0;
default:
- return fpathconf(td, (struct fpathconf_args *)uap);
+ return sys_fpathconf(td, (struct fpathconf_args *)uap);
}
}
Index: head/sys/compat/svr4/svr4_stream.c
===================================================================
--- head/sys/compat/svr4/svr4_stream.c (revision 225616)
+++ head/sys/compat/svr4/svr4_stream.c (revision 225617)
@@ -1,2038 +1,2038 @@
/*-
* Copyright (c) 1998 Mark Newton. All rights reserved.
* Copyright (c) 1994, 1996 Christos Zoulas. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christos Zoulas.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Pretend that we have streams...
* Yes, this is gross.
*
* ToDo: The state machine for getmsg needs re-thinking
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
#include <sys/ktrace.h> /* Must come after sys/uio.h */
#include <sys/un.h>
#include <netinet/in.h>
#include <compat/svr4/svr4.h>
#include <compat/svr4/svr4_types.h>
#include <compat/svr4/svr4_util.h>
#include <compat/svr4/svr4_signal.h>
#include <compat/svr4/svr4_proto.h>
#include <compat/svr4/svr4_stropts.h>
#include <compat/svr4/svr4_timod.h>
#include <compat/svr4/svr4_sockmod.h>
#include <compat/svr4/svr4_ioctl.h>
#include <compat/svr4/svr4_socket.h>
/* Utils */
static int clean_pipe(struct thread *, char *);
static void getparm(struct file *, struct svr4_si_sockparms *);
static int svr4_do_putmsg(struct thread *, struct svr4_sys_putmsg_args *,
struct file *);
static int svr4_do_getmsg(struct thread *, struct svr4_sys_getmsg_args *,
struct file *);
/* Address Conversions */
static void sockaddr_to_netaddr_in(struct svr4_strmcmd *,
const struct sockaddr_in *);
static void sockaddr_to_netaddr_un(struct svr4_strmcmd *,
const struct sockaddr_un *);
static void netaddr_to_sockaddr_in(struct sockaddr_in *,
const struct svr4_strmcmd *);
static void netaddr_to_sockaddr_un(struct sockaddr_un *,
const struct svr4_strmcmd *);
/* stream ioctls */
static int i_nread(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_fdinsert(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_str(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_setsig(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int i_getsig(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int _i_bind_rsvd(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
static int _i_rele_rsvd(struct file *, struct thread *, register_t *, int,
u_long, caddr_t);
/* i_str sockmod calls */
static int sockmod(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_listen(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_ogetudata(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_sockparams(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_shutdown (struct file *, int, struct svr4_strioctl *,
struct thread *);
static int si_getudata(struct file *, int, struct svr4_strioctl *,
struct thread *);
/* i_str timod calls */
static int timod(struct file *, int, struct svr4_strioctl *, struct thread *);
static int ti_getinfo(struct file *, int, struct svr4_strioctl *,
struct thread *);
static int ti_bind(struct file *, int, struct svr4_strioctl *, struct thread *);
#ifdef DEBUG_SVR4
static void bufprint(u_char *, size_t);
static int show_ioc(const char *, struct svr4_strioctl *);
static int show_strbuf(struct svr4_strbuf *);
static void show_msg(const char *, int, struct svr4_strbuf *,
struct svr4_strbuf *, int);
static void
bufprint(buf, len)
u_char *buf;
size_t len;
{
size_t i;
uprintf("\n\t");
for (i = 0; i < len; i++) {
uprintf("%x ", buf[i]);
if (i && (i % 16) == 0)
uprintf("\n\t");
}
}
static int
show_ioc(str, ioc)
const char *str;
struct svr4_strioctl *ioc;
{
u_char *ptr = NULL;
int len;
int error;
len = ioc->len;
if (len > 1024)
len = 1024;
if (len > 0) {
ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK);
if ((error = copyin(ioc->buf, ptr, len)) != 0) {
free((char *) ptr, M_TEMP);
return error;
}
}
uprintf("%s cmd = %ld, timeout = %d, len = %d, buf = %p { ",
str, ioc->cmd, ioc->timeout, ioc->len, ioc->buf);
if (ptr != NULL)
bufprint(ptr, len);
uprintf("}\n");
if (ptr != NULL)
free((char *) ptr, M_TEMP);
return 0;
}
static int
show_strbuf(str)
struct svr4_strbuf *str;
{
int error;
u_char *ptr = NULL;
int maxlen = str->maxlen;
int len = str->len;
if (maxlen > 8192)
maxlen = 8192;
if (maxlen < 0)
maxlen = 0;
if (len >= maxlen)
len = maxlen;
if (len > 0) {
ptr = (u_char *) malloc(len, M_TEMP, M_WAITOK);
if ((error = copyin(str->buf, ptr, len)) != 0) {
free((char *) ptr, M_TEMP);
return error;
}
}
uprintf(", { %d, %d, %p=[ ", str->maxlen, str->len, str->buf);
if (ptr)
bufprint(ptr, len);
uprintf("]}");
if (ptr)
free((char *) ptr, M_TEMP);
return 0;
}
static void
show_msg(str, fd, ctl, dat, flags)
const char *str;
int fd;
struct svr4_strbuf *ctl;
struct svr4_strbuf *dat;
int flags;
{
struct svr4_strbuf buf;
int error;
uprintf("%s(%d", str, fd);
if (ctl != NULL) {
if ((error = copyin(ctl, &buf, sizeof(buf))) != 0)
return;
show_strbuf(&buf);
}
else
uprintf(", NULL");
if (dat != NULL) {
if ((error = copyin(dat, &buf, sizeof(buf))) != 0)
return;
show_strbuf(&buf);
}
else
uprintf(", NULL");
uprintf(", %x);\n", flags);
}
#endif /* DEBUG_SVR4 */
/*
* We are faced with an interesting situation. On svr4 unix sockets
* are really pipes. But we really have sockets, and we might as
* well use them. At the point where svr4 calls TI_BIND, it has
* already created a named pipe for the socket using mknod(2).
* We need to create a socket with the same name when we bind,
* so we need to remove the pipe beforehand, otherwise we'll get "address
* already in use". So we *carefully* remove the pipe, to avoid
* using this as a random file removal tool. We use system calls
* to avoid code duplication.
*/
static int
clean_pipe(td, path)
struct thread *td;
char *path;
{
struct stat st;
int error;
error = kern_lstat(td, path, UIO_SYSSPACE, &st);
/*
* Make sure we are dealing with a mode 0 named pipe.
*/
if ((st.st_mode & S_IFMT) != S_IFIFO)
return (0);
if ((st.st_mode & ALLPERMS) != 0)
return (0);
error = kern_unlink(td, path, UIO_SYSSPACE);
if (error)
DPRINTF(("clean_pipe: unlink failed %d\n", error));
return (error);
}
static void
sockaddr_to_netaddr_in(sc, sain)
struct svr4_strmcmd *sc;
const struct sockaddr_in *sain;
{
struct svr4_netaddr_in *na;
na = SVR4_ADDROF(sc);
na->family = sain->sin_family;
na->port = sain->sin_port;
na->addr = sain->sin_addr.s_addr;
DPRINTF(("sockaddr_in -> netaddr %d %d %lx\n", na->family, na->port,
na->addr));
}
static void
sockaddr_to_netaddr_un(sc, saun)
struct svr4_strmcmd *sc;
const struct sockaddr_un *saun;
{
struct svr4_netaddr_un *na;
char *dst, *edst = ((char *) sc) + sc->offs + sizeof(na->family) + 1 -
sizeof(*sc);
const char *src;
na = SVR4_ADDROF(sc);
na->family = saun->sun_family;
for (src = saun->sun_path, dst = na->path; (*dst++ = *src++) != '\0'; )
if (dst == edst)
break;
DPRINTF(("sockaddr_un -> netaddr %d %s\n", na->family, na->path));
}
static void
netaddr_to_sockaddr_in(sain, sc)
struct sockaddr_in *sain;
const struct svr4_strmcmd *sc;
{
const struct svr4_netaddr_in *na;
na = SVR4_C_ADDROF(sc);
memset(sain, 0, sizeof(*sain));
sain->sin_len = sizeof(*sain);
sain->sin_family = na->family;
sain->sin_port = na->port;
sain->sin_addr.s_addr = na->addr;
DPRINTF(("netaddr -> sockaddr_in %d %d %x\n", sain->sin_family,
sain->sin_port, sain->sin_addr.s_addr));
}
static void
netaddr_to_sockaddr_un(saun, sc)
struct sockaddr_un *saun;
const struct svr4_strmcmd *sc;
{
const struct svr4_netaddr_un *na;
char *dst, *edst = &saun->sun_path[sizeof(saun->sun_path) - 1];
const char *src;
na = SVR4_C_ADDROF(sc);
memset(saun, 0, sizeof(*saun));
saun->sun_family = na->family;
for (src = na->path, dst = saun->sun_path; (*dst++ = *src++) != '\0'; )
if (dst == edst)
break;
saun->sun_len = dst - saun->sun_path;
DPRINTF(("netaddr -> sockaddr_un %d %s\n", saun->sun_family,
saun->sun_path));
}
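/*
 * Fill in the SVR4 socket parameters (family, type, protocol) for the
 * socket behind fp; unknown socket types report type 0 and protocol 0.
 */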
static void
getparm(fp, pa)
struct file *fp;
struct svr4_si_sockparms *pa;
{
struct svr4_strm *st;
struct socket *so;
st = svr4_stream_get(fp);
if (st == NULL)
return;
so = fp->f_data;
pa->family = st->s_family;
switch (so->so_type) {
case SOCK_DGRAM:
pa->type = SVR4_T_CLTS;
pa->protocol = IPPROTO_UDP;
DPRINTF(("getparm(dgram)\n"));
return;
case SOCK_STREAM:
pa->type = SVR4_T_COTS; /* What about T_COTS_ORD? XXX */
pa->protocol = IPPROTO_IP;
DPRINTF(("getparm(stream)\n"));
return;
case SOCK_RAW:
pa->type = SVR4_T_CLTS;
pa->protocol = IPPROTO_RAW;
DPRINTF(("getparm(raw)\n"));
return;
default:
pa->type = 0;
pa->protocol = 0;
DPRINTF(("getparm(type %d?)\n", so->so_type));
return;
}
}
static int
si_ogetudata(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_si_oudata ud;
struct svr4_si_sockparms pa;
if (ioc->len != sizeof(ud) && ioc->len != sizeof(ud) - sizeof(int)) {
DPRINTF(("SI_OGETUDATA: Wrong size %d != %d\n",
sizeof(ud), ioc->len));
return EINVAL;
}
if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0)
return error;
getparm(fp, &pa);
switch (pa.family) {
case AF_INET:
ud.tidusize = 16384;
ud.addrsize = sizeof(struct svr4_sockaddr_in);
if (pa.type == SVR4_SOCK_STREAM)
ud.etsdusize = 1;
else
ud.etsdusize = 0;
break;
case AF_LOCAL:
ud.tidusize = 65536;
ud.addrsize = 128;
ud.etsdusize = 128;
break;
default:
DPRINTF(("SI_OGETUDATA: Unsupported address family %d\n",
pa.family));
return ENOSYS;
}
/* I have no idea what these should be! */
ud.optsize = 128;
ud.tsdusize = 128;
ud.servtype = pa.type;
/* XXX: Fixme */
ud.so_state = 0;
ud.so_options = 0;
return copyout(&ud, ioc->buf, ioc->len);
}
static int
si_sockparams(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
struct svr4_si_sockparms pa;
getparm(fp, &pa);
return copyout(&pa, ioc->buf, sizeof(pa));
}
static int
si_listen(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_strm *st = svr4_stream_get(fp);
struct svr4_strmcmd lst;
struct listen_args la;
if (st == NULL)
return EINVAL;
if (ioc->len < 0 || ioc->len > sizeof(lst))
return EINVAL;
if ((error = copyin(ioc->buf, &lst, ioc->len)) != 0)
return error;
if (lst.cmd != SVR4_TI_OLD_BIND_REQUEST) {
DPRINTF(("si_listen: bad request %ld\n", lst.cmd));
return EINVAL;
}
/*
* We are making assumptions again...
*/
la.s = fd;
DPRINTF(("SI_LISTEN: fileno %d backlog = %d\n", fd, 5));
la.backlog = 5;
- if ((error = listen(td, &la)) != 0) {
+ if ((error = sys_listen(td, &la)) != 0) {
DPRINTF(("SI_LISTEN: listen failed %d\n", error));
return error;
}
st->s_cmd = SVR4_TI__ACCEPT_WAIT;
lst.cmd = SVR4_TI_BIND_REPLY;
switch (st->s_family) {
case AF_INET:
/* XXX: Fill the length here */
break;
case AF_LOCAL:
lst.len = 140;
lst.pad[28] = 0x00000000; /* magic again */
lst.pad[29] = 0x00000800; /* magic again */
lst.pad[30] = 0x80001400; /* magic again */
break;
default:
DPRINTF(("SI_LISTEN: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
if ((error = copyout(&lst, ioc->buf, ioc->len)) != 0)
return error;
return 0;
}
static int
si_getudata(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_si_udata ud;
if (sizeof(ud) != ioc->len) {
DPRINTF(("SI_GETUDATA: Wrong size %d != %d\n",
sizeof(ud), ioc->len));
return EINVAL;
}
if ((error = copyin(ioc->buf, &ud, sizeof(ud))) != 0)
return error;
getparm(fp, &ud.sockparms);
switch (ud.sockparms.family) {
case AF_INET:
DPRINTF(("getudata_inet\n"));
ud.tidusize = 16384;
ud.tsdusize = 16384;
ud.addrsize = sizeof(struct svr4_sockaddr_in);
if (ud.sockparms.type == SVR4_SOCK_STREAM)
ud.etsdusize = 1;
else
ud.etsdusize = 0;
ud.optsize = 0;
break;
case AF_LOCAL:
DPRINTF(("getudata_local\n"));
ud.tidusize = 65536;
ud.tsdusize = 128;
ud.addrsize = 128;
ud.etsdusize = 128;
ud.optsize = 128;
break;
default:
DPRINTF(("SI_GETUDATA: Unsupported address family %d\n",
ud.sockparms.family));
return ENOSYS;
}
ud.servtype = ud.sockparms.type;
DPRINTF(("ud.servtype = %d\n", ud.servtype));
/* XXX: Fixme */
ud.so_state = 0;
ud.so_options = 0;
return copyout(&ud, ioc->buf, sizeof(ud));
}
static int
si_shutdown(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct shutdown_args ap;
if (ioc->len != sizeof(ap.how)) {
DPRINTF(("SI_SHUTDOWN: Wrong size %d != %d\n",
sizeof(ap.how), ioc->len));
return EINVAL;
}
if ((error = copyin(ioc->buf, &ap.how, ioc->len)) != 0)
return error;
ap.s = fd;
- return shutdown(td, &ap);
+ return sys_shutdown(td, &ap);
}
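/*
 * Dispatch "sockmod" I_STR subcommands.  Requests we only pretend to
 * support simply return success.
 */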
static int
sockmod(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
switch (ioc->cmd) {
case SVR4_SI_OGETUDATA:
DPRINTF(("SI_OGETUDATA\n"));
return si_ogetudata(fp, fd, ioc, td);
case SVR4_SI_SHUTDOWN:
DPRINTF(("SI_SHUTDOWN\n"));
return si_shutdown(fp, fd, ioc, td);
case SVR4_SI_LISTEN:
DPRINTF(("SI_LISTEN\n"));
return si_listen(fp, fd, ioc, td);
case SVR4_SI_SETMYNAME:
DPRINTF(("SI_SETMYNAME\n"));
return 0;
case SVR4_SI_SETPEERNAME:
DPRINTF(("SI_SETPEERNAME\n"));
return 0;
case SVR4_SI_GETINTRANSIT:
DPRINTF(("SI_GETINTRANSIT\n"));
return 0;
case SVR4_SI_TCL_LINK:
DPRINTF(("SI_TCL_LINK\n"));
return 0;
case SVR4_SI_TCL_UNLINK:
DPRINTF(("SI_TCL_UNLINK\n"));
return 0;
case SVR4_SI_SOCKPARAMS:
DPRINTF(("SI_SOCKPARAMS\n"));
return si_sockparams(fp, fd, ioc, td);
case SVR4_SI_GETUDATA:
DPRINTF(("SI_GETUDATA\n"));
return si_getudata(fp, fd, ioc, td);
default:
DPRINTF(("Unknown sockmod ioctl %lx\n", ioc->cmd));
return 0;
}
}
static int
ti_getinfo(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_infocmd info;
memset(&info, 0, sizeof(info));
if (ioc->len < 0 || ioc->len > sizeof(info))
return EINVAL;
if ((error = copyin(ioc->buf, &info, ioc->len)) != 0)
return error;
if (info.cmd != SVR4_TI_INFO_REQUEST)
return EINVAL;
info.cmd = SVR4_TI_INFO_REPLY;
info.tsdu = 0;
info.etsdu = 1;
info.cdata = -2;
info.ddata = -2;
info.addr = 16;
info.opt = -1;
info.tidu = 16384;
info.serv = 2;
info.current = 0;
info.provider = 2;
ioc->len = sizeof(info);
if ((error = copyout(&info, ioc->buf, ioc->len)) != 0)
return error;
return 0;
}
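/*
 * TI_BIND: convert the SVR4 netaddr in the request into a sockaddr and
 * bind the underlying socket.  For AF_LOCAL the leftover named pipe is
 * removed first (see clean_pipe()), then a TI_BIND_REPLY is returned.
 */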
static int
ti_bind(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
int error;
struct svr4_strm *st = svr4_stream_get(fp);
struct sockaddr_in sain;
struct sockaddr_un saun;
struct sockaddr *skp;
int sasize;
struct svr4_strmcmd bnd;
if (st == NULL) {
DPRINTF(("ti_bind: bad file descriptor\n"));
return EINVAL;
}
if (ioc->len < 0 || ioc->len > sizeof(bnd))
return EINVAL;
if ((error = copyin(ioc->buf, &bnd, ioc->len)) != 0)
return error;
if (bnd.cmd != SVR4_TI_OLD_BIND_REQUEST) {
DPRINTF(("ti_bind: bad request %ld\n", bnd.cmd));
return EINVAL;
}
switch (st->s_family) {
case AF_INET:
skp = (struct sockaddr *)&sain;
sasize = sizeof(sain);
if (bnd.offs == 0)
goto error;
netaddr_to_sockaddr_in(&sain, &bnd);
DPRINTF(("TI_BIND: fam %d, port %d, addr %x\n",
sain.sin_family, sain.sin_port,
sain.sin_addr.s_addr));
break;
case AF_LOCAL:
skp = (struct sockaddr *)&saun;
sasize = sizeof(saun);
if (bnd.offs == 0)
goto error;
netaddr_to_sockaddr_un(&saun, &bnd);
if (saun.sun_path[0] == '\0')
goto error;
DPRINTF(("TI_BIND: fam %d, path %s\n",
saun.sun_family, saun.sun_path));
if ((error = clean_pipe(td, saun.sun_path)) != 0)
return error;
bnd.pad[28] = 0x00001000; /* magic again */
break;
default:
DPRINTF(("TI_BIND: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
DPRINTF(("TI_BIND: fileno %d\n", fd));
if ((error = kern_bind(td, fd, skp)) != 0) {
DPRINTF(("TI_BIND: bind failed %d\n", error));
return error;
}
goto reply;
error:
memset(&bnd, 0, sizeof(bnd));
bnd.len = sasize + 4;
bnd.offs = 0x10; /* XXX */
reply:
bnd.cmd = SVR4_TI_BIND_REPLY;
if ((error = copyout(&bnd, ioc->buf, ioc->len)) != 0)
return error;
return 0;
}
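/*
 * Dispatch "timod" I_STR subcommands; only TI_GETINFO and TI_BIND do
 * real work, the rest are accepted as no-ops.
 */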
static int
timod(fp, fd, ioc, td)
struct file *fp;
int fd;
struct svr4_strioctl *ioc;
struct thread *td;
{
switch (ioc->cmd) {
case SVR4_TI_GETINFO:
DPRINTF(("TI_GETINFO\n"));
return ti_getinfo(fp, fd, ioc, td);
case SVR4_TI_OPTMGMT:
DPRINTF(("TI_OPTMGMT\n"));
return 0;
case SVR4_TI_BIND:
DPRINTF(("TI_BIND\n"));
return ti_bind(fp, fd, ioc, td);
case SVR4_TI_UNBIND:
DPRINTF(("TI_UNBIND\n"));
return 0;
default:
DPRINTF(("Unknown timod ioctl %lx\n", ioc->cmd));
return 0;
}
}
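/*
 * Handle the TI_GETMYNAME/TI_GETPEERNAME stream ioctls by translating
 * the result of getsockname(2)/getpeername(2) into an SVR4 netaddr and
 * copying it out through the caller's strbuf.
 */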
int
svr4_stream_ti_ioctl(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
struct svr4_strbuf skb, *sub = (struct svr4_strbuf *) dat;
struct svr4_strm *st = svr4_stream_get(fp);
int error;
struct sockaddr *sa;
socklen_t sasize, oldsasize;
struct svr4_strmcmd sc;
DPRINTF(("svr4_stream_ti_ioctl\n"));
if (st == NULL)
return EINVAL;
sc.offs = 0x10;
if ((error = copyin(sub, &skb, sizeof(skb))) != 0) {
DPRINTF(("ti_ioctl: error copying in strbuf\n"));
return error;
}
switch (st->s_family) {
case AF_INET:
sasize = sizeof(struct sockaddr_in);
break;
case AF_LOCAL:
sasize = sizeof(struct sockaddr_un);
break;
default:
DPRINTF(("ti_ioctl: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
oldsasize = sasize;
switch (cmd) {
case SVR4_TI_GETMYNAME:
DPRINTF(("TI_GETMYNAME\n"));
{
error = kern_getsockname(td, fd, &sa, &sasize);
if (error) {
DPRINTF(("ti_ioctl: getsockname error\n"));
return error;
}
}
break;
case SVR4_TI_GETPEERNAME:
DPRINTF(("TI_GETPEERNAME\n"));
{
error = kern_getpeername(td, fd, &sa, &sasize);
if (error) {
DPRINTF(("ti_ioctl: getpeername error\n"));
return error;
}
}
break;
case SVR4_TI_SETMYNAME:
DPRINTF(("TI_SETMYNAME\n"));
return 0;
case SVR4_TI_SETPEERNAME:
DPRINTF(("TI_SETPEERNAME\n"));
return 0;
default:
DPRINTF(("ti_ioctl: Unknown ioctl %lx\n", cmd));
return ENOSYS;
}
if (sasize < 0 || sasize > oldsasize) {
free(sa, M_SONAME);
return EINVAL;
}
switch (st->s_family) {
case AF_INET:
sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)sa);
skb.len = sasize;
break;
case AF_LOCAL:
sockaddr_to_netaddr_un(&sc, (struct sockaddr_un *)sa);
skb.len = sasize + 4;
break;
default:
free(sa, M_SONAME);
return ENOSYS;
}
free(sa, M_SONAME);
if ((error = copyout(SVR4_ADDROF(&sc), skb.buf, sasize)) != 0) {
DPRINTF(("ti_ioctl: error copying out socket data\n"));
return error;
}
if ((error = copyout(&skb, sub, sizeof(skb))) != 0) {
DPRINTF(("ti_ioctl: error copying out strbuf\n"));
return error;
}
return error;
}
static int
i_nread(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
int error;
int nread = 0;
/*
* We are supposed to return the message length in nread, and the
* number of messages in retval. We don't have the notion of number
* of stream messages, so we just find out if we have any bytes waiting
* for us, and if we do, then we assume that we have at least one
* message waiting for us.
*/
if ((error = fo_ioctl(fp, FIONREAD, (caddr_t) &nread, td->td_ucred,
td)) != 0)
return error;
if (nread != 0)
*retval = 1;
else
*retval = 0;
return copyout(&nread, dat, sizeof(nread));
}
static int
i_fdinsert(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
/*
* Major hack again here. We assume that we are using this to
* implement accept(2). If that is the case, we have already
* called accept, and we have stored the file descriptor in
* afd. We find the file descriptor that the code wants to use
* in fdinsert, and then we dup2() our accepted file descriptor
* to it.
*/
int error;
struct svr4_strm *st = svr4_stream_get(fp);
struct svr4_strfdinsert fdi;
struct dup2_args d2p;
if (st == NULL) {
DPRINTF(("fdinsert: bad file type\n"));
return EINVAL;
}
mtx_lock(&Giant);
if (st->s_afd == -1) {
DPRINTF(("fdinsert: accept fd not found\n"));
mtx_unlock(&Giant);
return ENOENT;
}
if ((error = copyin(dat, &fdi, sizeof(fdi))) != 0) {
DPRINTF(("fdinsert: copyin failed %d\n", error));
mtx_unlock(&Giant);
return error;
}
d2p.from = st->s_afd;
d2p.to = fdi.fd;
- if ((error = dup2(td, &d2p)) != 0) {
+ if ((error = sys_dup2(td, &d2p)) != 0) {
DPRINTF(("fdinsert: dup2(%d, %d) failed %d\n",
st->s_afd, fdi.fd, error));
mtx_unlock(&Giant);
return error;
}
if ((error = kern_close(td, st->s_afd)) != 0) {
DPRINTF(("fdinsert: close(%d) failed %d\n",
st->s_afd, error));
mtx_unlock(&Giant);
return error;
}
st->s_afd = -1;
mtx_unlock(&Giant);
*retval = 0;
return 0;
}
static int
_i_bind_rsvd(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
struct mkfifo_args ap;
/*
* This is supposed to be a kernel- and library-only ioctl.
* It gets called before ti_bind, when we have a unix
* socket, to physically create the socket transport and
* ``reserve'' it. I don't know how this gets reserved inside
* the kernel, but we are going to create it nevertheless.
*/
ap.path = dat;
ap.mode = S_IFIFO;
- return mkfifo(td, &ap);
+ return sys_mkfifo(td, &ap);
}
static int
_i_rele_rsvd(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
struct unlink_args ap;
/*
* This is supposed to be a kernel- and library-only ioctl.
* I guess it is supposed to release the socket.
*/
ap.path = dat;
- return unlink(td, &ap);
+ return sys_unlink(td, &ap);
}
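/*
 * I_STR: copy in the svr4_strioctl, dispatch on the module encoded in
 * the high byte of the command (sockmod or timod) and copy the possibly
 * updated structure back out.
 */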
static int
i_str(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
int error;
struct svr4_strioctl ioc;
if ((error = copyin(dat, &ioc, sizeof(ioc))) != 0)
return error;
#ifdef DEBUG_SVR4
if ((error = show_ioc(">", &ioc)) != 0)
return error;
#endif /* DEBUG_SVR4 */
switch (ioc.cmd & 0xff00) {
case SVR4_SIMOD:
if ((error = sockmod(fp, fd, &ioc, td)) != 0)
return error;
break;
case SVR4_TIMOD:
if ((error = timod(fp, fd, &ioc, td)) != 0)
return error;
break;
default:
DPRINTF(("Unimplemented module %c %ld\n",
(char) (cmd >> 8), cmd & 0xff));
return 0;
}
#ifdef DEBUG_SVR4
if ((error = show_ioc("<", &ioc)) != 0)
return error;
#endif /* DEBUG_SVR4 */
return copyout(&ioc, dat, sizeof(ioc));
}
static int
i_setsig(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
/*
* This is the best we can do for now; we cannot generate
* signals only for specific events so the signal mask gets
* ignored; we save it just to pass it to a possible I_GETSIG...
*
* We also have to fix the O_ASYNC fcntl bit, so the
* process will get SIGPOLLs.
*/
int error;
register_t oflags, flags;
struct svr4_strm *st = svr4_stream_get(fp);
if (st == NULL) {
DPRINTF(("i_setsig: bad file descriptor\n"));
return EINVAL;
}
/* get old status flags */
error = kern_fcntl(td, fd, F_GETFL, 0);
if (error)
return (error);
oflags = td->td_retval[0];
/* update the flags */
mtx_lock(&Giant);
if (dat != NULL) {
int mask;
flags = oflags | O_ASYNC;
if ((error = copyin(dat, &mask, sizeof(mask))) != 0) {
DPRINTF(("i_setsig: bad eventmask pointer\n"));
return error;
}
if (mask & SVR4_S_ALLMASK) {
DPRINTF(("i_setsig: bad eventmask data %x\n", mask));
return EINVAL;
}
st->s_eventmask = mask;
}
else {
flags = oflags & ~O_ASYNC;
st->s_eventmask = 0;
}
mtx_unlock(&Giant);
/* set the new flags, if changed */
if (flags != oflags) {
error = kern_fcntl(td, fd, F_SETFL, flags);
if (error)
return (error);
flags = td->td_retval[0];
}
/* set up SIGIO receiver if needed */
if (dat != NULL)
return (kern_fcntl(td, fd, F_SETOWN, td->td_proc->p_pid));
return 0;
}
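/*
 * I_GETSIG: return the event mask saved by a previous I_SETSIG.
 */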
static int
i_getsig(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
int error, eventmask;
if (dat != NULL) {
struct svr4_strm *st = svr4_stream_get(fp);
if (st == NULL) {
DPRINTF(("i_getsig: bad file descriptor\n"));
return EINVAL;
}
mtx_lock(&Giant);
eventmask = st->s_eventmask;
mtx_unlock(&Giant);
if ((error = copyout(&eventmask, dat,
sizeof(eventmask))) != 0) {
DPRINTF(("i_getsig: bad eventmask pointer\n"));
return error;
}
}
return 0;
}
int
svr4_stream_ioctl(fp, td, retval, fd, cmd, dat)
struct file *fp;
struct thread *td;
register_t *retval;
int fd;
u_long cmd;
caddr_t dat;
{
*retval = 0;
/*
* All the following stuff assumes "sockmod" is pushed...
*/
switch (cmd) {
case SVR4_I_NREAD:
DPRINTF(("I_NREAD\n"));
return i_nread(fp, td, retval, fd, cmd, dat);
case SVR4_I_PUSH:
DPRINTF(("I_PUSH %p\n", dat));
#if defined(DEBUG_SVR4)
show_strbuf((struct svr4_strbuf *)dat);
#endif
return 0;
case SVR4_I_POP:
DPRINTF(("I_POP\n"));
return 0;
case SVR4_I_LOOK:
DPRINTF(("I_LOOK\n"));
return 0;
case SVR4_I_FLUSH:
DPRINTF(("I_FLUSH\n"));
return 0;
case SVR4_I_SRDOPT:
DPRINTF(("I_SRDOPT\n"));
return 0;
case SVR4_I_GRDOPT:
DPRINTF(("I_GRDOPT\n"));
return 0;
case SVR4_I_STR:
DPRINTF(("I_STR\n"));
return i_str(fp, td, retval, fd, cmd, dat);
case SVR4_I_SETSIG:
DPRINTF(("I_SETSIG\n"));
return i_setsig(fp, td, retval, fd, cmd, dat);
case SVR4_I_GETSIG:
DPRINTF(("I_GETSIG\n"));
return i_getsig(fp, td, retval, fd, cmd, dat);
case SVR4_I_FIND:
DPRINTF(("I_FIND\n"));
/*
* Here we are not pushing modules really, we just
* pretend all are present
*/
*retval = 0;
return 0;
case SVR4_I_LINK:
DPRINTF(("I_LINK\n"));
return 0;
case SVR4_I_UNLINK:
DPRINTF(("I_UNLINK\n"));
return 0;
case SVR4_I_ERECVFD:
DPRINTF(("I_ERECVFD\n"));
return 0;
case SVR4_I_PEEK:
DPRINTF(("I_PEEK\n"));
return 0;
case SVR4_I_FDINSERT:
DPRINTF(("I_FDINSERT\n"));
return i_fdinsert(fp, td, retval, fd, cmd, dat);
case SVR4_I_SENDFD:
DPRINTF(("I_SENDFD\n"));
return 0;
case SVR4_I_RECVFD:
DPRINTF(("I_RECVFD\n"));
return 0;
case SVR4_I_SWROPT:
DPRINTF(("I_SWROPT\n"));
return 0;
case SVR4_I_GWROPT:
DPRINTF(("I_GWROPT\n"));
return 0;
case SVR4_I_LIST:
DPRINTF(("I_LIST\n"));
return 0;
case SVR4_I_PLINK:
DPRINTF(("I_PLINK\n"));
return 0;
case SVR4_I_PUNLINK:
DPRINTF(("I_PUNLINK\n"));
return 0;
case SVR4_I_SETEV:
DPRINTF(("I_SETEV\n"));
return 0;
case SVR4_I_GETEV:
DPRINTF(("I_GETEV\n"));
return 0;
case SVR4_I_STREV:
DPRINTF(("I_STREV\n"));
return 0;
case SVR4_I_UNSTREV:
DPRINTF(("I_UNSTREV\n"));
return 0;
case SVR4_I_FLUSHBAND:
DPRINTF(("I_FLUSHBAND\n"));
return 0;
case SVR4_I_CKBAND:
DPRINTF(("I_CKBAND\n"));
return 0;
case SVR4_I_GETBAND:
DPRINTF(("I_GETBANK\n"));
return 0;
case SVR4_I_ATMARK:
DPRINTF(("I_ATMARK\n"));
return 0;
case SVR4_I_SETCLTIME:
DPRINTF(("I_SETCLTIME\n"));
return 0;
case SVR4_I_GETCLTIME:
DPRINTF(("I_GETCLTIME\n"));
return 0;
case SVR4_I_CANPUT:
DPRINTF(("I_CANPUT\n"));
return 0;
case SVR4__I_BIND_RSVD:
DPRINTF(("_I_BIND_RSVD\n"));
return _i_bind_rsvd(fp, td, retval, fd, cmd, dat);
case SVR4__I_RELE_RSVD:
DPRINTF(("_I_RELE_RSVD\n"));
return _i_rele_rsvd(fp, td, retval, fd, cmd, dat);
default:
DPRINTF(("unimpl cmd = %lx\n", cmd));
break;
}
return 0;
}
int
svr4_sys_putmsg(td, uap)
struct thread *td;
struct svr4_sys_putmsg_args *uap;
{
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_WRITE, &fp)) != 0) {
#ifdef DEBUG_SVR4
uprintf("putmsg: bad fp\n");
#endif
return EBADF;
}
error = svr4_do_putmsg(td, uap, fp);
fdrop(fp, td);
return (error);
}
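/*
 * Back end for putmsg(2) on sockets: interpret the control message as a
 * TI request and turn TI_CONNECT_REQUEST/TI_SENDTO_REQUEST into
 * connect(2)/sendto(2) on the underlying socket.
 */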
static int
svr4_do_putmsg(td, uap, fp)
struct thread *td;
struct svr4_sys_putmsg_args *uap;
struct file *fp;
{
struct svr4_strbuf dat, ctl;
struct svr4_strmcmd sc;
struct sockaddr_in sain;
struct sockaddr_un saun;
struct sockaddr *sa;
int sasize, *retval;
struct svr4_strm *st;
int error;
retval = td->td_retval;
#ifdef DEBUG_SVR4
show_msg(">putmsg", uap->fd, uap->ctl,
uap->dat, uap->flags);
#endif /* DEBUG_SVR4 */
if (uap->ctl != NULL) {
if ((error = copyin(uap->ctl, &ctl, sizeof(ctl))) != 0) {
#ifdef DEBUG_SVR4
uprintf("putmsg: copyin(): %d\n", error);
#endif
return error;
}
}
else
ctl.len = -1;
if (uap->dat != NULL) {
if ((error = copyin(uap->dat, &dat, sizeof(dat))) != 0) {
#ifdef DEBUG_SVR4
uprintf("putmsg: copyin(): %d (2)\n", error);
#endif
return error;
}
}
else
dat.len = -1;
/*
* Only for sockets for now.
*/
if ((st = svr4_stream_get(fp)) == NULL) {
DPRINTF(("putmsg: bad file type\n"));
return EINVAL;
}
if (ctl.len < 0 || ctl.len > sizeof(sc)) {
DPRINTF(("putmsg: Bad control size %d != %d\n", ctl.len,
sizeof(struct svr4_strmcmd)));
return EINVAL;
}
if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0)
return error;
switch (st->s_family) {
case AF_INET:
if (sc.len != sizeof(sain)) {
if (sc.cmd == SVR4_TI_DATA_REQUEST) {
struct write_args wa;
/* Solaris seems to use sc.cmd = 3 to
* send "expedited" data. telnet uses
* this for options processing, sending EOF,
* etc. I'm sure other things use it too.
* I don't have any documentation
* on it, so I'm making a guess that this
* is how it works. newton@atdot.dotat.org XXX
*/
DPRINTF(("sending expedited data ??\n"));
wa.fd = uap->fd;
wa.buf = dat.buf;
wa.nbyte = dat.len;
- return write(td, &wa);
+ return sys_write(td, &wa);
}
DPRINTF(("putmsg: Invalid inet length %ld\n", sc.len));
return EINVAL;
}
netaddr_to_sockaddr_in(&sain, &sc);
sa = (struct sockaddr *)&sain;
sasize = sizeof(sain);
if (sain.sin_family != st->s_family)
error = EINVAL;
break;
case AF_LOCAL:
if (ctl.len == 8) {
/* We are doing an accept; succeed */
DPRINTF(("putmsg: Do nothing\n"));
*retval = 0;
return 0;
}
else {
/* Maybe we've been given a device/inode pair */
dev_t *dev = SVR4_ADDROF(&sc);
ino_t *ino = (ino_t *) &dev[1];
if (svr4_find_socket(td, fp, *dev, *ino, &saun) != 0) {
/* I guess we have it by name */
netaddr_to_sockaddr_un(&saun, &sc);
}
sa = (struct sockaddr *)&saun;
sasize = sizeof(saun);
}
break;
default:
DPRINTF(("putmsg: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
mtx_lock(&Giant);
st->s_cmd = sc.cmd;
mtx_unlock(&Giant);
switch (sc.cmd) {
case SVR4_TI_CONNECT_REQUEST: /* connect */
{
return (kern_connect(td, uap->fd, sa));
}
case SVR4_TI_SENDTO_REQUEST: /* sendto */
{
struct msghdr msg;
struct iovec aiov;
msg.msg_name = sa;
msg.msg_namelen = sasize;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
msg.msg_flags = 0;
aiov.iov_base = dat.buf;
aiov.iov_len = dat.len;
error = kern_sendit(td, uap->fd, &msg, uap->flags,
NULL, UIO_USERSPACE);
DPRINTF(("sendto_request error: %d\n", error));
*retval = 0;
return error;
}
default:
DPRINTF(("putmsg: Unimplemented command %lx\n", sc.cmd));
return ENOSYS;
}
}
int
svr4_sys_getmsg(td, uap)
struct thread *td;
struct svr4_sys_getmsg_args *uap;
{
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_READ, &fp)) != 0) {
#ifdef DEBUG_SVR4
uprintf("getmsg: bad fp\n");
#endif
return EBADF;
}
error = svr4_do_getmsg(td, uap, fp);
fdrop(fp, td);
return (error);
}
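/*
 * Back end for getmsg(2) on sockets: drive the small putmsg/getmsg state
 * machine kept in st->s_cmd and synthesize the matching TI replies
 * (connect, accept, recvfrom) from the socket state.
 */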
int
svr4_do_getmsg(td, uap, fp)
struct thread *td;
struct svr4_sys_getmsg_args *uap;
struct file *fp;
{
struct svr4_strbuf dat, ctl;
struct svr4_strmcmd sc;
int error, *retval;
struct msghdr msg;
struct iovec aiov;
struct sockaddr_in sain;
struct sockaddr_un saun;
struct sockaddr *sa;
socklen_t sasize;
struct svr4_strm *st;
struct file *afp;
int fl;
retval = td->td_retval;
error = 0;
afp = NULL;
memset(&sc, 0, sizeof(sc));
#ifdef DEBUG_SVR4
show_msg(">getmsg", uap->fd, uap->ctl,
uap->dat, 0);
#endif /* DEBUG_SVR4 */
if (uap->ctl != NULL) {
if ((error = copyin(uap->ctl, &ctl, sizeof(ctl))) != 0)
return error;
if (ctl.len < 0)
return EINVAL;
}
else {
ctl.len = -1;
ctl.maxlen = 0;
}
if (uap->dat != NULL) {
if ((error = copyin(uap->dat, &dat, sizeof(dat))) != 0)
return error;
}
else {
dat.len = -1;
dat.maxlen = 0;
}
/*
* Only for sockets for now.
*/
if ((st = svr4_stream_get(fp)) == NULL) {
DPRINTF(("getmsg: bad file type\n"));
return EINVAL;
}
if (ctl.maxlen == -1 || dat.maxlen == -1) {
DPRINTF(("getmsg: Cannot handle -1 maxlen (yet)\n"));
return ENOSYS;
}
switch (st->s_family) {
case AF_INET:
sasize = sizeof(sain);
break;
case AF_LOCAL:
sasize = sizeof(saun);
break;
default:
DPRINTF(("getmsg: Unsupported address family %d\n",
st->s_family));
return ENOSYS;
}
mtx_lock(&Giant);
switch (st->s_cmd) {
case SVR4_TI_CONNECT_REQUEST:
DPRINTF(("getmsg: TI_CONNECT_REQUEST\n"));
/*
* We do the connect in one step, so the putmsg should
* have gotten the error.
*/
sc.cmd = SVR4_TI_OK_REPLY;
sc.len = 0;
ctl.len = 8;
dat.len = -1;
fl = 1;
st->s_cmd = sc.cmd;
break;
case SVR4_TI_OK_REPLY:
DPRINTF(("getmsg: TI_OK_REPLY\n"));
/*
* We are immediately after a connect reply, so we send
* a connect verification.
*/
error = kern_getpeername(td, uap->fd, &sa, &sasize);
if (error) {
mtx_unlock(&Giant);
DPRINTF(("getmsg: getpeername failed %d\n", error));
return error;
}
sc.cmd = SVR4_TI_CONNECT_REPLY;
sc.pad[0] = 0x4;
sc.offs = 0x18;
sc.pad[1] = 0x14;
sc.pad[2] = 0x04000402;
switch (st->s_family) {
case AF_INET:
sc.len = sasize;
sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)sa);
break;
case AF_LOCAL:
sc.len = sasize + 4;
sockaddr_to_netaddr_un(&sc, (struct sockaddr_un *)sa);
break;
default:
mtx_unlock(&Giant);
free(sa, M_SONAME);
return ENOSYS;
}
free(sa, M_SONAME);
ctl.len = 40;
dat.len = -1;
fl = 0;
st->s_cmd = sc.cmd;
break;
case SVR4_TI__ACCEPT_OK:
DPRINTF(("getmsg: TI__ACCEPT_OK\n"));
/*
* We do the connect in one step, so the putmsg should
* have gotten the error.
*/
sc.cmd = SVR4_TI_OK_REPLY;
sc.len = 1;
ctl.len = 8;
dat.len = -1;
fl = 1;
st->s_cmd = SVR4_TI__ACCEPT_WAIT;
break;
case SVR4_TI__ACCEPT_WAIT:
DPRINTF(("getmsg: TI__ACCEPT_WAIT\n"));
/*
* We are after a listen, so we try to accept...
*/
error = kern_accept(td, uap->fd, &sa, &sasize, &afp);
if (error) {
mtx_unlock(&Giant);
DPRINTF(("getmsg: accept failed %d\n", error));
return error;
}
st->s_afd = *retval;
DPRINTF(("getmsg: Accept fd = %d\n", st->s_afd));
sc.cmd = SVR4_TI_ACCEPT_REPLY;
sc.offs = 0x18;
sc.pad[0] = 0x0;
switch (st->s_family) {
case AF_INET:
sc.pad[1] = 0x28;
sockaddr_to_netaddr_in(&sc, (struct sockaddr_in *)&sa);
ctl.len = 40;
sc.len = sasize;
break;
case AF_LOCAL:
sc.pad[1] = 0x00010000;
sc.pad[2] = 0xf6bcdaa0; /* I don't know what that is */
sc.pad[3] = 0x00010000;
ctl.len = 134;
sc.len = sasize + 4;
break;
default:
fdclose(td->td_proc->p_fd, afp, st->s_afd, td);
fdrop(afp, td);
st->s_afd = -1;
mtx_unlock(&Giant);
free(sa, M_SONAME);
return ENOSYS;
}
free(sa, M_SONAME);
dat.len = -1;
fl = 0;
st->s_cmd = SVR4_TI__ACCEPT_OK;
break;
case SVR4_TI_SENDTO_REQUEST:
DPRINTF(("getmsg: TI_SENDTO_REQUEST\n"));
if (ctl.maxlen > 36 && ctl.len < 36)
ctl.len = 36;
if (ctl.len > sizeof(sc))
ctl.len = sizeof(sc);
if ((error = copyin(ctl.buf, &sc, ctl.len)) != 0) {
mtx_unlock(&Giant);
return error;
}
switch (st->s_family) {
case AF_INET:
sa = (struct sockaddr *)&sain;
sockaddr_to_netaddr_in(&sc, &sain);
break;
case AF_LOCAL:
sa = (struct sockaddr *)&saun;
sockaddr_to_netaddr_un(&sc, &saun);
break;
default:
mtx_unlock(&Giant);
return ENOSYS;
}
msg.msg_name = sa;
msg.msg_namelen = sasize;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
aiov.iov_base = dat.buf;
aiov.iov_len = dat.maxlen;
msg.msg_flags = 0;
error = kern_recvit(td, uap->fd, &msg, UIO_SYSSPACE, NULL);
if (error) {
mtx_unlock(&Giant);
DPRINTF(("getmsg: recvit failed %d\n", error));
return error;
}
sc.cmd = SVR4_TI_RECVFROM_IND;
switch (st->s_family) {
case AF_INET:
sc.len = sasize;
sockaddr_to_netaddr_in(&sc, &sain);
break;
case AF_LOCAL:
sc.len = sasize + 4;
sockaddr_to_netaddr_un(&sc, &saun);
break;
default:
mtx_unlock(&Giant);
return ENOSYS;
}
dat.len = *retval;
fl = 0;
st->s_cmd = sc.cmd;
break;
default:
st->s_cmd = sc.cmd;
if (st->s_cmd == SVR4_TI_CONNECT_REQUEST) {
struct read_args ra;
/* More weirdness: Again, I can't find documentation
* to back this up, but when a process does a generic
* "getmsg()" call it seems that the command field is
* zero and the length of the data area is zero. I
* think processes expect getmsg() to fill in dat.len
* after reading at most dat.maxlen octets from the
* stream. Since we're using sockets I can let
* read() look after it and frob return values
* appropriately (or inappropriately :-)
* -- newton@atdot.dotat.org XXX
*/
ra.fd = uap->fd;
ra.buf = dat.buf;
ra.nbyte = dat.maxlen;
- if ((error = read(td, &ra)) != 0) {
+ if ((error = sys_read(td, &ra)) != 0) {
mtx_unlock(&Giant);
return error;
}
dat.len = *retval;
*retval = 0;
st->s_cmd = SVR4_TI_SENDTO_REQUEST;
break;
}
mtx_unlock(&Giant);
DPRINTF(("getmsg: Unknown state %x\n", st->s_cmd));
return EINVAL;
}
if (uap->ctl) {
if (ctl.len > sizeof(sc))
ctl.len = sizeof(sc);
if (ctl.len != -1)
error = copyout(&sc, ctl.buf, ctl.len);
if (error == 0)
error = copyout(&ctl, uap->ctl, sizeof(ctl));
}
if (uap->dat) {
if (error == 0)
error = copyout(&dat, uap->dat, sizeof(dat));
}
if (uap->flags) { /* XXX: Need translation */
if (error == 0)
error = copyout(&fl, uap->flags, sizeof(fl));
}
if (error) {
if (afp) {
fdclose(td->td_proc->p_fd, afp, st->s_afd, td);
fdrop(afp, td);
st->s_afd = -1;
}
mtx_unlock(&Giant);
return (error);
}
mtx_unlock(&Giant);
if (afp)
fdrop(afp, td);
*retval = 0;
#ifdef DEBUG_SVR4
show_msg("<getmsg", uap->fd, uap->ctl,
uap->dat, fl);
#endif /* DEBUG_SVR4 */
return error;
}
int svr4_sys_send(td, uap)
struct thread *td;
struct svr4_sys_send_args *uap;
{
struct sendto_args sta;
sta.s = uap->s;
sta.buf = uap->buf;
sta.len = uap->len;
sta.flags = uap->flags;
sta.to = NULL;
sta.tolen = 0;
- return (sendto(td, &sta));
+ return (sys_sendto(td, &sta));
}
int svr4_sys_recv(td, uap)
struct thread *td;
struct svr4_sys_recv_args *uap;
{
struct recvfrom_args rfa;
rfa.s = uap->s;
rfa.buf = uap->buf;
rfa.len = uap->len;
rfa.flags = uap->flags;
rfa.from = NULL;
rfa.fromlenaddr = NULL;
- return (recvfrom(td, &rfa));
+ return (sys_recvfrom(td, &rfa));
}
/*
* XXX This isn't necessary, but it's handy for inserting debug code into
* sendto(). Let's leave it here for now...
*/
int
svr4_sys_sendto(td, uap)
struct thread *td;
struct svr4_sys_sendto_args *uap;
{
struct sendto_args sa;
sa.s = uap->s;
sa.buf = uap->buf;
sa.len = uap->len;
sa.flags = uap->flags;
sa.to = (caddr_t)uap->to;
sa.tolen = uap->tolen;
DPRINTF(("calling sendto()\n"));
- return sendto(td, &sa);
+ return sys_sendto(td, &sa);
}
Index: head/sys/dev/bktr/bktr_core.c
===================================================================
--- head/sys/dev/bktr/bktr_core.c (revision 225616)
+++ head/sys/dev/bktr/bktr_core.c (revision 225617)
@@ -1,4315 +1,4315 @@
/*-
* 1. Redistributions of source code must retain the
* Copyright (c) 1997 Amancio Hasty, 1999 Roger Hardiman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Amancio Hasty and
* Roger Hardiman
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* 1. Redistributions of source code must retain the
* Copyright (c) 1995 Mark Tinguely and Jim Lowe
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Mark Tinguely and Jim Lowe
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* This is part of the Driver for Video Capture Cards (Frame grabbers)
* and TV Tuner cards using the Brooktree Bt848, Bt848A, Bt849A, Bt878, Bt879
* chipset.
* Copyright Roger Hardiman and Amancio Hasty.
*
* bktr_core : This deals with the Bt848/849/878/879 PCI Frame Grabber,
* Handles all the open, close, ioctl and read userland calls.
* Sets the Bt848 registers and generates RISC programs.
* Controls the i2c bus and GPIO interface.
* Contains the interface to the kernel.
* (eg probe/attach and open/close/ioctl)
*/
/*
The Brooktree BT848 driver is based upon Mark Tinguely and
Jim Lowe's driver for the Matrox Meteor PCI card. The
Philips SAA 7116 and SAA 7196 are very different chipsets than
the BT848.
The original copyright notice by Mark and Jim is included mostly
to honor their fantastic work in the Matrox Meteor driver!
*/
#include "opt_bktr.h" /* Include any kernel config options */
#if ( \
(defined(__FreeBSD__)) \
|| (defined(__bsdi__)) \
|| (defined(__OpenBSD__)) \
|| (defined(__NetBSD__)) \
)
/*******************/
/* *** FreeBSD *** */
/*******************/
#ifdef __FreeBSD__
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/selinfo.h>
#include <sys/uio.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <sys/bus.h> /* used by smbus and newbus */
#if (__FreeBSD_version < 500000)
#include <machine/clock.h> /* for DELAY */
#define PROC_LOCK(p)
#define PROC_UNLOCK(p)
#include <pci/pcivar.h>
#else
#include <dev/pci/pcivar.h>
#endif
#include <machine/bus.h>
#include <sys/bus.h>
#include <dev/bktr/ioctl_meteor.h>
#include <dev/bktr/ioctl_bt848.h> /* extensions to ioctl_meteor.h */
#include <dev/bktr/bktr_reg.h>
#include <dev/bktr/bktr_tuner.h>
#include <dev/bktr/bktr_card.h>
#include <dev/bktr/bktr_audio.h>
#include <dev/bktr/bktr_os.h>
#include <dev/bktr/bktr_core.h>
#if defined(BKTR_FREEBSD_MODULE)
#include <dev/bktr/bktr_mem.h>
#endif
#if defined(BKTR_USE_FREEBSD_SMBUS)
#include <dev/bktr/bktr_i2c.h>
#include <dev/smbus/smbconf.h>
#include <dev/iicbus/iiconf.h>
#include "smbus_if.h"
#include "iicbus_if.h"
#endif
const char *
bktr_name(bktr_ptr_t bktr)
{
return bktr->bktr_xname;
}
#endif /* __FreeBSD__ */
/****************/
/* *** BSDI *** */
/****************/
#ifdef __bsdi__
#define PROC_LOCK(p)
#define PROC_UNLOCK(p)
#endif /* __bsdi__ */
/**************************/
/* *** OpenBSD/NetBSD *** */
/**************************/
#if defined(__NetBSD__) || defined(__OpenBSD__)
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#ifdef __NetBSD__
#include <uvm/uvm_extern.h>
#else
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#endif
#include <sys/inttypes.h> /* uintptr_t */
#include <dev/ic/bt8xx.h>
#include <dev/pci/bktr/bktr_reg.h>
#include <dev/pci/bktr/bktr_tuner.h>
#include <dev/pci/bktr/bktr_card.h>
#include <dev/pci/bktr/bktr_audio.h>
#include <dev/pci/bktr/bktr_core.h>
#include <dev/pci/bktr/bktr_os.h>
static int bt848_format = -1;
const char *
bktr_name(bktr_ptr_t bktr)
{
return (bktr->bktr_dev.dv_xname);
}
#define PROC_LOCK(p)
#define PROC_UNLOCK(p)
#endif /* __NetBSD__ || __OpenBSD__ */
typedef u_char bool_t;
#define BKTRPRI (PZERO+8)|PCATCH
#define VBIPRI (PZERO-4)|PCATCH
/*
* memory allocated for DMA programs
*/
#define DMA_PROG_ALLOC (8 * PAGE_SIZE)
/* When to split a dma transfer: the bt848 has timing as well as
dma transfer size limitations, so we have to split dma
transfers into two dma requests
*/
#define DMA_BT848_SPLIT 319*2
/*
* Allocate enough memory for:
* 768x576 RGB 16 or YUV (16 storage bits/pixel) = 884736 = 216 pages
*
* You may override this using the options "BROOKTREE_ALLOC_PAGES=value"
* in your kernel configuration file.
*/
#ifndef BROOKTREE_ALLOC_PAGES
#define BROOKTREE_ALLOC_PAGES 217*4
#endif
#define BROOKTREE_ALLOC (BROOKTREE_ALLOC_PAGES * PAGE_SIZE)
/* Definitions for VBI capture.
* There are 16 VBI lines in a PAL video field (32 in a frame),
* and we take 2044 samples from each line (placed in a 2048 byte buffer
* for alignment).
* VBI lines are held in a circular buffer before being read by a
* user program from /dev/vbi.
*/
#define MAX_VBI_LINES 16 /* Maximum for all video formats */
#define VBI_LINE_SIZE 2048 /* Store up to 2048 bytes per line */
#define VBI_BUFFER_ITEMS 20 /* Number of frames we buffer */
#define VBI_DATA_SIZE (VBI_LINE_SIZE * MAX_VBI_LINES * 2)
#define VBI_BUFFER_SIZE (VBI_DATA_SIZE * VBI_BUFFER_ITEMS)
/* Defines for fields */
#define ODD_F 0x01
#define EVEN_F 0x02
/*
* Parameters describing size of transmitted image.
*/
static struct format_params format_params[] = {
/* # define BT848_IFORM_F_AUTO (0x0) - don't matter. */
{ 525, 26, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_AUTO,
12, 1600 },
/* # define BT848_IFORM_F_NTSCM (0x1) */
{ 525, 26, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_XT0,
12, 1600 },
/* # define BT848_IFORM_F_NTSCJ (0x2) */
{ 525, 22, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_XT0,
12, 1600 },
/* # define BT848_IFORM_F_PALBDGHI (0x3) */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0x72, BT848_IFORM_X_XT1,
16, 2044 },
/* # define BT848_IFORM_F_PALM (0x4) */
{ 525, 22, 480, 910, 135, 754, 640, 780, 30, 0x68, 0x5d, BT848_IFORM_X_XT0,
12, 1600 },
/* # define BT848_IFORM_F_PALN (0x5) */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0x72, BT848_IFORM_X_XT1,
16, 2044 },
/* # define BT848_IFORM_F_SECAM (0x6) */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0xa0, BT848_IFORM_X_XT1,
16, 2044 },
/* # define BT848_IFORM_F_RSVD (0x7) - ???? */
{ 625, 32, 576, 1135, 186, 924, 768, 944, 25, 0x7f, 0x72, BT848_IFORM_X_XT0,
16, 2044 },
};
/*
* Table of supported Pixel Formats
*/
static struct meteor_pixfmt_internal {
struct meteor_pixfmt public;
u_int color_fmt;
} pixfmt_table[] = {
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0x7c00, 0x03e0, 0x001f }, 0,0 }, 0x33 },
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0x7c00, 0x03e0, 0x001f }, 1,0 }, 0x33 },
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0xf800, 0x07e0, 0x001f }, 0,0 }, 0x22 },
{ { 0, METEOR_PIXTYPE_RGB, 2, { 0xf800, 0x07e0, 0x001f }, 1,0 }, 0x22 },
{ { 0, METEOR_PIXTYPE_RGB, 3, { 0xff0000,0x00ff00,0x0000ff }, 1,0 }, 0x11 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 0,0 }, 0x00 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 0,1 }, 0x00 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 1,0 }, 0x00 },
{ { 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }, 0x00 },
{ { 0, METEOR_PIXTYPE_YUV, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }, 0x88 },
{ { 0, METEOR_PIXTYPE_YUV_PACKED, 2, { 0xff0000,0x00ff00,0x0000ff }, 0,1 }, 0x44 },
{ { 0, METEOR_PIXTYPE_YUV_12, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }, 0x88 },
};
#define PIXFMT_TABLE_SIZE ( sizeof(pixfmt_table) / sizeof(pixfmt_table[0]) )
/*
* Table of Meteor-supported Pixel Formats (for SETGEO compatibility)
*/
/* FIXME: Also add YUV_422 and YUV_PACKED as well */
static struct {
u_long meteor_format;
struct meteor_pixfmt public;
} meteor_pixfmt_table[] = {
{ METEOR_GEO_YUV_12,
{ 0, METEOR_PIXTYPE_YUV_12, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }
},
/* FIXME: Should byte swap flag be on for this one; negative in drvr? */
{ METEOR_GEO_YUV_422,
{ 0, METEOR_PIXTYPE_YUV, 2, { 0xff0000,0x00ff00,0x0000ff }, 1,1 }
},
{ METEOR_GEO_YUV_PACKED,
{ 0, METEOR_PIXTYPE_YUV_PACKED, 2, { 0xff0000,0x00ff00,0x0000ff }, 0,1 }
},
{ METEOR_GEO_RGB16,
{ 0, METEOR_PIXTYPE_RGB, 2, { 0x7c00, 0x03e0, 0x001f }, 0, 0 }
},
{ METEOR_GEO_RGB24,
{ 0, METEOR_PIXTYPE_RGB, 4, { 0xff0000, 0x00ff00, 0x0000ff }, 0, 0 }
},
};
#define METEOR_PIXFMT_TABLE_SIZE ( sizeof(meteor_pixfmt_table) / \
sizeof(meteor_pixfmt_table[0]) )
#define BSWAP (BT848_COLOR_CTL_BSWAP_ODD | BT848_COLOR_CTL_BSWAP_EVEN)
#define WSWAP (BT848_COLOR_CTL_WSWAP_ODD | BT848_COLOR_CTL_WSWAP_EVEN)
/* sync detect threshold */
#if 0
#define SYNC_LEVEL (BT848_ADC_RESERVED | \
BT848_ADC_CRUSH) /* threshold ~125 mV */
#else
#define SYNC_LEVEL (BT848_ADC_RESERVED | \
BT848_ADC_SYNC_T) /* threshold ~75 mV */
#endif
/* debug utility for holding previous INT_STAT contents */
#define STATUS_SUM
static u_long status_sum = 0;
/*
* defines to make certain bit-fiddles understandable
*/
#define FIFO_ENABLED BT848_DMA_CTL_FIFO_EN
#define RISC_ENABLED BT848_DMA_CTL_RISC_EN
#define FIFO_RISC_ENABLED (BT848_DMA_CTL_FIFO_EN | BT848_DMA_CTL_RISC_EN)
#define FIFO_RISC_DISABLED 0
#define ALL_INTS_DISABLED 0
#define ALL_INTS_CLEARED 0xffffffff
#define CAPTURE_OFF 0
#define BIT_SEVEN_HIGH (1<<7)
#define BIT_EIGHT_HIGH (1<<8)
#define I2C_BITS (BT848_INT_RACK | BT848_INT_I2CDONE)
#define TDEC_BITS (BT848_INT_FDSR | BT848_INT_FBUS)
static int oformat_meteor_to_bt( u_long format );
static u_int pixfmt_swap_flags( int pixfmt );
/*
* bt848 RISC programming routines.
*/
#ifdef BT848_DUMP
static int dump_bt848( bktr_ptr_t bktr );
#endif
static void yuvpack_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void yuv422_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void yuv12_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void rgb_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void rgb_vbi_prog( bktr_ptr_t bktr, char i_flag, int cols,
int rows, int interlace );
static void build_dma_prog( bktr_ptr_t bktr, char i_flag );
static bool_t getline(bktr_reg_t *, int);
static bool_t notclipped(bktr_reg_t * , int , int);
static bool_t split(bktr_reg_t *, volatile uint32_t **, int, u_long, int,
volatile u_char ** , int );
static void start_capture( bktr_ptr_t bktr, unsigned type );
static void set_fps( bktr_ptr_t bktr, u_short fps );
/*
* Remote Control Functions
*/
static void remote_read(bktr_ptr_t bktr, struct bktr_remote *remote);
/*
* ioctls common to both video & tuner.
*/
static int common_ioctl( bktr_ptr_t bktr, ioctl_cmd_t cmd, caddr_t arg );
#if !defined(BKTR_USE_FREEBSD_SMBUS)
/*
* i2c primitives for low level control of i2c bus. Added for MSP34xx control
*/
static void i2c_start( bktr_ptr_t bktr);
static void i2c_stop( bktr_ptr_t bktr);
static int i2c_write_byte( bktr_ptr_t bktr, unsigned char data);
static int i2c_read_byte( bktr_ptr_t bktr, unsigned char *data, int last );
#endif
/*
* the common attach code, used by all OS versions.
*/
void
common_bktr_attach( bktr_ptr_t bktr, int unit, u_long pci_id, u_int rev )
{
vm_offset_t buf = 0;
int need_to_allocate_memory = 1;
#ifdef BKTR_NEW_MSP34XX_DRIVER
int err;
#endif
/***************************************/
/* *** OS Specific memory routines *** */
/***************************************/
#if defined(__NetBSD__) || defined(__OpenBSD__)
/* allocate space for dma program */
bktr->dma_prog = get_bktr_mem(bktr, &bktr->dm_prog,
DMA_PROG_ALLOC);
bktr->odd_dma_prog = get_bktr_mem(bktr, &bktr->dm_oprog,
DMA_PROG_ALLOC);
/* allocate space for the VBI buffer */
bktr->vbidata = get_bktr_mem(bktr, &bktr->dm_vbidata,
VBI_DATA_SIZE);
bktr->vbibuffer = get_bktr_mem(bktr, &bktr->dm_vbibuffer,
VBI_BUFFER_SIZE);
/* allocate space for pixel buffer */
if ( BROOKTREE_ALLOC )
buf = get_bktr_mem(bktr, &bktr->dm_mem, BROOKTREE_ALLOC);
else
buf = 0;
#endif
#if defined(__FreeBSD__) || defined(__bsdi__)
/* If this is a module, check if there is any currently saved contiguous memory */
#if defined(BKTR_FREEBSD_MODULE)
if (bktr_has_stored_addresses(unit) == 1) {
/* recover the addresses */
bktr->dma_prog = bktr_retrieve_address(unit, BKTR_MEM_DMA_PROG);
bktr->odd_dma_prog = bktr_retrieve_address(unit, BKTR_MEM_ODD_DMA_PROG);
bktr->vbidata = bktr_retrieve_address(unit, BKTR_MEM_VBIDATA);
bktr->vbibuffer = bktr_retrieve_address(unit, BKTR_MEM_VBIBUFFER);
buf = bktr_retrieve_address(unit, BKTR_MEM_BUF);
need_to_allocate_memory = 0;
}
#endif
if (need_to_allocate_memory == 1) {
/* allocate space for dma program */
bktr->dma_prog = get_bktr_mem(unit, DMA_PROG_ALLOC);
bktr->odd_dma_prog = get_bktr_mem(unit, DMA_PROG_ALLOC);
/* allocate space for the VBI buffer */
bktr->vbidata = get_bktr_mem(unit, VBI_DATA_SIZE);
bktr->vbibuffer = get_bktr_mem(unit, VBI_BUFFER_SIZE);
/* allocate space for pixel buffer */
if ( BROOKTREE_ALLOC )
buf = get_bktr_mem(unit, BROOKTREE_ALLOC);
else
buf = 0;
}
#endif /* FreeBSD or BSDi */
#ifdef USE_VBIMUTEX
mtx_init(&bktr->vbimutex, "bktr vbi lock", NULL, MTX_DEF);
#endif
/* If this is a module, save the current contiguous memory */
#if defined(BKTR_FREEBSD_MODULE)
bktr_store_address(unit, BKTR_MEM_DMA_PROG, bktr->dma_prog);
bktr_store_address(unit, BKTR_MEM_ODD_DMA_PROG, bktr->odd_dma_prog);
bktr_store_address(unit, BKTR_MEM_VBIDATA, bktr->vbidata);
bktr_store_address(unit, BKTR_MEM_VBIBUFFER, bktr->vbibuffer);
bktr_store_address(unit, BKTR_MEM_BUF, buf);
#endif
if ( bootverbose ) {
printf("%s: buffer size %d, addr %p\n",
bktr_name(bktr), (int)BROOKTREE_ALLOC,
(void *)(uintptr_t)vtophys(buf));
}
if ( buf != 0 ) {
bktr->bigbuf = buf;
bktr->alloc_pages = BROOKTREE_ALLOC_PAGES;
bzero((caddr_t) bktr->bigbuf, BROOKTREE_ALLOC);
} else {
bktr->alloc_pages = 0;
}
bktr->flags = METEOR_INITALIZED | METEOR_AUTOMODE |
METEOR_DEV0 | METEOR_RGB16;
bktr->dma_prog_loaded = FALSE;
bktr->cols = 640;
bktr->rows = 480;
bktr->frames = 1; /* one frame */
bktr->format = METEOR_GEO_RGB16;
bktr->pixfmt = oformat_meteor_to_bt( bktr->format );
bktr->pixfmt_compat = TRUE;
bktr->vbiinsert = 0;
bktr->vbistart = 0;
bktr->vbisize = 0;
bktr->vbiflags = 0;
/* use the PCI device id and revision id */
/* to determine the card type */
if (PCI_VENDOR(pci_id) == PCI_VENDOR_BROOKTREE)
{
switch (PCI_PRODUCT(pci_id)) {
case PCI_PRODUCT_BROOKTREE_BT848:
if (rev == 0x12)
bktr->id = BROOKTREE_848A;
else
bktr->id = BROOKTREE_848;
break;
case PCI_PRODUCT_BROOKTREE_BT849:
bktr->id = BROOKTREE_849A;
break;
case PCI_PRODUCT_BROOKTREE_BT878:
bktr->id = BROOKTREE_878;
break;
case PCI_PRODUCT_BROOKTREE_BT879:
bktr->id = BROOKTREE_879;
break;
}
}
bktr->clr_on_start = FALSE;
/* defaults for the tuner section of the card */
bktr->tflags = TUNER_INITALIZED;
bktr->tuner.frequency = 0;
bktr->tuner.channel = 0;
bktr->tuner.chnlset = DEFAULT_CHNLSET;
bktr->tuner.afc = 0;
bktr->tuner.radio_mode = 0;
bktr->audio_mux_select = 0;
bktr->audio_mute_state = FALSE;
bktr->bt848_card = -1;
bktr->bt848_tuner = -1;
bktr->reverse_mute = -1;
bktr->slow_msp_audio = 0;
bktr->msp_use_mono_source = 0;
bktr->msp_source_selected = -1;
bktr->audio_mux_present = 1;
#if defined(__FreeBSD__)
#ifdef BKTR_NEW_MSP34XX_DRIVER
/* get the hint on simple programming of the msp34xx, so we know */
/* whether the decision of which thread to start should be overridden */
if ( (err = resource_int_value("bktr", unit, "mspsimple",
&(bktr->mspsimple)) ) != 0 )
bktr->mspsimple = -1; /* fall back to default */
#endif
#endif
probeCard( bktr, TRUE, unit );
/* Initialise any MSP34xx or TDA98xx audio chips */
init_audio_devices( bktr );
#ifdef BKTR_NEW_MSP34XX_DRIVER
/* set up the kernel thread */
err = msp_attach( bktr );
if ( err != 0 ) /* error doing kernel thread stuff, disable msp3400c */
bktr->card.msp3400c = 0;
#endif
}
/* Copy the vbi lines from 'vbidata' into the circular buffer, 'vbibuffer'.
* The circular buffer holds 'n' fixed size data blocks.
* vbisize is the number of bytes in the circular buffer
* vbiread is the point where we read data out of the circular buffer
* vbiinsert is the point where we insert data into the circular buffer
*/
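/*
 * Illustrative example (the sizes are made up, not the driver's values):
 * with VBI_DATA_SIZE = 0x1000 and VBI_BUFFER_SIZE = 0x8000 the buffer
 * holds 8 blocks; after the 8th insert, vbiinsert reaches 0x8000 and the
 * modulo below wraps it back to 0, ready to reuse the start of the buffer
 * once vbi_read() has drained some data (the insert is skipped while the
 * buffer is full).
 */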
static void vbidecode(bktr_ptr_t bktr) {
unsigned char *dest;
unsigned int *seq_dest;
/* Check if there is room in the buffer to insert the data. */
if (bktr->vbisize + VBI_DATA_SIZE > VBI_BUFFER_SIZE) return;
/* Copy the VBI data into the next free slot in the buffer. */
/* 'dest' is the point in vbibuffer where we want to insert new data */
dest = (unsigned char *)bktr->vbibuffer + bktr->vbiinsert;
memcpy(dest, (unsigned char*)bktr->vbidata, VBI_DATA_SIZE);
/* Write the VBI sequence number to the end of the vbi data */
/* This is used by the AleVT teletext program */
seq_dest = (unsigned int *)((unsigned char *)bktr->vbibuffer
+ bktr->vbiinsert
+ (VBI_DATA_SIZE - sizeof(bktr->vbi_sequence_number)));
*seq_dest = bktr->vbi_sequence_number;
/* And increase the VBI sequence number */
/* This can wrap around */
bktr->vbi_sequence_number++;
/* Increment the vbiinsert pointer */
/* This can wrap around */
bktr->vbiinsert += VBI_DATA_SIZE;
bktr->vbiinsert = (bktr->vbiinsert % VBI_BUFFER_SIZE);
/* And increase the amount of vbi data in the buffer */
bktr->vbisize = bktr->vbisize + VBI_DATA_SIZE;
}
/*
* the common interrupt handler.
* Returns 0 or 1 depending on whether the interrupt was handled.
* In the OS specific section, bktr_intr() is defined which calls this
* common interrupt handler.
*/
int
common_bktr_intr( void *arg )
{
bktr_ptr_t bktr;
u_long bktr_status;
u_char dstatus;
u_long field;
u_long w_field;
u_long req_field;
bktr = (bktr_ptr_t) arg;
/*
* check to see if any interrupts are unmasked on this device. If
* none are, then we likely got here by way of being on a PCI shared
* interrupt dispatch list.
*/
if (INL(bktr, BKTR_INT_MASK) == ALL_INTS_DISABLED)
return 0; /* bail out now, before we do something we
shouldn't */
if (!(bktr->flags & METEOR_OPEN)) {
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* return; ?? */
}
/* record and clear the INTerrupt status bits */
bktr_status = INL(bktr, BKTR_INT_STAT);
OUTL(bktr, BKTR_INT_STAT, bktr_status & ~I2C_BITS); /* don't touch i2c */
/* record and clear the device status register */
dstatus = INB(bktr, BKTR_DSTATUS);
OUTB(bktr, BKTR_DSTATUS, 0x00);
#if defined( STATUS_SUM )
/* add any new device status or INTerrupt status bits */
status_sum |= (bktr_status & ~(BT848_INT_RSV0|BT848_INT_RSV1));
status_sum |= ((dstatus & (BT848_DSTATUS_COF|BT848_DSTATUS_LOF)) << 6);
#endif /* STATUS_SUM */
/* printf( "%s: STATUS %x %x %x \n", bktr_name(bktr),
dstatus, bktr_status, INL(bktr, BKTR_RISC_COUNT) );
*/
/* if RISC was disabled, re-start the process */
/* if one of the following errors occurred, re-start as well */
if ( !(bktr_status & BT848_INT_RISC_EN) ||
((bktr_status &(/* BT848_INT_FBUS | */
/* BT848_INT_FTRGT | */
/* BT848_INT_FDSR | */
BT848_INT_PPERR |
BT848_INT_RIPERR | BT848_INT_PABORT |
BT848_INT_OCERR | BT848_INT_SCERR) ) != 0)
|| ((INB(bktr, BKTR_TDEC) == 0) && (bktr_status & TDEC_BITS)) ) {
u_short tdec_save = INB(bktr, BKTR_TDEC);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* Reset temporal decimation counter */
OUTB(bktr, BKTR_TDEC, 0);
OUTB(bktr, BKTR_TDEC, tdec_save);
/* Reset to no-fields captured state */
if (bktr->flags & (METEOR_CONTIN | METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
}
OUTL(bktr, BKTR_RISC_STRT_ADD, vtophys(bktr->dma_prog));
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
return 1;
}
/* If this is not a RISC program interrupt, return */
if (!(bktr_status & BT848_INT_RISCI))
return 0;
/**
printf( "%s: intr status %x %x %x\n", bktr_name(bktr),
bktr_status, dstatus, INL(bktr, BKTR_RISC_COUNT) );
*/
/*
* Disable future interrupts if a capture mode is not selected.
* This can happen when we are in the process of closing or
* changing capture modes, otherwise it shouldn't happen.
*/
if (!(bktr->flags & METEOR_CAP_MASK))
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
/* Determine which field generated this interrupt */
field = ( bktr_status & BT848_INT_FIELD ) ? EVEN_F : ODD_F;
/*
* Process the VBI data if it is being captured. We do this once
* both Odd and Even VBI data is captured. Therefore we do this
* in the Even field interrupt handler.
*/
LOCK_VBI(bktr);
if ( (bktr->vbiflags & VBI_CAPTURE)
&&(bktr->vbiflags & VBI_OPEN)
&&(field==EVEN_F)) {
/* Put VBI data into circular buffer */
vbidecode(bktr);
/* If someone is blocked on reading from /dev/vbi, wake them */
if (bktr->vbi_read_blocked) {
bktr->vbi_read_blocked = FALSE;
wakeup(VBI_SLEEP);
}
/* If someone has a select() on /dev/vbi, inform them */
if (SEL_WAITING(&bktr->vbi_select)) {
selwakeuppri(&bktr->vbi_select, VBIPRI);
}
}
UNLOCK_VBI(bktr);
/*
* Register the completed field
* (For dual-field mode, require fields from the same frame)
*/
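/*
 * w_field   - field(s) we are still waiting for, from the WANT flags
 * req_field - field(s) requested by the current capture mode
 */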
switch ( bktr->flags & METEOR_WANT_MASK ) {
case METEOR_WANT_ODD : w_field = ODD_F ; break;
case METEOR_WANT_EVEN : w_field = EVEN_F ; break;
default : w_field = (ODD_F|EVEN_F); break;
}
switch ( bktr->flags & METEOR_ONLY_FIELDS_MASK ) {
case METEOR_ONLY_ODD_FIELDS : req_field = ODD_F ; break;
case METEOR_ONLY_EVEN_FIELDS : req_field = EVEN_F ; break;
default : req_field = (ODD_F|EVEN_F);
break;
}
if (( field == EVEN_F ) && ( w_field == EVEN_F ))
bktr->flags &= ~METEOR_WANT_EVEN;
else if (( field == ODD_F ) && ( req_field == ODD_F ) &&
( w_field == ODD_F ))
bktr->flags &= ~METEOR_WANT_ODD;
else if (( field == ODD_F ) && ( req_field == (ODD_F|EVEN_F) ) &&
( w_field == (ODD_F|EVEN_F) ))
bktr->flags &= ~METEOR_WANT_ODD;
else if (( field == ODD_F ) && ( req_field == (ODD_F|EVEN_F) ) &&
( w_field == ODD_F )) {
bktr->flags &= ~METEOR_WANT_ODD;
bktr->flags |= METEOR_WANT_EVEN;
}
else {
/* We're out of sync. Start over. */
if (bktr->flags & (METEOR_CONTIN | METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
}
return 1;
}
/*
* If we have a complete frame.
*/
if (!(bktr->flags & METEOR_WANT_MASK)) {
bktr->frames_captured++;
/*
* post the completion time.
*/
if (bktr->flags & METEOR_WANT_TS) {
struct timeval *ts;
if ((u_int) bktr->alloc_pages * PAGE_SIZE
<= (bktr->frame_size + sizeof(struct timeval))) {
ts =(struct timeval *)bktr->bigbuf +
bktr->frame_size;
/* doesn't work in synch mode except
* for first frame */
/* XXX */
microtime(ts);
}
}
/*
* Wake up the user in single capture mode.
*/
if (bktr->flags & METEOR_SINGLE) {
/* stop dma */
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* disable risc, leave fifo running */
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
wakeup(BKTR_SLEEP);
}
/*
* If the user requested to be notified via signal,
* let them know the frame is complete.
*/
if (bktr->proc != NULL) {
PROC_LOCK(bktr->proc);
- psignal( bktr->proc, bktr->signal);
+ kern_psignal( bktr->proc, bktr->signal);
PROC_UNLOCK(bktr->proc);
}
/*
* Reset the want flags if in continuous or
* synchronous capture mode.
*/
/*
* XXX NOTE (Luigi):
* currently we only support 3 capture modes: odd only, even only,
* odd+even interlaced (odd field first). A fourth mode (non interlaced,
* either even OR odd) could provide 60 (50 for PAL) pictures per
* second, but it would require this routine to toggle the desired frame
* each time, and one more different DMA program for the Bt848.
* As a consequence, this fourth mode is currently unsupported.
*/
if (bktr->flags & (METEOR_CONTIN | METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
}
}
return 1;
}
/*
*
*/
extern int bt848_format; /* used to set the default format, PAL or NTSC */
int
video_open( bktr_ptr_t bktr )
{
int frame_rate, video_format=0;
if (bktr->flags & METEOR_OPEN) /* device is busy */
return( EBUSY );
bktr->flags |= METEOR_OPEN;
#ifdef BT848_DUMP
dump_bt848( bktr );
#endif
bktr->clr_on_start = FALSE;
OUTB(bktr, BKTR_DSTATUS, 0x00); /* clear device status reg. */
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
#if defined(BKTR_SYSTEM_DEFAULT) && BKTR_SYSTEM_DEFAULT == BROOKTREE_PAL
video_format = 0;
#else
video_format = 1;
#endif
if (bt848_format == 0 )
video_format = 0;
if (bt848_format == 1 )
video_format = 1;
if (video_format == 1 ) {
OUTB(bktr, BKTR_IFORM, BT848_IFORM_F_NTSCM);
bktr->format_params = BT848_IFORM_F_NTSCM;
} else {
OUTB(bktr, BKTR_IFORM, BT848_IFORM_F_PALBDGHI);
bktr->format_params = BT848_IFORM_F_PALBDGHI;
}
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | format_params[bktr->format_params].iform_xtsel);
/* work around for new Hauppauge 878 cards */
if ((bktr->card.card_id == CARD_HAUPPAUGE) &&
(bktr->id==BROOKTREE_878 || bktr->id==BROOKTREE_879) )
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX3);
else
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX1);
OUTB(bktr, BKTR_ADELAY, format_params[bktr->format_params].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[bktr->format_params].bdelay);
frame_rate = format_params[bktr->format_params].frame_rate;
/* enable PLL mode using a 28 MHz crystal for PAL/SECAM users */
if (bktr->xtal_pll_mode == BT848_USE_PLL) {
OUTB(bktr, BKTR_TGCTRL, 0);
OUTB(bktr, BKTR_PLL_F_LO, 0xf9);
OUTB(bktr, BKTR_PLL_F_HI, 0xdc);
OUTB(bktr, BKTR_PLL_F_XCI, 0x8e);
}
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK) | METEOR_DEV0;
bktr->max_clip_node = 0;
OUTB(bktr, BKTR_COLOR_CTL, BT848_COLOR_CTL_GAMMA | BT848_COLOR_CTL_RGB_DED);
OUTB(bktr, BKTR_E_HSCALE_LO, 170);
OUTB(bktr, BKTR_O_HSCALE_LO, 170);
OUTB(bktr, BKTR_E_DELAY_LO, 0x72);
OUTB(bktr, BKTR_O_DELAY_LO, 0x72);
OUTB(bktr, BKTR_E_SCLOOP, 0);
OUTB(bktr, BKTR_O_SCLOOP, 0);
OUTB(bktr, BKTR_VBI_PACK_SIZE, 0);
OUTB(bktr, BKTR_VBI_PACK_DEL, 0);
bktr->fifo_errors = 0;
bktr->dma_errors = 0;
bktr->frames_captured = 0;
bktr->even_fields_captured = 0;
bktr->odd_fields_captured = 0;
bktr->proc = NULL;
set_fps(bktr, frame_rate);
bktr->video.addr = 0;
bktr->video.width = 0;
bktr->video.banksize = 0;
bktr->video.ramsize = 0;
bktr->pixfmt_compat = TRUE;
bktr->format = METEOR_GEO_RGB16;
bktr->pixfmt = oformat_meteor_to_bt( bktr->format );
bktr->capture_area_enabled = FALSE;
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT); /* if you take this out, Triton-
based motherboards will
operate unreliably */
return( 0 );
}
int
vbi_open( bktr_ptr_t bktr )
{
LOCK_VBI(bktr);
if (bktr->vbiflags & VBI_OPEN) { /* device is busy */
UNLOCK_VBI(bktr);
return( EBUSY );
}
bktr->vbiflags |= VBI_OPEN;
/* reset the VBI circular buffer pointers and clear the buffers */
bktr->vbiinsert = 0;
bktr->vbistart = 0;
bktr->vbisize = 0;
bktr->vbi_sequence_number = 0;
bktr->vbi_read_blocked = FALSE;
bzero((caddr_t) bktr->vbibuffer, VBI_BUFFER_SIZE);
bzero((caddr_t) bktr->vbidata, VBI_DATA_SIZE);
UNLOCK_VBI(bktr);
return( 0 );
}
/*
*
*/
int
tuner_open( bktr_ptr_t bktr )
{
if ( !(bktr->tflags & TUNER_INITALIZED) ) /* device not found */
return( ENXIO );
if ( bktr->tflags & TUNER_OPEN ) /* already open */
return( 0 );
bktr->tflags |= TUNER_OPEN;
bktr->tuner.frequency = 0;
bktr->tuner.channel = 0;
bktr->tuner.chnlset = DEFAULT_CHNLSET;
bktr->tuner.afc = 0;
bktr->tuner.radio_mode = 0;
/* enable drivers on the GPIO port that control the MUXes */
OUTL(bktr, BKTR_GPIO_OUT_EN, INL(bktr, BKTR_GPIO_OUT_EN) | bktr->card.gpio_mux_bits);
/* unmute the audio stream */
set_audio( bktr, AUDIO_UNMUTE );
/* Initialise any audio chips, eg MSP34xx or TDA98xx */
init_audio_devices( bktr );
return( 0 );
}
/*
*
*/
int
video_close( bktr_ptr_t bktr )
{
bktr->flags &= ~(METEOR_OPEN |
METEOR_SINGLE |
METEOR_CAP_MASK |
METEOR_WANT_MASK);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
bktr->dma_prog_loaded = FALSE;
OUTB(bktr, BKTR_TDEC, 0);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/** FIXME: is 0xf magic, wouldn't 0x00 work ??? */
OUTL(bktr, BKTR_SRESET, 0xf);
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
return( 0 );
}
/*
* tuner close handle,
* place holder for tuner specific operations on a close.
*/
int
tuner_close( bktr_ptr_t bktr )
{
bktr->tflags &= ~TUNER_OPEN;
/* mute the audio by switching the mux */
set_audio( bktr, AUDIO_MUTE );
/* disable drivers on the GPIO port that control the MUXes */
OUTL(bktr, BKTR_GPIO_OUT_EN, INL(bktr, BKTR_GPIO_OUT_EN) & ~bktr->card.gpio_mux_bits);
return( 0 );
}
int
vbi_close( bktr_ptr_t bktr )
{
LOCK_VBI(bktr);
bktr->vbiflags &= ~VBI_OPEN;
UNLOCK_VBI(bktr);
return( 0 );
}
/*
*
*/
int
video_read(bktr_ptr_t bktr, int unit, struct cdev *dev, struct uio *uio)
{
int status;
int count;
if (bktr->bigbuf == 0) /* no frame buffer allocated (ioctl failed) */
return( ENOMEM );
if (bktr->flags & METEOR_CAP_MASK)
return( EIO ); /* already capturing */
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
count = bktr->rows * bktr->cols *
pixfmt_table[ bktr->pixfmt ].public.Bpp;
if ((int) uio->uio_iov->iov_len < count)
return( EINVAL );
bktr->flags &= ~(METEOR_CAP_MASK | METEOR_WANT_MASK);
/* capture one frame */
start_capture(bktr, METEOR_SINGLE);
/* wait for capture to complete */
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
status = tsleep(BKTR_SLEEP, BKTRPRI, "captur", 0);
if (!status) /* successful capture */
status = uiomove((caddr_t)bktr->bigbuf, count, uio);
else
printf ("%s: read: tsleep error %d\n",
bktr_name(bktr), status);
bktr->flags &= ~(METEOR_SINGLE | METEOR_WANT_MASK);
return( status );
}
/*
* Read VBI data from the vbi circular buffer
* The buffer holds vbi data blocks which are the same size
* vbiinsert is the position we will insert the next item into the buffer
* vbistart is the actual position in the buffer we want to read from
* vbisize is the exact number of bytes in the buffer left to read
*/
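/*
 * Illustrative wrap-around example (values are made up): with
 * vbistart = 0x7000, readsize = 0x2000 and VBI_BUFFER_SIZE = 0x8000,
 * the first uiomove below copies 0x1000 bytes from the tail of the
 * buffer and the second copies the remaining 0x1000 bytes from the start.
 */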
int
vbi_read(bktr_ptr_t bktr, struct uio *uio, int ioflag)
{
int readsize, readsize2, start;
int status;
/*
* XXX - vbi_read() should be protected against being re-entered
* while it is unlocked for the uiomove.
*/
LOCK_VBI(bktr);
while(bktr->vbisize == 0) {
if (ioflag & FNDELAY) {
status = EWOULDBLOCK;
goto out;
}
bktr->vbi_read_blocked = TRUE;
#ifdef USE_VBIMUTEX
if ((status = msleep(VBI_SLEEP, &bktr->vbimutex, VBIPRI, "vbi",
0))) {
goto out;
}
#else
if ((status = tsleep(VBI_SLEEP, VBIPRI, "vbi", 0))) {
goto out;
}
#endif
}
/* Now we have some data to give to the user */
/* We cannot read more bytes than there are in
* the circular buffer
*/
readsize = (int)uio->uio_iov->iov_len;
if (readsize > bktr->vbisize) readsize = bktr->vbisize;
/* Check if we can read this number of bytes without having
* to wrap around the circular buffer */
if((bktr->vbistart + readsize) >= VBI_BUFFER_SIZE) {
/* We need to wrap around */
readsize2 = VBI_BUFFER_SIZE - bktr->vbistart;
start = bktr->vbistart;
UNLOCK_VBI(bktr);
status = uiomove((caddr_t)bktr->vbibuffer + start, readsize2, uio);
if (status == 0)
status = uiomove((caddr_t)bktr->vbibuffer, (readsize - readsize2), uio);
} else {
UNLOCK_VBI(bktr);
/* We do not need to wrap around */
status = uiomove((caddr_t)bktr->vbibuffer + bktr->vbistart, readsize, uio);
}
LOCK_VBI(bktr);
/* Update the number of bytes left to read */
bktr->vbisize -= readsize;
/* Update vbistart */
bktr->vbistart += readsize;
bktr->vbistart = bktr->vbistart % VBI_BUFFER_SIZE; /* wrap around if needed */
out:
UNLOCK_VBI(bktr);
return( status );
}
/*
* video ioctls
*/
int
video_ioctl( bktr_ptr_t bktr, int unit, ioctl_cmd_t cmd, caddr_t arg, struct thread* td )
{
volatile u_char c_temp;
unsigned int temp;
unsigned int temp_iform;
unsigned int error;
struct meteor_geomet *geo;
struct meteor_counts *counts;
struct meteor_video *video;
struct bktr_capture_area *cap_area;
vm_offset_t buf;
int i;
int sig;
char char_temp;
switch ( cmd ) {
case BT848SCLIP: /* set clip region */
bktr->max_clip_node = 0;
memcpy(&bktr->clip_list, arg, sizeof(bktr->clip_list));
for (i = 0; i < BT848_MAX_CLIP_NODE; i++) {
if (bktr->clip_list[i].y_min == 0 &&
bktr->clip_list[i].y_max == 0)
break;
}
bktr->max_clip_node = i;
/* make sure that the list contains a valid clip sequence */
/* the clip rectangles should be sorted by x, then by y as the
second order sort key */
/* the clip rectangle list is terminated by y_min and y_max set to 0 */
/* to disable clipping, set y_min and y_max to 0 in the first
clip rectangle. The first clip rectangle is clip_list[0].
*/
if (bktr->max_clip_node == 0 &&
(bktr->clip_list[0].y_min != 0 &&
bktr->clip_list[0].y_max != 0)) {
return EINVAL;
}
for (i = 0; i < BT848_MAX_CLIP_NODE - 1 ; i++) {
if (bktr->clip_list[i].y_min == 0 &&
bktr->clip_list[i].y_max == 0) {
break;
}
if ( bktr->clip_list[i+1].y_min != 0 &&
bktr->clip_list[i+1].y_max != 0 &&
bktr->clip_list[i].x_min > bktr->clip_list[i+1].x_min ) {
bktr->max_clip_node = 0;
return (EINVAL);
}
if (bktr->clip_list[i].x_min >= bktr->clip_list[i].x_max ||
bktr->clip_list[i].y_min >= bktr->clip_list[i].y_max ||
bktr->clip_list[i].x_min < 0 ||
bktr->clip_list[i].x_max < 0 ||
bktr->clip_list[i].y_min < 0 ||
bktr->clip_list[i].y_max < 0 ) {
bktr->max_clip_node = 0;
return (EINVAL);
}
}
bktr->dma_prog_loaded = FALSE;
break;
case METEORSTATUS: /* get Bt848 status */
c_temp = INB(bktr, BKTR_DSTATUS);
temp = 0;
if (!(c_temp & 0x40)) temp |= METEOR_STATUS_HCLK;
if (!(c_temp & 0x10)) temp |= METEOR_STATUS_FIDT;
*(u_short *)arg = temp;
break;
case BT848SFMT: /* set input format */
temp = *(unsigned long*)arg & BT848_IFORM_FORMAT;
temp_iform = INB(bktr, BKTR_IFORM);
temp_iform &= ~BT848_IFORM_FORMAT;
temp_iform &= ~BT848_IFORM_XTSEL;
OUTB(bktr, BKTR_IFORM, (temp_iform | temp | format_params[temp].iform_xtsel));
switch( temp ) {
case BT848_IFORM_F_AUTO:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_AUTOMODE;
break;
case BT848_IFORM_F_NTSCM:
case BT848_IFORM_F_NTSCJ:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_NTSC;
OUTB(bktr, BKTR_ADELAY, format_params[temp].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[temp].bdelay);
bktr->format_params = temp;
break;
case BT848_IFORM_F_PALBDGHI:
case BT848_IFORM_F_PALN:
case BT848_IFORM_F_SECAM:
case BT848_IFORM_F_RSVD:
case BT848_IFORM_F_PALM:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_PAL;
OUTB(bktr, BKTR_ADELAY, format_params[temp].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[temp].bdelay);
bktr->format_params = temp;
break;
}
bktr->dma_prog_loaded = FALSE;
break;
case METEORSFMT: /* set input format */
temp_iform = INB(bktr, BKTR_IFORM);
temp_iform &= ~BT848_IFORM_FORMAT;
temp_iform &= ~BT848_IFORM_XTSEL;
switch(*(unsigned long *)arg & METEOR_FORM_MASK ) {
case 0: /* default */
case METEOR_FMT_NTSC:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_NTSC;
OUTB(bktr, BKTR_IFORM, temp_iform | BT848_IFORM_F_NTSCM |
format_params[BT848_IFORM_F_NTSCM].iform_xtsel);
OUTB(bktr, BKTR_ADELAY, format_params[BT848_IFORM_F_NTSCM].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[BT848_IFORM_F_NTSCM].bdelay);
bktr->format_params = BT848_IFORM_F_NTSCM;
break;
case METEOR_FMT_PAL:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_PAL;
OUTB(bktr, BKTR_IFORM, temp_iform | BT848_IFORM_F_PALBDGHI |
format_params[BT848_IFORM_F_PALBDGHI].iform_xtsel);
OUTB(bktr, BKTR_ADELAY, format_params[BT848_IFORM_F_PALBDGHI].adelay);
OUTB(bktr, BKTR_BDELAY, format_params[BT848_IFORM_F_PALBDGHI].bdelay);
bktr->format_params = BT848_IFORM_F_PALBDGHI;
break;
case METEOR_FMT_AUTOMODE:
bktr->flags = (bktr->flags & ~METEOR_FORM_MASK) |
METEOR_AUTOMODE;
OUTB(bktr, BKTR_IFORM, temp_iform | BT848_IFORM_F_AUTO |
format_params[BT848_IFORM_F_AUTO].iform_xtsel);
break;
default:
return( EINVAL );
}
bktr->dma_prog_loaded = FALSE;
break;
case METEORGFMT: /* get input format */
*(u_long *)arg = bktr->flags & METEOR_FORM_MASK;
break;
case BT848GFMT: /* get input format */
*(u_long *)arg = INB(bktr, BKTR_IFORM) & BT848_IFORM_FORMAT;
break;
case METEORSCOUNT: /* (re)set error counts */
counts = (struct meteor_counts *) arg;
bktr->fifo_errors = counts->fifo_errors;
bktr->dma_errors = counts->dma_errors;
bktr->frames_captured = counts->frames_captured;
bktr->even_fields_captured = counts->even_fields_captured;
bktr->odd_fields_captured = counts->odd_fields_captured;
break;
case METEORGCOUNT: /* get error counts */
counts = (struct meteor_counts *) arg;
counts->fifo_errors = bktr->fifo_errors;
counts->dma_errors = bktr->dma_errors;
counts->frames_captured = bktr->frames_captured;
counts->even_fields_captured = bktr->even_fields_captured;
counts->odd_fields_captured = bktr->odd_fields_captured;
break;
case METEORGVIDEO:
video = (struct meteor_video *)arg;
video->addr = bktr->video.addr;
video->width = bktr->video.width;
video->banksize = bktr->video.banksize;
video->ramsize = bktr->video.ramsize;
break;
case METEORSVIDEO:
video = (struct meteor_video *)arg;
bktr->video.addr = video->addr;
bktr->video.width = video->width;
bktr->video.banksize = video->banksize;
bktr->video.ramsize = video->ramsize;
break;
case METEORSFPS:
set_fps(bktr, *(u_short *)arg);
break;
case METEORGFPS:
*(u_short *)arg = bktr->fps;
break;
case METEORSHUE: /* set hue */
OUTB(bktr, BKTR_HUE, (*(u_char *) arg) & 0xff);
break;
case METEORGHUE: /* get hue */
*(u_char *)arg = INB(bktr, BKTR_HUE);
break;
case METEORSBRIG: /* set brightness */
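/* Meteor brightness is 0..255; the Bt848 BRIGHT register takes a
 * two's complement value, so re-centre the range around zero. */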
char_temp = ( *(u_char *)arg & 0xff) - 128;
OUTB(bktr, BKTR_BRIGHT, char_temp);
break;
case METEORGBRIG: /* get brightness */
*(u_char *)arg = INB(bktr, BKTR_BRIGHT);
break;
case METEORSCSAT: /* set chroma saturation */
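/* The 8-bit Meteor saturation value maps onto the Bt848's 9-bit
 * registers: it is shifted left one bit into SAT_U_LO/SAT_V_LO and
 * bit 7 of the argument supplies the MSB kept in E/O_CONTROL. */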
temp = (int)*(u_char *)arg;
OUTB(bktr, BKTR_SAT_U_LO, (temp << 1) & 0xff);
OUTB(bktr, BKTR_SAT_V_LO, (temp << 1) & 0xff);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL)
& ~(BT848_E_CONTROL_SAT_U_MSB
| BT848_E_CONTROL_SAT_V_MSB));
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL)
& ~(BT848_O_CONTROL_SAT_U_MSB |
BT848_O_CONTROL_SAT_V_MSB));
if ( temp & BIT_SEVEN_HIGH ) {
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL)
| (BT848_E_CONTROL_SAT_U_MSB
| BT848_E_CONTROL_SAT_V_MSB));
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL)
| (BT848_O_CONTROL_SAT_U_MSB
| BT848_O_CONTROL_SAT_V_MSB));
}
break;
case METEORGCSAT: /* get chroma saturation */
temp = (INB(bktr, BKTR_SAT_V_LO) >> 1) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_V_MSB )
temp |= BIT_SEVEN_HIGH;
*(u_char *)arg = (u_char)temp;
break;
case METEORSCONT: /* set contrast */
temp = (int)*(u_char *)arg & 0xff;
temp <<= 1;
OUTB(bktr, BKTR_CONTRAST_LO, temp & 0xff);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_CON_MSB);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_CON_MSB);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) |
(((temp & 0x100) >> 6 ) & BT848_E_CONTROL_CON_MSB));
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) |
(((temp & 0x100) >> 6 ) & BT848_O_CONTROL_CON_MSB));
break;
case METEORGCONT: /* get contrast */
temp = (int)INB(bktr, BKTR_CONTRAST_LO) & 0xff;
temp |= ((int)INB(bktr, BKTR_O_CONTROL) & 0x04) << 6;
*(u_char *)arg = (u_char)((temp >> 1) & 0xff);
break;
case BT848SCBUF: /* set Clear-Buffer-on-start flag */
bktr->clr_on_start = (*(int *)arg != 0);
break;
case BT848GCBUF: /* get Clear-Buffer-on-start flag */
*(int *)arg = (int) bktr->clr_on_start;
break;
case METEORSSIGNAL:
sig = *(int *)arg;
/* Historically, applications used METEOR_SIG_MODE_MASK
* to reset signal delivery.
*/
if (sig == METEOR_SIG_MODE_MASK)
sig = 0;
if (sig < 0 || sig > _SIG_MAXSIG)
return (EINVAL);
bktr->signal = sig;
bktr->proc = sig ? td->td_proc : NULL;
break;
case METEORGSIGNAL:
*(int *)arg = bktr->signal;
break;
case METEORCAPTUR:
temp = bktr->flags;
switch (*(int *) arg) {
case METEOR_CAP_SINGLE:
if (bktr->bigbuf==0) /* no frame buffer allocated */
return( ENOMEM );
/* already capturing */
if (temp & METEOR_CAP_MASK)
return( EIO );
start_capture(bktr, METEOR_SINGLE);
/* wait for capture to complete */
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
error = tsleep(BKTR_SLEEP, BKTRPRI, "captur", hz);
if (error && (error != ERESTART)) {
/* Here if we didn't get a complete frame */
#ifdef DIAGNOSTIC
printf( "%s: ioctl: tsleep error %d %x\n",
bktr_name(bktr), error,
INL(bktr, BKTR_RISC_COUNT));
#endif
/* stop dma */
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* disable risc, leave fifo running */
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
}
bktr->flags &= ~(METEOR_SINGLE|METEOR_WANT_MASK);
/* FIXME: should we set bt848->int_stat ??? */
break;
case METEOR_CAP_CONTINOUS:
if (bktr->bigbuf==0) /* no frame buffer allocated */
return( ENOMEM );
/* already capturing */
if (temp & METEOR_CAP_MASK)
return( EIO );
start_capture(bktr, METEOR_CONTIN);
/* Clear the interrupt status register */
OUTL(bktr, BKTR_INT_STAT, INL(bktr, BKTR_INT_STAT));
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTB(bktr, BKTR_CAP_CTL, bktr->bktr_cap_ctl);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_RISCI |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
#ifdef BT848_DUMP
dump_bt848( bktr );
#endif
break;
case METEOR_CAP_STOP_CONT:
if (bktr->flags & METEOR_CONTIN) {
/* turn off capture */
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTB(bktr, BKTR_CAP_CTL, CAPTURE_OFF);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
bktr->flags &=
~(METEOR_CONTIN | METEOR_WANT_MASK);
}
}
break;
case METEORSETGEO:
/* can't change parameters while capturing */
if (bktr->flags & METEOR_CAP_MASK)
return( EBUSY );
geo = (struct meteor_geomet *) arg;
error = 0;
/* Either even or odd; to capture both even & odd fields, both flags are zero */
if ((geo->oformat & METEOR_GEO_ODD_ONLY) &&
(geo->oformat & METEOR_GEO_EVEN_ONLY)) {
printf( "%s: ioctl: Geometry odd or even only.\n",
bktr_name(bktr));
return( EINVAL );
}
/* set/clear even/odd flags */
if (geo->oformat & METEOR_GEO_ODD_ONLY)
bktr->flags |= METEOR_ONLY_ODD_FIELDS;
else
bktr->flags &= ~METEOR_ONLY_ODD_FIELDS;
if (geo->oformat & METEOR_GEO_EVEN_ONLY)
bktr->flags |= METEOR_ONLY_EVEN_FIELDS;
else
bktr->flags &= ~METEOR_ONLY_EVEN_FIELDS;
if (geo->columns <= 0) {
printf(
"%s: ioctl: %d: columns must be greater than zero.\n",
bktr_name(bktr), geo->columns);
error = EINVAL;
}
else if ((geo->columns & 0x3fe) != geo->columns) {
printf(
"%s: ioctl: %d: columns too large or not even.\n",
bktr_name(bktr), geo->columns);
error = EINVAL;
}
if (geo->rows <= 0) {
printf(
"%s: ioctl: %d: rows must be greater than zero.\n",
bktr_name(bktr), geo->rows);
error = EINVAL;
}
else if (((geo->rows & 0x7fe) != geo->rows) ||
((geo->oformat & METEOR_GEO_FIELD_MASK) &&
((geo->rows & 0x3fe) != geo->rows)) ) {
printf(
"%s: ioctl: %d: rows too large or not even.\n",
bktr_name(bktr), geo->rows);
error = EINVAL;
}
if (geo->frames > 32) {
printf("%s: ioctl: too many frames.\n",
bktr_name(bktr));
error = EINVAL;
}
if (error)
return( error );
bktr->dma_prog_loaded = FALSE;
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
if ((temp=(geo->rows * geo->columns * geo->frames * 2))) {
if (geo->oformat & METEOR_GEO_RGB24) temp = temp * 2;
/* meteor_mem structure for SYNC Capture */
if (geo->frames > 1) temp += PAGE_SIZE;
temp = btoc(temp);
if ((int) temp > bktr->alloc_pages
&& bktr->video.addr == 0) {
/*****************************/
/* *** OS Dependent code *** */
/*****************************/
#if defined(__NetBSD__) || defined(__OpenBSD__)
bus_dmamap_t dmamap;
buf = get_bktr_mem(bktr, &dmamap,
temp * PAGE_SIZE);
if (buf != 0) {
free_bktr_mem(bktr, bktr->dm_mem,
bktr->bigbuf);
bktr->dm_mem = dmamap;
#else
buf = get_bktr_mem(unit, temp*PAGE_SIZE);
if (buf != 0) {
kmem_free(kernel_map, bktr->bigbuf,
(bktr->alloc_pages * PAGE_SIZE));
#endif
bktr->bigbuf = buf;
bktr->alloc_pages = temp;
if (bootverbose)
printf("%s: ioctl: Allocating %d bytes\n",
bktr_name(bktr), (int)(temp*PAGE_SIZE));
}
else
error = ENOMEM;
}
}
if (error)
return error;
bktr->rows = geo->rows;
bktr->cols = geo->columns;
bktr->frames = geo->frames;
/* Pixel format (if in meteor pixfmt compatibility mode) */
if ( bktr->pixfmt_compat ) {
bktr->format = METEOR_GEO_YUV_422;
switch (geo->oformat & METEOR_GEO_OUTPUT_MASK) {
case 0: /* default */
case METEOR_GEO_RGB16:
bktr->format = METEOR_GEO_RGB16;
break;
case METEOR_GEO_RGB24:
bktr->format = METEOR_GEO_RGB24;
break;
case METEOR_GEO_YUV_422:
bktr->format = METEOR_GEO_YUV_422;
if (geo->oformat & METEOR_GEO_YUV_12)
bktr->format = METEOR_GEO_YUV_12;
break;
case METEOR_GEO_YUV_PACKED:
bktr->format = METEOR_GEO_YUV_PACKED;
break;
}
bktr->pixfmt = oformat_meteor_to_bt( bktr->format );
}
if (bktr->flags & METEOR_CAP_MASK) {
if (bktr->flags & (METEOR_CONTIN|METEOR_SYNCAP)) {
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
break;
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
break;
}
start_capture(bktr, METEOR_CONTIN);
OUTL(bktr, BKTR_INT_STAT, INL(bktr, BKTR_INT_STAT));
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_ENABLED);
OUTW(bktr, BKTR_GPIO_DMA_CTL, bktr->capcontrol);
OUTL(bktr, BKTR_INT_MASK, BT848_INT_MYSTERYBIT |
BT848_INT_VSYNC |
BT848_INT_FMTCHG);
}
}
break;
/* end of METEORSETGEO */
/* FIXME. The Capture Area currently has the following restrictions:
GENERAL
y_offset may need to be even in interlaced modes
RGB24 - Interlaced mode
x_size must be greater than or equal to 1.666*METEORSETGEO width (cols)
y_size must be greater than or equal to METEORSETGEO height (rows)
RGB24 - Even Only (or Odd Only) mode
x_size must be greater than or equal to 1.666*METEORSETGEO width (cols)
y_size must be greater than or equal to 2*METEORSETGEO height (rows)
YUV12 - Interlaced mode
x_size must be greater than or equal to METEORSETGEO width (cols)
y_size must be greater than or equal to METEORSETGEO height (rows)
YUV12 - Even Only (or Odd Only) mode
x_size must be greater than or equal to METEORSETGEO width (cols)
y_size must be greater than or equal to 2*METEORSETGEO height (rows)
*/
case BT848_SCAPAREA: /* set capture area of each video frame */
/* can't change parameters while capturing */
if (bktr->flags & METEOR_CAP_MASK)
return( EBUSY );
cap_area = (struct bktr_capture_area *) arg;
bktr->capture_area_x_offset = cap_area->x_offset;
bktr->capture_area_y_offset = cap_area->y_offset;
bktr->capture_area_x_size = cap_area->x_size;
bktr->capture_area_y_size = cap_area->y_size;
bktr->capture_area_enabled = TRUE;
bktr->dma_prog_loaded = FALSE;
break;
case BT848_GCAPAREA: /* get capture area of each video frame */
cap_area = (struct bktr_capture_area *) arg;
if (bktr->capture_area_enabled == FALSE) {
cap_area->x_offset = 0;
cap_area->y_offset = 0;
cap_area->x_size = format_params[
bktr->format_params].scaled_hactive;
cap_area->y_size = format_params[
bktr->format_params].vactive;
} else {
cap_area->x_offset = bktr->capture_area_x_offset;
cap_area->y_offset = bktr->capture_area_y_offset;
cap_area->x_size = bktr->capture_area_x_size;
cap_area->y_size = bktr->capture_area_y_size;
}
break;
default:
return common_ioctl( bktr, cmd, arg );
}
return( 0 );
}
/*
* tuner ioctls
*/
int
tuner_ioctl( bktr_ptr_t bktr, int unit, ioctl_cmd_t cmd, caddr_t arg, struct thread* td )
{
int tmp_int;
unsigned int temp, temp1;
int offset;
int count;
u_char *buf;
u_long par;
u_char write;
int i2c_addr;
int i2c_port;
u_long data;
switch ( cmd ) {
case REMOTE_GETKEY:
/* Read the last key pressed by the Remote Control */
if (bktr->remote_control == 0) return (EINVAL);
remote_read(bktr, (struct bktr_remote *)arg);
break;
#if defined( TUNER_AFC )
case TVTUNER_SETAFC:
bktr->tuner.afc = (*(int *)arg != 0);
break;
case TVTUNER_GETAFC:
*(int *)arg = bktr->tuner.afc;
/* XXX Perhaps use another bit to indicate AFC success? */
break;
#endif /* TUNER_AFC */
case TVTUNER_SETCHNL:
temp_mute( bktr, TRUE );
temp = tv_channel( bktr, (int)*(unsigned long *)arg );
if ( temp < 0 ) {
temp_mute( bktr, FALSE );
return( EINVAL );
}
*(unsigned long *)arg = temp;
/* after every channel change, we must restart the MSP34xx */
/* audio chip to reselect NICAM STEREO or MONO audio */
if ( bktr->card.msp3400c )
msp_autodetect( bktr );
/* after every channel change, we must restart the DPL35xx */
if ( bktr->card.dpl3518a )
dpl_autodetect( bktr );
temp_mute( bktr, FALSE );
break;
case TVTUNER_GETCHNL:
*(unsigned long *)arg = bktr->tuner.channel;
break;
case TVTUNER_SETTYPE:
temp = *(unsigned long *)arg;
if ( (temp < CHNLSET_MIN) || (temp > CHNLSET_MAX) )
return( EINVAL );
bktr->tuner.chnlset = temp;
break;
case TVTUNER_GETTYPE:
*(unsigned long *)arg = bktr->tuner.chnlset;
break;
case TVTUNER_GETSTATUS:
temp = get_tuner_status( bktr );
*(unsigned long *)arg = temp & 0xff;
break;
case TVTUNER_SETFREQ:
temp_mute( bktr, TRUE );
temp = tv_freq( bktr, (int)*(unsigned long *)arg, TV_FREQUENCY);
temp_mute( bktr, FALSE );
if ( temp < 0 ) {
temp_mute( bktr, FALSE );
return( EINVAL );
}
*(unsigned long *)arg = temp;
/* after every channel change, we must restart the MSP34xx */
/* audio chip to reselect NICAM STEREO or MONO audio */
if ( bktr->card.msp3400c )
msp_autodetect( bktr );
/* after every channel change, we must restart the DPL35xx */
if ( bktr->card.dpl3518a )
dpl_autodetect( bktr );
temp_mute( bktr, FALSE );
break;
case TVTUNER_GETFREQ:
*(unsigned long *)arg = bktr->tuner.frequency;
break;
case TVTUNER_GETCHNLSET:
return tuner_getchnlset((struct bktr_chnlset *)arg);
case BT848_SAUDIO: /* set audio channel */
if ( set_audio( bktr, *(int*)arg ) < 0 )
return( EIO );
break;
/* hue is a 2's complement number, -90' to +89.3' in 0.7' steps */
case BT848_SHUE: /* set hue */
OUTB(bktr, BKTR_HUE, (u_char)(*(int*)arg & 0xff));
break;
case BT848_GHUE: /* get hue */
*(int*)arg = (signed char)(INB(bktr, BKTR_HUE) & 0xff);
break;
/* brightness is a 2's complement #, -50% to +49.6% in 0.39% steps */
case BT848_SBRIG: /* set brightness */
OUTB(bktr, BKTR_BRIGHT, (u_char)(*(int *)arg & 0xff));
break;
case BT848_GBRIG: /* get brightness */
*(int *)arg = (signed char)(INB(bktr, BKTR_BRIGHT) & 0xff);
break;
/* */
case BT848_SCSAT: /* set chroma saturation */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH ) {
temp |= (BT848_E_CONTROL_SAT_U_MSB |
BT848_E_CONTROL_SAT_V_MSB);
temp1 |= (BT848_O_CONTROL_SAT_U_MSB |
BT848_O_CONTROL_SAT_V_MSB);
}
else {
temp &= ~(BT848_E_CONTROL_SAT_U_MSB |
BT848_E_CONTROL_SAT_V_MSB);
temp1 &= ~(BT848_O_CONTROL_SAT_U_MSB |
BT848_O_CONTROL_SAT_V_MSB);
}
OUTB(bktr, BKTR_SAT_U_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_SAT_V_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GCSAT: /* get chroma saturation */
tmp_int = (int)(INB(bktr, BKTR_SAT_V_LO) & 0xff);
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_V_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* */
case BT848_SVSAT: /* set chroma V saturation */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH) {
temp |= BT848_E_CONTROL_SAT_V_MSB;
temp1 |= BT848_O_CONTROL_SAT_V_MSB;
}
else {
temp &= ~BT848_E_CONTROL_SAT_V_MSB;
temp1 &= ~BT848_O_CONTROL_SAT_V_MSB;
}
OUTB(bktr, BKTR_SAT_V_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GVSAT: /* get chroma V saturation */
tmp_int = (int)INB(bktr, BKTR_SAT_V_LO) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_V_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* */
case BT848_SUSAT: /* set chroma U saturation */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH ) {
temp |= BT848_E_CONTROL_SAT_U_MSB;
temp1 |= BT848_O_CONTROL_SAT_U_MSB;
}
else {
temp &= ~BT848_E_CONTROL_SAT_U_MSB;
temp1 &= ~BT848_O_CONTROL_SAT_U_MSB;
}
OUTB(bktr, BKTR_SAT_U_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GUSAT: /* get chroma U saturation */
tmp_int = (int)INB(bktr, BKTR_SAT_U_LO) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_SAT_U_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* lr 970528 luma notch etc - 3 high bits of e_control/o_control */
case BT848_SLNOTCH: /* set luma notch */
tmp_int = (*(int *)arg & 0x7) << 5 ;
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~0xe0);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~0xe0);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) | tmp_int);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) | tmp_int);
break;
case BT848_GLNOTCH: /* get luma notch */
*(int *)arg = (int) ( (INB(bktr, BKTR_E_CONTROL) & 0xe0) >> 5) ;
break;
/* */
case BT848_SCONT: /* set contrast */
tmp_int = *(int*)arg;
temp = INB(bktr, BKTR_E_CONTROL);
temp1 = INB(bktr, BKTR_O_CONTROL);
if ( tmp_int & BIT_EIGHT_HIGH ) {
temp |= BT848_E_CONTROL_CON_MSB;
temp1 |= BT848_O_CONTROL_CON_MSB;
}
else {
temp &= ~BT848_E_CONTROL_CON_MSB;
temp1 &= ~BT848_O_CONTROL_CON_MSB;
}
OUTB(bktr, BKTR_CONTRAST_LO, (u_char)(tmp_int & 0xff));
OUTB(bktr, BKTR_E_CONTROL, temp);
OUTB(bktr, BKTR_O_CONTROL, temp1);
break;
case BT848_GCONT: /* get contrast */
tmp_int = (int)INB(bktr, BKTR_CONTRAST_LO) & 0xff;
if ( INB(bktr, BKTR_E_CONTROL) & BT848_E_CONTROL_CON_MSB )
tmp_int |= BIT_EIGHT_HIGH;
*(int*)arg = tmp_int;
break;
/* FIXME: SCBARS and CCBARS require a valid int * */
/* argument to succeed, but it's not used; consider */
/* using the arg to store the on/off state so */
/* there's only one ioctl() needed to turn cbars on/off */
case BT848_SCBARS: /* set colorbar output */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_COLOR_BARS);
break;
case BT848_CCBARS: /* clear colorbar output */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) & ~(BT848_COLOR_CTL_COLOR_BARS));
break;
case BT848_GAUDIO: /* get audio channel */
temp = bktr->audio_mux_select;
if ( bktr->audio_mute_state == TRUE )
temp |= AUDIO_MUTE;
*(int*)arg = temp;
break;
case BT848_SBTSC: /* set audio channel */
if ( set_BTSC( bktr, *(int*)arg ) < 0 )
return( EIO );
break;
case BT848_WEEPROM: /* write eeprom */
offset = (((struct eeProm *)arg)->offset);
count = (((struct eeProm *)arg)->count);
buf = &(((struct eeProm *)arg)->bytes[ 0 ]);
if ( writeEEProm( bktr, offset, count, buf ) < 0 )
return( EIO );
break;
case BT848_REEPROM: /* read eeprom */
offset = (((struct eeProm *)arg)->offset);
count = (((struct eeProm *)arg)->count);
buf = &(((struct eeProm *)arg)->bytes[ 0 ]);
if ( readEEProm( bktr, offset, count, buf ) < 0 )
return( EIO );
break;
case BT848_SIGNATURE:
offset = (((struct eeProm *)arg)->offset);
count = (((struct eeProm *)arg)->count);
buf = &(((struct eeProm *)arg)->bytes[ 0 ]);
if ( signCard( bktr, offset, count, buf ) < 0 )
return( EIO );
break;
/* Ioctl's for direct gpio access */
#ifdef BKTR_GPIO_ACCESS
case BT848_GPIO_GET_EN:
*(int*)arg = INL(bktr, BKTR_GPIO_OUT_EN);
break;
case BT848_GPIO_SET_EN:
OUTL(bktr, BKTR_GPIO_OUT_EN, *(int*)arg);
break;
case BT848_GPIO_GET_DATA:
*(int*)arg = INL(bktr, BKTR_GPIO_DATA);
break;
case BT848_GPIO_SET_DATA:
OUTL(bktr, BKTR_GPIO_DATA, *(int*)arg);
break;
#endif /* BKTR_GPIO_ACCESS */
/* Ioctl's for running the tuner device in radio mode */
case RADIO_GETMODE:
*(unsigned char *)arg = bktr->tuner.radio_mode;
break;
case RADIO_SETMODE:
bktr->tuner.radio_mode = *(unsigned char *)arg;
break;
case RADIO_GETFREQ:
*(unsigned long *)arg = bktr->tuner.frequency;
break;
case RADIO_SETFREQ:
/* The argument to this ioctl is NOT freq*16. It is
** freq*100.
*/
temp=(int)*(unsigned long *)arg;
#ifdef BKTR_RADIO_DEBUG
printf("%s: arg=%d temp=%d\n", bktr_name(bktr),
(int)*(unsigned long *)arg, temp);
#endif
#ifndef BKTR_RADIO_NOFREQCHECK
/* According to the spec sheet, the band 87.5 MHz-108 MHz */
/* is supported. */
if(temp<8750 || temp>10800) {
printf("%s: Radio frequency out of range\n", bktr_name(bktr));
return(EINVAL);
}
#endif
temp_mute( bktr, TRUE );
temp = tv_freq( bktr, temp, FM_RADIO_FREQUENCY );
temp_mute( bktr, FALSE );
#ifdef BKTR_RADIO_DEBUG
if(temp)
printf("%s: tv_freq returned: %d\n", bktr_name(bktr), temp);
#endif
if ( temp < 0 )
return( EINVAL );
*(unsigned long *)arg = temp;
break;
/* Luigi's I2CWR ioctl */
case BT848_I2CWR:
par = *(u_long *)arg;
write = (par >> 24) & 0xff ;
i2c_addr = (par >> 16) & 0xff ;
i2c_port = (par >> 8) & 0xff ;
data = (par) & 0xff ;
if (write) {
i2cWrite( bktr, i2c_addr, i2c_port, data);
} else {
data = i2cRead( bktr, i2c_addr);
}
*(u_long *)arg = (par & 0xffffff00) | ( data & 0xff );
break;
#ifdef BT848_MSP_READ
/* I2C ioctls to allow userland access to the MSP chip */
case BT848_MSP_READ:
{
struct bktr_msp_control *msp;
msp = (struct bktr_msp_control *) arg;
msp->data = msp_dpl_read(bktr, bktr->msp_addr,
msp->function, msp->address);
break;
}
case BT848_MSP_WRITE:
{
struct bktr_msp_control *msp;
msp = (struct bktr_msp_control *) arg;
msp_dpl_write(bktr, bktr->msp_addr, msp->function,
msp->address, msp->data );
break;
}
case BT848_MSP_RESET:
msp_dpl_reset(bktr, bktr->msp_addr);
break;
#endif
default:
return common_ioctl( bktr, cmd, arg );
}
return( 0 );
}
/*
* common ioctls
*/
static int
common_ioctl( bktr_ptr_t bktr, ioctl_cmd_t cmd, caddr_t arg )
{
int pixfmt;
unsigned int temp;
struct meteor_pixfmt *pf_pub;
switch (cmd) {
case METEORSINPUT: /* set input device */
/*Bt848 has 3 MUX Inputs. Bt848A/849A/878/879 has 4 MUX Inputs*/
/* On the original bt848 boards, */
/* Tuner is MUX0, RCA is MUX1, S-Video is MUX2 */
/* On the Hauppauge bt878 boards, */
/* Tuner is MUX0, RCA is MUX3 */
/* Unfortunately the Meteor driver codes DEV_RCA as DEV_0, so we */
/* stick with this scheme in our Meteor emulation */
switch(*(unsigned long *)arg & METEOR_DEV_MASK) {
/* this is the RCA video input */
case 0: /* default */
case METEOR_INPUT_DEV0:
/* METEOR_INPUT_DEV_RCA: */
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV0;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM)
& ~BT848_IFORM_MUXSEL);
/* work around for new Hauppauge 878 cards */
if ((bktr->card.card_id == CARD_HAUPPAUGE) &&
(bktr->id==BROOKTREE_878 ||
bktr->id==BROOKTREE_879) )
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX3);
else
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX1);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
/* this is the tuner input */
case METEOR_INPUT_DEV1:
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV1;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX0);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_TUNER );
break;
/* this is the S-VHS input, but with a composite camera */
case METEOR_INPUT_DEV2:
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV2;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX2);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
/* this is the S-VHS input */
case METEOR_INPUT_DEV_SVIDEO:
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV_SVIDEO;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX2);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) | BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) | BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
case METEOR_INPUT_DEV3:
if ((bktr->id == BROOKTREE_848A) ||
(bktr->id == BROOKTREE_849A) ||
(bktr->id == BROOKTREE_878) ||
(bktr->id == BROOKTREE_879) ) {
bktr->flags = (bktr->flags & ~METEOR_DEV_MASK)
| METEOR_DEV3;
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) & ~BT848_IFORM_MUXSEL);
/* work around for new Hauppauge 878 cards */
if ((bktr->card.card_id == CARD_HAUPPAUGE) &&
(bktr->id==BROOKTREE_878 ||
bktr->id==BROOKTREE_879) )
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX1);
else
OUTB(bktr, BKTR_IFORM, INB(bktr, BKTR_IFORM) | BT848_IFORM_M_MUX3);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) & ~BT848_E_CONTROL_COMP);
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) & ~BT848_O_CONTROL_COMP);
set_audio( bktr, AUDIO_EXTERN );
break;
}
default:
return( EINVAL );
}
break;
case METEORGINPUT: /* get input device */
*(u_long *)arg = bktr->flags & METEOR_DEV_MASK;
break;
case METEORSACTPIXFMT:
if (( *(int *)arg < 0 ) ||
( *(int *)arg >= PIXFMT_TABLE_SIZE ))
return( EINVAL );
bktr->pixfmt = *(int *)arg;
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
bktr->pixfmt_compat = FALSE;
break;
case METEORGACTPIXFMT:
*(int *)arg = bktr->pixfmt;
break;
case METEORGSUPPIXFMT :
pf_pub = (struct meteor_pixfmt *)arg;
pixfmt = pf_pub->index;
if (( pixfmt < 0 ) || ( pixfmt >= PIXFMT_TABLE_SIZE ))
return( EINVAL );
memcpy( pf_pub, &pixfmt_table[ pixfmt ].public,
sizeof( *pf_pub ) );
/* Patch in our format index */
pf_pub->index = pixfmt;
break;
#if defined( STATUS_SUM )
case BT848_GSTATUS: /* reap status */
{
DECLARE_INTR_MASK(s);
DISABLE_INTR(s);
temp = status_sum;
status_sum = 0;
ENABLE_INTR(s);
*(u_int*)arg = temp;
break;
}
#endif /* STATUS_SUM */
default:
return( ENOTTY );
}
return( 0 );
}
/******************************************************************************
* bt848 RISC programming routines:
*/
/*
*
*/
#ifdef BT848_DUMP
static int
dump_bt848( bktr_ptr_t bktr )
{
int r[60]={
4, 8, 0xc, 0x8c, 0x10, 0x90, 0x14, 0x94,
0x18, 0x98, 0x1c, 0x9c, 0x20, 0xa0, 0x24, 0xa4,
0x28, 0x2c, 0xac, 0x30, 0x34, 0x38, 0x3c, 0x40,
0xc0, 0x48, 0x4c, 0xcc, 0x50, 0xd0, 0xd4, 0x60,
0x64, 0x68, 0x6c, 0xec, 0xd8, 0xdc, 0xe0, 0xe4,
0, 0, 0, 0
};
int i;
for (i = 0; i < 40; i+=4) {
printf("%s: Reg:value : \t%x:%x \t%x:%x \t %x:%x \t %x:%x\n",
bktr_name(bktr),
r[i], INL(bktr, r[i]),
r[i+1], INL(bktr, r[i+1]),
r[i+2], INL(bktr, r[i+2]),
r[i+3], INL(bktr, r[i+3]));
}
printf("%s: INT STAT %x \n", bktr_name(bktr),
INL(bktr, BKTR_INT_STAT));
printf("%s: Reg INT_MASK %x \n", bktr_name(bktr),
INL(bktr, BKTR_INT_MASK));
printf("%s: Reg GPIO_DMA_CTL %x \n", bktr_name(bktr),
INW(bktr, BKTR_GPIO_DMA_CTL));
return( 0 );
}
#endif
/*
* build write instruction
*/
#define BKTR_FM1 0x6 /* packed data to follow */
#define BKTR_FM3 0xe /* planar data to follow */
#define BKTR_VRE 0x4 /* Marks the end of the even field */
#define BKTR_VRO 0xC /* Marks the end of the odd field */
#define BKTR_PXV 0x0 /* valid word (never used) */
#define BKTR_EOL 0x1 /* last dword, 4 bytes */
#define BKTR_SOL 0x2 /* first dword */
#define OP_WRITE (0x1 << 28)
#define OP_SKIP (0x2 << 28)
#define OP_WRITEC (0x5 << 28)
#define OP_JUMP (0x7 << 28)
#define OP_SYNC (0x8 << 28)
#define OP_WRITE123 (0x9 << 28)
#define OP_WRITES123 (0xb << 28)
#define OP_SOL (1 << 27) /* first instr for scanline */
#define OP_EOL (1 << 26)
#define BKTR_RESYNC (1 << 15)
#define BKTR_GEN_IRQ (1 << 24)
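/*
 * Sketch of how these opcodes combine (illustrative only): for a short,
 * unclipped scanline of 'cols' pixels at 'Bpp' bytes per pixel, split()
 * below emits a single RISC write of the form
 *	*dma_prog++ = OP_WRITE | OP_SOL | OP_EOL | (cols * Bpp);
 *	*dma_prog++ = physical address of the target buffer;
 */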
/*
* The RISC status bits can be set/cleared in the RISC programs
* and tested in the Interrupt Handler
*/
#define BKTR_SET_RISC_STATUS_BIT0 (1 << 16)
#define BKTR_SET_RISC_STATUS_BIT1 (1 << 17)
#define BKTR_SET_RISC_STATUS_BIT2 (1 << 18)
#define BKTR_SET_RISC_STATUS_BIT3 (1 << 19)
#define BKTR_CLEAR_RISC_STATUS_BIT0 (1 << 20)
#define BKTR_CLEAR_RISC_STATUS_BIT1 (1 << 21)
#define BKTR_CLEAR_RISC_STATUS_BIT2 (1 << 22)
#define BKTR_CLEAR_RISC_STATUS_BIT3 (1 << 23)
#define BKTR_TEST_RISC_STATUS_BIT0 (1 << 28)
#define BKTR_TEST_RISC_STATUS_BIT1 (1 << 29)
#define BKTR_TEST_RISC_STATUS_BIT2 (1 << 30)
#define BKTR_TEST_RISC_STATUS_BIT3 (1 << 31)
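/*
 * Clipping helpers used by the RISC program builders below:
 * notclipped() reports whether the given scanline is untouched by any
 * clip rectangle, getline() walks the clip list to find the next
 * visible/clipped span on that line, and split() emits the RISC
 * write/skip instructions (splitting long transfers in two) for a span.
 */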
static bool_t notclipped (bktr_reg_t * bktr, int x, int width) {
int i;
bktr_clip_t * clip_node;
bktr->clip_start = -1;
bktr->last_y = 0;
bktr->y = 0;
bktr->y2 = width;
bktr->line_length = width;
bktr->yclip = -1;
bktr->yclip2 = -1;
bktr->current_col = 0;
if (bktr->max_clip_node == 0 ) return TRUE;
clip_node = (bktr_clip_t *) &bktr->clip_list[0];
for (i = 0; i < bktr->max_clip_node; i++ ) {
clip_node = (bktr_clip_t *) &bktr->clip_list[i];
if (x >= clip_node->x_min && x <= clip_node->x_max ) {
bktr->clip_start = i;
return FALSE;
}
}
return TRUE;
}
static bool_t getline(bktr_reg_t *bktr, int x ) {
int i, j;
bktr_clip_t * clip_node ;
if (bktr->line_length == 0 ||
bktr->current_col >= bktr->line_length) return FALSE;
bktr->y = min(bktr->last_y, bktr->line_length);
bktr->y2 = bktr->line_length;
bktr->yclip = bktr->yclip2 = -1;
for (i = bktr->clip_start; i < bktr->max_clip_node; i++ ) {
clip_node = (bktr_clip_t *) &bktr->clip_list[i];
if (x >= clip_node->x_min && x <= clip_node->x_max) {
if (bktr->last_y <= clip_node->y_min) {
bktr->y = min(bktr->last_y, bktr->line_length);
bktr->y2 = min(clip_node->y_min, bktr->line_length);
bktr->yclip = min(clip_node->y_min, bktr->line_length);
bktr->yclip2 = min(clip_node->y_max, bktr->line_length);
bktr->last_y = bktr->yclip2;
bktr->clip_start = i;
for (j = i+1; j < bktr->max_clip_node; j++ ) {
clip_node = (bktr_clip_t *) &bktr->clip_list[j];
if (x >= clip_node->x_min && x <= clip_node->x_max) {
if (bktr->last_y >= clip_node->y_min) {
bktr->yclip2 = min(clip_node->y_max, bktr->line_length);
bktr->last_y = bktr->yclip2;
bktr->clip_start = j;
}
} else break ;
}
return TRUE;
}
}
}
if (bktr->current_col <= bktr->line_length) {
bktr->current_col = bktr->line_length;
return TRUE;
}
return FALSE;
}
static bool_t split(bktr_reg_t * bktr, volatile uint32_t **dma_prog, int width ,
u_long operation, int pixel_width,
volatile u_char ** target_buffer, int cols ) {
u_long flag, flag2;
struct meteor_pixfmt *pf = &pixfmt_table[ bktr->pixfmt ].public;
u_int skip, start_skip;
/* For RGB24, we need to align the component in FIFO Byte Lane 0 */
/* to the 1st byte in the mem dword containing our start addr. */
/* BTW, we know this pixfmt's 1st byte is Blue; thus the start addr */
/* must be Blue. */
start_skip = 0;
if (( pf->type == METEOR_PIXTYPE_RGB ) && ( pf->Bpp == 3 ))
switch ( ((uintptr_t) (volatile void *) *target_buffer) % 4 ) {
case 2 : start_skip = 4 ; break;
case 1 : start_skip = 8 ; break;
}
if ((width * pixel_width) < DMA_BT848_SPLIT ) {
if ( width == cols) {
flag = OP_SOL | OP_EOL;
} else if (bktr->current_col == 0 ) {
flag = OP_SOL;
} else if (bktr->current_col == cols) {
flag = OP_EOL;
} else flag = 0;
skip = 0;
if (( flag & OP_SOL ) && ( start_skip > 0 )) {
*(*dma_prog)++ = OP_SKIP | OP_SOL | start_skip;
flag &= ~OP_SOL;
skip = start_skip;
}
*(*dma_prog)++ = operation | flag | (width * pixel_width - skip);
if (operation != OP_SKIP )
*(*dma_prog)++ = (uintptr_t) (volatile void *) *target_buffer;
*target_buffer += width * pixel_width;
bktr->current_col += width;
} else {
if (bktr->current_col == 0 && width == cols) {
flag = OP_SOL ;
flag2 = OP_EOL;
} else if (bktr->current_col == 0 ) {
flag = OP_SOL;
flag2 = 0;
} else if (bktr->current_col >= cols) {
flag = 0;
flag2 = OP_EOL;
} else {
flag = 0;
flag2 = 0;
}
skip = 0;
if (( flag & OP_SOL ) && ( start_skip > 0 )) {
*(*dma_prog)++ = OP_SKIP | OP_SOL | start_skip;
flag &= ~OP_SOL;
skip = start_skip;
}
*(*dma_prog)++ = operation | flag |
(width * pixel_width / 2 - skip);
if (operation != OP_SKIP )
*(*dma_prog)++ = (uintptr_t) (volatile void *) *target_buffer ;
*target_buffer += (width * pixel_width / 2) ;
if ( operation == OP_WRITE )
operation = OP_WRITEC;
*(*dma_prog)++ = operation | flag2 |
(width * pixel_width / 2);
*target_buffer += (width * pixel_width / 2) ;
bktr->current_col += width;
}
return TRUE;
}
/*
* Generate the RISC instructions to capture both VBI and video images
*/
static void
rgb_vbi_prog( bktr_ptr_t bktr, char i_flag, int cols, int rows, int interlace )
{
int i;
volatile uint32_t target_buffer, buffer, target,width;
volatile uint32_t pitch;
volatile uint32_t *dma_prog; /* DMA prog is an array of
32 bit RISC instructions */
volatile uint32_t *loop_point;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
u_int Bpp = pf_int->public.Bpp;
unsigned int vbisamples; /* VBI samples per line */
unsigned int vbilines; /* VBI lines per field */
unsigned int num_dwords; /* DWORDS per line */
vbisamples = format_params[bktr->format_params].vbi_num_samples;
vbilines = format_params[bktr->format_params].vbi_num_lines;
num_dwords = vbisamples/4;
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_VBI_PACK_SIZE, ((num_dwords)) & 0xff);
OUTB(bktr, BKTR_VBI_PACK_DEL, ((num_dwords)>> 8) & 0x01); /* no hdelay */
/* no ext frame */
OUTB(bktr, BKTR_OFORM, 0x00);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x40); /* set chroma comb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x40);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x80); /* clear Ycomb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x80);
/* disable gamma correction removal */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_GAMMA);
if (cols > 385 ) {
OUTB(bktr, BKTR_E_VTC, 0);
OUTB(bktr, BKTR_O_VTC, 0);
} else {
OUTB(bktr, BKTR_E_VTC, 1);
OUTB(bktr, BKTR_O_VTC, 1);
}
bktr->capcontrol = 3 << 2 | 3;
dma_prog = (uint32_t *) bktr->dma_prog;
/* Construct Write */
if (bktr->video.addr) {
target_buffer = (u_long) bktr->video.addr;
pitch = bktr->video.width;
}
else {
target_buffer = (u_long) vtophys(bktr->bigbuf);
pitch = cols*Bpp;
}
buffer = target_buffer;
/* Wait for the VRE sync marking the end of the Even and
* the start of the Odd field. Resync here.
*/
*dma_prog++ = OP_SYNC | BKTR_RESYNC |BKTR_VRE;
*dma_prog++ = 0;
loop_point = dma_prog;
/* store the VBI data */
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0;
for(i = 0; i < vbilines; i++) {
*dma_prog++ = OP_WRITE | OP_SOL | OP_EOL | vbisamples;
*dma_prog++ = (u_long) vtophys((caddr_t)bktr->vbidata +
(i * VBI_LINE_SIZE));
}
if ( (i_flag == 2/*Odd*/) || (i_flag==3) /*interlaced*/ ) {
/* store the Odd field video image */
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr,(volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip,
OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
}
}
target_buffer += interlace * pitch;
}
} /* end if */
/* Grab the Even field */
/* Look for the VRO, end of Odd field, marker */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
/* store the VBI data */
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0;
for(i = 0; i < vbilines; i++) {
*dma_prog++ = OP_WRITE | OP_SOL | OP_EOL | vbisamples;
*dma_prog++ = (u_long) vtophys((caddr_t)bktr->vbidata +
((i+MAX_VBI_LINES) * VBI_LINE_SIZE));
}
/* store the video image */
if (i_flag == 1) /*Even Only*/
target_buffer = buffer;
if (i_flag == 3) /*interlaced*/
target_buffer = buffer+pitch;
if ((i_flag == 1) /*Even Only*/ || (i_flag==3) /*interlaced*/) {
/* look for sync with packed data */
*dma_prog++ = OP_SYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target,
cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip, OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t) &target, cols);
}
}
}
target_buffer += interlace * pitch;
}
}
/* Look for end of 'Even Field' */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (u_long ) vtophys(loop_point) ;
*dma_prog++ = 0; /* NULL WORD */
}
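/*
 * Generate the RISC instructions to capture an RGB video image
 * (without VBI data), honouring any clipping rectangles via
 * notclipped() and getline().
 */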
static void
rgb_prog( bktr_ptr_t bktr, char i_flag, int cols, int rows, int interlace )
{
int i;
volatile uint32_t target_buffer, buffer, target,width;
volatile uint32_t pitch;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
u_int Bpp = pf_int->public.Bpp;
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
OUTB(bktr, BKTR_VBI_PACK_SIZE, 0);
OUTB(bktr, BKTR_VBI_PACK_DEL, 0);
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_OFORM, 0x00);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x40); /* set chroma comb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x40);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x80); /* clear Ycomb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x80);
/* disable gamma correction removal */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_GAMMA);
if (cols > 385 ) {
OUTB(bktr, BKTR_E_VTC, 0);
OUTB(bktr, BKTR_O_VTC, 0);
} else {
OUTB(bktr, BKTR_E_VTC, 1);
OUTB(bktr, BKTR_O_VTC, 1);
}
bktr->capcontrol = 3 << 2 | 3;
dma_prog = (uint32_t *) bktr->dma_prog;
/* Construct Write */
if (bktr->video.addr) {
target_buffer = (uint32_t) bktr->video.addr;
pitch = bktr->video.width;
}
else {
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
pitch = cols*Bpp;
}
buffer = target_buffer;
/* construct sync : for video packet format */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
/* sync, mode indicator packed data */
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr,(volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip,
OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
}
}
target_buffer += interlace * pitch;
}
switch (i_flag) {
case 1:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t ) vtophys(bktr->dma_prog);
return;
case 2:
/* sync vre */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t ) vtophys(bktr->dma_prog);
return;
case 3:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog = (uint32_t ) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
target_buffer = buffer + pitch;
dma_prog = (uint32_t *) bktr->odd_dma_prog;
/* sync vre IRQ bit */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
width = cols;
for (i = 0; i < (rows/interlace); i++) {
target = target_buffer;
if ( notclipped(bktr, i, width)) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
} else {
while(getline(bktr, i)) {
if (bktr->y != bktr->y2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->y2 - bktr->y, OP_WRITE,
Bpp, (volatile u_char **)(uintptr_t)&target,
cols);
}
if (bktr->yclip != bktr->yclip2 ) {
split(bktr, (volatile uint32_t **) &dma_prog,
bktr->yclip2 - bktr->yclip, OP_SKIP,
Bpp, (volatile u_char **)(uintptr_t)&target, cols);
}
}
}
target_buffer += interlace * pitch;
}
}
/* sync vre IRQ bit */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t ) vtophys(bktr->dma_prog) ;
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Generate the RISC instructions to capture packed YUV video; each
 * scan line is emitted as two WRITE instructions (a start-of-line and
 * an end-of-line half).
 */
static void
yuvpack_prog( bktr_ptr_t bktr, char i_flag,
int cols, int rows, int interlace )
{
int i;
volatile unsigned int inst;
volatile unsigned int inst3;
volatile uint32_t target_buffer, buffer;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
int b;
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
OUTB(bktr, BKTR_E_SCLOOP, INB(bktr, BKTR_E_SCLOOP) | BT848_E_SCLOOP_CAGC); /* enable chroma comb */
OUTB(bktr, BKTR_O_SCLOOP, INB(bktr, BKTR_O_SCLOOP) | BT848_O_SCLOOP_CAGC);
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_RGB_DED | BT848_COLOR_CTL_GAMMA);
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
bktr->capcontrol = 1 << 6 | 1 << 4 | 1 << 2 | 3;
bktr->capcontrol = 3 << 2 | 3;
dma_prog = (uint32_t *) bktr->dma_prog;
/* Construct Write */
/* write, start of line */
inst = OP_WRITE | OP_SOL | (cols);
/* write, end of line */
inst3 = OP_WRITE | OP_EOL | (cols);
if (bktr->video.addr)
target_buffer = (uint32_t) bktr->video.addr;
else
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
buffer = target_buffer;
/* construct sync : for video packet format */
/* sync, mode indicator packed data */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
b = cols;
for (i = 0; i < (rows/interlace); i++) {
*dma_prog++ = inst;
*dma_prog++ = target_buffer;
*dma_prog++ = inst3;
*dma_prog++ = target_buffer + b;
target_buffer += interlace*(cols * 2);
}
switch (i_flag) {
case 1:
/* sync vre */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 2:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 3:
/* sync vro */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog = (uint32_t) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
target_buffer = (uint32_t) buffer + cols*2;
dma_prog = (uint32_t *) bktr->odd_dma_prog;
/* sync vre */
*dma_prog++ = OP_SYNC | BKTR_RESYNC | BKTR_FM1;
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = target_buffer;
*dma_prog++ = inst3;
*dma_prog++ = target_buffer + b;
target_buffer += interlace * ( cols*2);
}
}
/* sync vre IRQ bit */
*dma_prog++ = OP_SYNC | BKTR_GEN_IRQ | BKTR_RESYNC | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Generate the RISC instructions to capture planar YUV 4:2:2 video;
 * each scan line is emitted as a single WRITE123 instruction that
 * scatters luma and the two chroma planes to separate addresses.
 */
static void
yuv422_prog( bktr_ptr_t bktr, char i_flag,
int cols, int rows, int interlace ){
int i;
volatile unsigned int inst;
volatile uint32_t target_buffer, t1, buffer;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
dma_prog = (uint32_t*) bktr->dma_prog;
bktr->capcontrol = 1 << 6 | 1 << 4 | 3;
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_OFORM, 0x00);
OUTB(bktr, BKTR_E_CONTROL, INB(bktr, BKTR_E_CONTROL) | BT848_E_CONTROL_LDEC); /* disable luma decimation */
OUTB(bktr, BKTR_O_CONTROL, INB(bktr, BKTR_O_CONTROL) | BT848_O_CONTROL_LDEC);
OUTB(bktr, BKTR_E_SCLOOP, INB(bktr, BKTR_E_SCLOOP) | BT848_E_SCLOOP_CAGC); /* chroma agc enable */
OUTB(bktr, BKTR_O_SCLOOP, INB(bktr, BKTR_O_SCLOOP) | BT848_O_SCLOOP_CAGC);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x80); /* clear Ycomb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x80);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x40); /* set chroma comb */
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x40);
/* disable gamma correction removal */
OUTB(bktr, BKTR_COLOR_CTL, INB(bktr, BKTR_COLOR_CTL) | BT848_COLOR_CTL_GAMMA);
/* Construct Write */
inst = OP_WRITE123 | OP_SOL | OP_EOL | (cols);
if (bktr->video.addr)
target_buffer = (uint32_t) bktr->video.addr;
else
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
buffer = target_buffer;
t1 = buffer;
/* construct sync : for video packet format */
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3; /*sync, mode indicator packed data*/
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace ) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | cols/2 << 16;
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/2) + i*cols/2 * interlace;
target_buffer += interlace*cols;
}
switch (i_flag) {
case 1:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRE; /*sync vre*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 2:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRO; /*sync vro*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 3:
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog = (uint32_t) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
dma_prog = (uint32_t *) bktr->odd_dma_prog;
target_buffer = (uint32_t) buffer + cols;
t1 = buffer + cols/2;
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3;
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace ) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | cols/2 << 16;
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/2) + i*cols/2 * interlace;
target_buffer += interlace*cols;
}
}
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog) ;
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Generate the RISC instructions to capture planar YUV 4:2:0 (YUV12)
 * video; lines alternate between WRITE123 (luma plus both chroma
 * planes) and WRITES123 (luma only), giving vertically subsampled
 * chroma.
 */
static void
yuv12_prog( bktr_ptr_t bktr, char i_flag,
int cols, int rows, int interlace ){
int i;
volatile unsigned int inst;
volatile unsigned int inst1;
volatile uint32_t target_buffer, t1, buffer;
volatile uint32_t *dma_prog;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
OUTB(bktr, BKTR_COLOR_FMT, pf_int->color_fmt);
dma_prog = (uint32_t *) bktr->dma_prog;
bktr->capcontrol = 1 << 6 | 1 << 4 | 3;
OUTB(bktr, BKTR_ADC, SYNC_LEVEL);
OUTB(bktr, BKTR_OFORM, 0x0);
/* Construct Write */
inst = OP_WRITE123 | OP_SOL | OP_EOL | (cols);
inst1 = OP_WRITES123 | OP_SOL | OP_EOL | (cols);
if (bktr->video.addr)
target_buffer = (uint32_t) bktr->video.addr;
else
target_buffer = (uint32_t) vtophys(bktr->bigbuf);
buffer = target_buffer;
t1 = buffer;
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3; /*sync, mode indicator packed data*/
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < (rows/interlace )/2 ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/4) + i*cols/2 * interlace;
target_buffer += interlace*cols;
*dma_prog++ = inst1;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
target_buffer += interlace*cols;
}
switch (i_flag) {
case 1:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRE; /*sync vre*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 2:
*dma_prog++ = OP_SYNC | 1 << 24 | BKTR_VRO; /*sync vro*/
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
return;
case 3:
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRO;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP ;
*dma_prog = (uint32_t) vtophys(bktr->odd_dma_prog);
break;
}
if (interlace == 2) {
dma_prog = (uint32_t *) bktr->odd_dma_prog;
target_buffer = (uint32_t) buffer + cols;
t1 = buffer + cols/2;
*dma_prog++ = OP_SYNC | 1 << 15 | BKTR_FM3;
*dma_prog++ = 0; /* NULL WORD */
for (i = 0; i < ((rows/interlace )/2 ) ; i++) {
*dma_prog++ = inst;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
*dma_prog++ = t1 + (cols*rows) + i*cols/2 * interlace;
*dma_prog++ = t1 + (cols*rows) + (cols*rows/4) + i*cols/2 * interlace;
target_buffer += interlace*cols;
*dma_prog++ = inst1;
*dma_prog++ = cols/2 | (cols/2 << 16);
*dma_prog++ = target_buffer;
target_buffer += interlace*cols;
}
}
*dma_prog++ = OP_SYNC | 1 << 24 | 1 << 15 | BKTR_VRE;
*dma_prog++ = 0; /* NULL WORD */
*dma_prog++ = OP_JUMP;
*dma_prog++ = (uint32_t) vtophys(bktr->dma_prog);
*dma_prog++ = 0; /* NULL WORD */
}
/*
 * Program the Bt848 scaling, cropping and delay registers from the
 * current format and capture-area settings, then build the RISC DMA
 * program appropriate for the selected pixel format.
 */
static void
build_dma_prog( bktr_ptr_t bktr, char i_flag )
{
int rows, cols, interlace;
int tmp_int;
unsigned int temp;
struct format_params *fp;
struct meteor_pixfmt_internal *pf_int = &pixfmt_table[ bktr->pixfmt ];
fp = &format_params[bktr->format_params];
OUTL(bktr, BKTR_INT_MASK, ALL_INTS_DISABLED);
/* disable FIFO & RISC, leave other bits alone */
OUTW(bktr, BKTR_GPIO_DMA_CTL, INW(bktr, BKTR_GPIO_DMA_CTL) & ~FIFO_RISC_ENABLED);
/* set video parameters */
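/* Horizontal scale: the expression below appears to compute
 * 4096 * (effective source width / bktr->cols - 1), i.e. the
 * downscaling ratio in 1/4096 units for the HSCALE registers,
 * using 64-bit (quad_t) intermediates to avoid overflow.
 */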
if (bktr->capture_area_enabled)
temp = ((quad_t ) fp->htotal* (quad_t) bktr->capture_area_x_size * 4096
/ fp->scaled_htotal / bktr->cols) - 4096;
else
temp = ((quad_t ) fp->htotal* (quad_t) fp->scaled_hactive * 4096
/ fp->scaled_htotal / bktr->cols) - 4096;
/* printf("%s: HSCALE value is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_HSCALE_LO, temp & 0xff);
OUTB(bktr, BKTR_O_HSCALE_LO, temp & 0xff);
OUTB(bktr, BKTR_E_HSCALE_HI, (temp >> 8) & 0xff);
OUTB(bktr, BKTR_O_HSCALE_HI, (temp >> 8) & 0xff);
/* horizontal active */
temp = bktr->cols;
/* printf("%s: HACTIVE value is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_HACTIVE_LO, temp & 0xff);
OUTB(bktr, BKTR_O_HACTIVE_LO, temp & 0xff);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0x3);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0x3);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 8) & 0x3));
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 8) & 0x3));
/* horizontal delay */
if (bktr->capture_area_enabled)
temp = ( (fp->hdelay* fp->scaled_hactive + bktr->capture_area_x_offset* fp->scaled_htotal)
* bktr->cols) / (bktr->capture_area_x_size * fp->hactive);
else
temp = (fp->hdelay * bktr->cols) / fp->hactive;
temp = temp & 0x3fe;
/* printf("%s: HDELAY value is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_DELAY_LO, temp & 0xff);
OUTB(bktr, BKTR_O_DELAY_LO, temp & 0xff);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0xc);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0xc);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 6) & 0xc));
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 6) & 0xc));
/* vertical scale */
if (bktr->capture_area_enabled) {
if (bktr->flags & METEOR_ONLY_ODD_FIELDS ||
bktr->flags & METEOR_ONLY_EVEN_FIELDS)
tmp_int = 65536 -
(((bktr->capture_area_y_size * 256 + (bktr->rows/2)) / bktr->rows) - 512);
else {
tmp_int = 65536 -
(((bktr->capture_area_y_size * 512 + (bktr->rows / 2)) / bktr->rows) - 512);
}
} else {
if (bktr->flags & METEOR_ONLY_ODD_FIELDS ||
bktr->flags & METEOR_ONLY_EVEN_FIELDS)
tmp_int = 65536 -
(((fp->vactive * 256 + (bktr->rows/2)) / bktr->rows) - 512);
else {
tmp_int = 65536 -
(((fp->vactive * 512 + (bktr->rows / 2)) / bktr->rows) - 512);
}
}
tmp_int &= 0x1fff;
/* printf("%s: VSCALE value is %d\n", bktr_name(bktr), tmp_int); */
OUTB(bktr, BKTR_E_VSCALE_LO, tmp_int & 0xff);
OUTB(bktr, BKTR_O_VSCALE_LO, tmp_int & 0xff);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x1f);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x1f);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | ((tmp_int >> 8) & 0x1f));
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | ((tmp_int >> 8) & 0x1f));
/* vertical active */
if (bktr->capture_area_enabled)
temp = bktr->capture_area_y_size;
else
temp = fp->vactive;
/* printf("%s: VACTIVE is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0x30);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 4) & 0x30));
OUTB(bktr, BKTR_E_VACTIVE_LO, temp & 0xff);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0x30);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 4) & 0x30));
OUTB(bktr, BKTR_O_VACTIVE_LO, temp & 0xff);
/* vertical delay */
if (bktr->capture_area_enabled)
temp = fp->vdelay + (bktr->capture_area_y_offset);
else
temp = fp->vdelay;
/* printf("%s: VDELAY is %d\n", bktr_name(bktr), temp); */
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) & ~0xC0);
OUTB(bktr, BKTR_E_CROP, INB(bktr, BKTR_E_CROP) | ((temp >> 2) & 0xC0));
OUTB(bktr, BKTR_E_VDELAY_LO, temp & 0xff);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) & ~0xC0);
OUTB(bktr, BKTR_O_CROP, INB(bktr, BKTR_O_CROP) | ((temp >> 2) & 0xC0));
OUTB(bktr, BKTR_O_VDELAY_LO, temp & 0xff);
/* end of video params */
if ((bktr->xtal_pll_mode == BT848_USE_PLL)
&& (fp->iform_xtsel==BT848_IFORM_X_XT1)) {
OUTB(bktr, BKTR_TGCTRL, BT848_TGCTRL_TGCKI_PLL); /* Select PLL mode */
} else {
OUTB(bktr, BKTR_TGCTRL, BT848_TGCTRL_TGCKI_XTAL); /* Select Normal xtal 0/xtal 1 mode */
}
/* capture control */
switch (i_flag) {
case 1:
bktr->bktr_cap_ctl =
(BT848_CAP_CTL_DITH_FRAME | BT848_CAP_CTL_EVEN);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x20);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x20);
interlace = 1;
break;
case 2:
bktr->bktr_cap_ctl =
(BT848_CAP_CTL_DITH_FRAME | BT848_CAP_CTL_ODD);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) & ~0x20);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) & ~0x20);
interlace = 1;
break;
default:
bktr->bktr_cap_ctl =
(BT848_CAP_CTL_DITH_FRAME |
BT848_CAP_CTL_EVEN | BT848_CAP_CTL_ODD);
OUTB(bktr, BKTR_E_VSCALE_HI, INB(bktr, BKTR_E_VSCALE_HI) | 0x20);
OUTB(bktr, BKTR_O_VSCALE_HI, INB(bktr, BKTR_O_VSCALE_HI) | 0x20);
interlace = 2;
break;
}
OUTL(bktr, BKTR_RISC_STRT_ADD, vtophys(bktr->dma_prog));
rows = bktr->rows;
cols = bktr->cols;
bktr->vbiflags &= ~VBI_CAPTURE; /* default - no vbi capture */
/* RGB Grabs. If /dev/vbi is already open, or we are a PAL/SECAM */
/* user, then use the rgb_vbi RISC program. */
/* Otherwise, use the normal rgb RISC program */
if (pf_int->public.type == METEOR_PIXTYPE_RGB) {
if ( (bktr->vbiflags & VBI_OPEN)
||(bktr->format_params == BT848_IFORM_F_PALBDGHI)
||(bktr->format_params == BT848_IFORM_F_SECAM)
){
bktr->bktr_cap_ctl |=
BT848_CAP_CTL_VBI_EVEN | BT848_CAP_CTL_VBI_ODD;
bktr->vbiflags |= VBI_CAPTURE;
rgb_vbi_prog(bktr, i_flag, cols, rows, interlace);
return;
} else {
rgb_prog(bktr, i_flag, cols, rows, interlace);
return;
}
}
if ( pf_int->public.type == METEOR_PIXTYPE_YUV ) {
yuv422_prog(bktr, i_flag, cols, rows, interlace);
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
return;
}
if ( pf_int->public.type == METEOR_PIXTYPE_YUV_PACKED ) {
yuvpack_prog(bktr, i_flag, cols, rows, interlace);
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
return;
}
if ( pf_int->public.type == METEOR_PIXTYPE_YUV_12 ) {
yuv12_prog(bktr, i_flag, cols, rows, interlace);
OUTB(bktr, BKTR_COLOR_CTL, (INB(bktr, BKTR_COLOR_CTL) & 0xf0)
| pixfmt_swap_flags( bktr->pixfmt ));
return;
}
return;
}
/******************************************************************************
* video & video capture specific routines:
*/
/*
 * Start a single or continuous capture: optionally clear the capture
 * buffer, select the wanted field(s), program the frame rate and
 * (re)build the DMA program if it is not already loaded.
 */
static void
start_capture( bktr_ptr_t bktr, unsigned type )
{
u_char i_flag;
struct format_params *fp;
fp = &format_params[bktr->format_params];
/* If requested, clear out capture buf first */
if (bktr->clr_on_start && (bktr->video.addr == 0)) {
bzero((caddr_t)bktr->bigbuf,
(size_t)bktr->rows * bktr->cols * bktr->frames *
pixfmt_table[ bktr->pixfmt ].public.Bpp);
}
OUTB(bktr, BKTR_DSTATUS, 0);
OUTL(bktr, BKTR_INT_STAT, INL(bktr, BKTR_INT_STAT));
bktr->flags |= type;
bktr->flags &= ~METEOR_WANT_MASK;
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
i_flag = 1;
break;
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
i_flag = 2;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
i_flag = 3;
break;
}
/* TDEC is only valid for continuous captures */
if ( type == METEOR_SINGLE ) {
u_short fps_save = bktr->fps;
set_fps(bktr, fp->frame_rate);
bktr->fps = fps_save;
}
else
set_fps(bktr, bktr->fps);
if (bktr->dma_prog_loaded == FALSE) {
build_dma_prog(bktr, i_flag);
bktr->dma_prog_loaded = TRUE;
}
OUTL(bktr, BKTR_RISC_STRT_ADD, vtophys(bktr->dma_prog));
}
/*
 * Set the capture frame rate by programming the temporal decimation
 * (TDEC) register to drop fields/frames when the requested rate is
 * below the format's nominal rate.
 */
static void
set_fps( bktr_ptr_t bktr, u_short fps )
{
struct format_params *fp;
int i_flag;
fp = &format_params[bktr->format_params];
switch(bktr->flags & METEOR_ONLY_FIELDS_MASK) {
case METEOR_ONLY_EVEN_FIELDS:
bktr->flags |= METEOR_WANT_EVEN;
i_flag = 1;
break;
case METEOR_ONLY_ODD_FIELDS:
bktr->flags |= METEOR_WANT_ODD;
i_flag = 1;
break;
default:
bktr->flags |= METEOR_WANT_MASK;
i_flag = 2;
break;
}
OUTW(bktr, BKTR_GPIO_DMA_CTL, FIFO_RISC_DISABLED);
OUTL(bktr, BKTR_INT_STAT, ALL_INTS_CLEARED);
bktr->fps = fps;
OUTB(bktr, BKTR_TDEC, 0);
if (fps < fp->frame_rate)
OUTB(bktr, BKTR_TDEC, i_flag*(fp->frame_rate - fps) & 0x3f);
else
OUTB(bktr, BKTR_TDEC, 0);
return;
}
/*
* Given a pixfmt index, compute the bt848 swap_flags necessary to
* achieve the specified swapping.
* Note that without bt swapping, 2Bpp and 3Bpp modes are written
* byte-swapped, and 4Bpp modes are byte and word swapped (see Table 6
* and read R->L).
* Note also that for 3Bpp, we may additionally need to do some creative
* SKIPing to align the FIFO bytelines with the target buffer (see split()).
* This is abstracted here: e.g. no swaps = RGBA; byte & short swap = ABGR
* as one would expect.
*/
static u_int pixfmt_swap_flags( int pixfmt )
{
struct meteor_pixfmt *pf = &pixfmt_table[ pixfmt ].public;
u_int swapf = 0;
switch ( pf->Bpp ) {
case 2 : swapf = ( pf->swap_bytes ? 0 : BSWAP );
break;
case 3 : /* no swaps supported for 3bpp - makes no sense w/ bt848 */
break;
case 4 : if ( pf->swap_bytes )
swapf = pf->swap_shorts ? 0 : WSWAP;
else
swapf = pf->swap_shorts ? BSWAP : (BSWAP | WSWAP);
break;
}
return swapf;
}
/*
* Converts meteor-defined pixel formats (e.g. METEOR_GEO_RGB16) into
* our pixfmt_table indices.
*/
static int oformat_meteor_to_bt( u_long format )
{
int i;
struct meteor_pixfmt *pf1, *pf2;
/* Find format in compatibility table */
for ( i = 0; i < METEOR_PIXFMT_TABLE_SIZE; i++ )
if ( meteor_pixfmt_table[i].meteor_format == format )
break;
if ( i >= METEOR_PIXFMT_TABLE_SIZE )
return -1;
pf1 = &meteor_pixfmt_table[i].public;
/* Match it with an entry in master pixel format table */
for ( i = 0; i < PIXFMT_TABLE_SIZE; i++ ) {
pf2 = &pixfmt_table[i].public;
if (( pf1->type == pf2->type ) &&
( pf1->Bpp == pf2->Bpp ) &&
!bcmp( pf1->masks, pf2->masks, sizeof( pf1->masks )) &&
( pf1->swap_bytes == pf2->swap_bytes ) &&
( pf1->swap_shorts == pf2->swap_shorts ))
break;
}
if ( i >= PIXFMT_TABLE_SIZE )
return -1;
return i;
}
/******************************************************************************
* i2c primitives:
*/
/* */
#define I2CBITTIME (0x5<<4) /* 5 * 0.48uS */
#define I2CBITTIME_878 (1 << 7)
#define I2C_READ 0x01
#define I2C_COMMAND (I2CBITTIME | \
BT848_DATA_CTL_I2CSCL | \
BT848_DATA_CTL_I2CSDA)
#define I2C_COMMAND_878 (I2CBITTIME_878 | \
BT848_DATA_CTL_I2CSCL | \
BT848_DATA_CTL_I2CSDA)
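/* These command templates combine the i2c bit-timing selection with
 * the I2CSCL/I2CSDA bits for the chip's automated i2c controller; the
 * Bt878-class parts use a different timing field than the
 * Bt848/848A/849A (see i2cWrite()/i2cRead() below).
 */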
/* Select between old i2c code and new iicbus / smbus code */
#if defined(BKTR_USE_FREEBSD_SMBUS)
/*
* The hardware interface is actually SMB commands
*/
int
i2cWrite( bktr_ptr_t bktr, int addr, int byte1, int byte2 )
{
char cmd;
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A)
cmd = I2C_COMMAND;
else
cmd = I2C_COMMAND_878;
if (byte2 != -1) {
if (smbus_writew(bktr->i2c_sc.smbus, addr, cmd,
(short)(((byte2 & 0xff) << 8) | (byte1 & 0xff))))
return (-1);
} else {
if (smbus_writeb(bktr->i2c_sc.smbus, addr, cmd,
(char)(byte1 & 0xff)))
return (-1);
}
/* return OK */
return( 0 );
}
int
i2cRead( bktr_ptr_t bktr, int addr )
{
char result;
char cmd;
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A)
cmd = I2C_COMMAND;
else
cmd = I2C_COMMAND_878;
if (smbus_readb(bktr->i2c_sc.smbus, addr, cmd, &result))
return (-1);
return ((int)((unsigned char)result));
}
#define IICBUS(bktr) ((bktr)->i2c_sc.iicbb)
/* The MSP34xx and DPL35xx audio chips require i2c bus writes of up */
/* to 5 bytes, which the bt848's automated i2c bus controller cannot handle. */
/* Therefore we need low-level control of the i2c bus hardware. */
/* Write to the MSP or DPL registers */
void
msp_dpl_write(bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr, unsigned int data)
{
unsigned char addr_l, addr_h, data_h, data_l ;
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
data_h = (data >>8) & 0xff;
data_l = data & 0xff;
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), dev, 0);
iicbus_write_byte(IICBUS(bktr), addr_h, 0);
iicbus_write_byte(IICBUS(bktr), addr_l, 0);
iicbus_write_byte(IICBUS(bktr), data_h, 0);
iicbus_write_byte(IICBUS(bktr), data_l, 0);
iicbus_stop(IICBUS(bktr));
return;
}
/* Read from the MSP or DPL registers */
unsigned int
msp_dpl_read(bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr)
{
unsigned int data;
unsigned char addr_l, addr_h, dev_r;
int read;
u_char data_read[2];
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
dev_r = dev+1;
/* XXX errors ignored */
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), dev_r, 0);
iicbus_write_byte(IICBUS(bktr), addr_h, 0);
iicbus_write_byte(IICBUS(bktr), addr_l, 0);
iicbus_repeated_start(IICBUS(bktr), i2c_addr +1, 0 /* no timeout? */);
iicbus_read(IICBUS(bktr), data_read, 2, &read, IIC_LAST_READ, 0);
iicbus_stop(IICBUS(bktr));
data = (data_read[0]<<8) | data_read[1];
return (data);
}
/* Reset the MSP or DPL chip */
/* The user can block the reset, which is handy if you initialise the
 * MSP and/or DPL audio in another operating system first (e.g. in Windows).
 */
void
msp_dpl_reset( bktr_ptr_t bktr, int i2c_addr )
{
#ifndef BKTR_NO_MSP_RESET
/* put into reset mode */
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_write_byte(IICBUS(bktr), 0x80, 0);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_stop(IICBUS(bktr));
/* put back to operational mode */
iicbus_start(IICBUS(bktr), i2c_addr, 0 /* no timeout? */);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_write_byte(IICBUS(bktr), 0x00, 0);
iicbus_stop(IICBUS(bktr));
#endif
return;
}
static void remote_read(bktr_ptr_t bktr, struct bktr_remote *remote) {
int read;
/* XXX errors ignored */
iicbus_start(IICBUS(bktr), bktr->remote_control_addr, 0 /* no timeout? */);
iicbus_read(IICBUS(bktr), remote->data, 3, &read, IIC_LAST_READ, 0);
iicbus_stop(IICBUS(bktr));
return;
}
#else /* defined(BKTR_USE_FREEBSD_SMBUS) */
/*
* Program the i2c bus directly
*/
int
i2cWrite( bktr_ptr_t bktr, int addr, int byte1, int byte2 )
{
u_long x;
u_long data;
/* clear status bits */
OUTL(bktr, BKTR_INT_STAT, BT848_INT_RACK | BT848_INT_I2CDONE);
/* build the command datum */
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A) {
data = ((addr & 0xff) << 24) | ((byte1 & 0xff) << 16) | I2C_COMMAND;
} else {
data = ((addr & 0xff) << 24) | ((byte1 & 0xff) << 16) | I2C_COMMAND_878;
}
if ( byte2 != -1 ) {
data |= ((byte2 & 0xff) << 8);
data |= BT848_DATA_CTL_I2CW3B;
}
/* write the address and data */
OUTL(bktr, BKTR_I2C_DATA_CTL, data);
/* wait for completion */
for ( x = 0x7fffffff; x; --x ) { /* safety valve */
if ( INL(bktr, BKTR_INT_STAT) & BT848_INT_I2CDONE )
break;
}
/* check for ACK */
if ( !x || !(INL(bktr, BKTR_INT_STAT) & BT848_INT_RACK) )
return( -1 );
/* return OK */
return( 0 );
}
/*
*
*/
int
i2cRead( bktr_ptr_t bktr, int addr )
{
u_long x;
/* clear status bits */
OUTL(bktr, BKTR_INT_STAT, BT848_INT_RACK | BT848_INT_I2CDONE);
/* write the READ address */
/* The Bt878 and Bt879 differed on the treatment of i2c commands */
if (bktr->id == BROOKTREE_848 ||
bktr->id == BROOKTREE_848A ||
bktr->id == BROOKTREE_849A) {
OUTL(bktr, BKTR_I2C_DATA_CTL, ((addr & 0xff) << 24) | I2C_COMMAND);
} else {
OUTL(bktr, BKTR_I2C_DATA_CTL, ((addr & 0xff) << 24) | I2C_COMMAND_878);
}
/* wait for completion */
for ( x = 0x7fffffff; x; --x ) { /* safety valve */
if ( INL(bktr, BKTR_INT_STAT) & BT848_INT_I2CDONE )
break;
}
/* check for ACK */
if ( !x || !(INL(bktr, BKTR_INT_STAT) & BT848_INT_RACK) )
return( -1 );
/* it was a read */
return( (INL(bktr, BKTR_I2C_DATA_CTL) >> 8) & 0xff );
}
/* The MSP34xx audio chip requires i2c bus writes of up to 5 bytes, which the */
/* bt848's automated i2c bus controller cannot handle. */
/* Therefore we need low-level control of the i2c bus hardware. */
/* The ideas for the following functions come from elsewhere in this driver and */
/* from the Linux BTTV i2c driver by Gerd Knorr <kraxel@cs.tu-berlin.de>. */
#define BITD 40
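/* In the bit-banged routines below, the value written to
 * BKTR_I2C_DATA_CTL drives the two bus lines directly: bit 0 is the
 * data line (SDA) and bit 1 the clock (SCL), so the values 0..3 cover
 * the four line states named in the per-write comments ("release
 * data", "strobe clock", ...).
 */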
static void i2c_start( bktr_ptr_t bktr) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* lower data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock */
}
static void i2c_stop( bktr_ptr_t bktr) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock & data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* release clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release data */
}
static int i2c_write_byte( bktr_ptr_t bktr, unsigned char data) {
int x;
int status;
/* write out the byte */
for ( x = 7; x >= 0; --x ) {
if ( data & (1<<x) ) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* assert HI data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
}
else {
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* assert LO data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* release clock */
}
}
/* look for an ACK */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* float data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* strobe clock */
status = INL(bktr, BKTR_I2C_DATA_CTL) & 1; /* read the ACK bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release clock */
return( status );
}
static int i2c_read_byte( bktr_ptr_t bktr, unsigned char *data, int last ) {
int x;
int bit;
int byte = 0;
/* read in the byte */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* float data */
for ( x = 7; x >= 0; --x ) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
bit = INL(bktr, BKTR_I2C_DATA_CTL) & 1; /* read the data bit */
if ( bit ) byte |= (1<<x);
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
}
/* After reading the byte, send an ACK */
/* (unless that was the last byte, for which we send a NAK). */
if (last) { /* send NAK - same as writing a 1 */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* set data bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
} else { /* send ACK - same as writing a 0 */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* set data bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* release clock */
}
*data=byte;
return 0;
}
#undef BITD
/* Write to the MSP or DPL registers */
void msp_dpl_write( bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr,
unsigned int data){
unsigned int msp_w_addr = i2c_addr;
unsigned char addr_l, addr_h, data_h, data_l ;
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
data_h = (data >>8) & 0xff;
data_l = data & 0xff;
i2c_start(bktr);
i2c_write_byte(bktr, msp_w_addr);
i2c_write_byte(bktr, dev);
i2c_write_byte(bktr, addr_h);
i2c_write_byte(bktr, addr_l);
i2c_write_byte(bktr, data_h);
i2c_write_byte(bktr, data_l);
i2c_stop(bktr);
}
/* Read from the MSP or DPL registers */
unsigned int msp_dpl_read(bktr_ptr_t bktr, int i2c_addr, unsigned char dev, unsigned int addr){
unsigned int data;
unsigned char addr_l, addr_h, data_1, data_2, dev_r ;
addr_h = (addr >>8) & 0xff;
addr_l = addr & 0xff;
dev_r = dev+1;
i2c_start(bktr);
i2c_write_byte(bktr,i2c_addr);
i2c_write_byte(bktr,dev_r);
i2c_write_byte(bktr,addr_h);
i2c_write_byte(bktr,addr_l);
i2c_start(bktr);
i2c_write_byte(bktr,i2c_addr+1);
i2c_read_byte(bktr,&data_1, 0);
i2c_read_byte(bktr,&data_2, 1);
i2c_stop(bktr);
data = (data_1<<8) | data_2;
return data;
}
/* Reset the MSP or DPL chip */
/* The user can block the reset, which is handy if you initialise the
 * MSP audio in another operating system first (e.g. in Windows).
 */
void msp_dpl_reset( bktr_ptr_t bktr, int i2c_addr ) {
#ifndef BKTR_NO_MSP_RESET
/* put into reset mode */
i2c_start(bktr);
i2c_write_byte(bktr, i2c_addr);
i2c_write_byte(bktr, 0x00);
i2c_write_byte(bktr, 0x80);
i2c_write_byte(bktr, 0x00);
i2c_stop(bktr);
/* put back to operational mode */
i2c_start(bktr);
i2c_write_byte(bktr, i2c_addr);
i2c_write_byte(bktr, 0x00);
i2c_write_byte(bktr, 0x00);
i2c_write_byte(bktr, 0x00);
i2c_stop(bktr);
#endif
return;
}
static void remote_read(bktr_ptr_t bktr, struct bktr_remote *remote) {
/* XXX errors ignored */
i2c_start(bktr);
i2c_write_byte(bktr,bktr->remote_control_addr);
i2c_read_byte(bktr,&(remote->data[0]), 0);
i2c_read_byte(bktr,&(remote->data[1]), 0);
i2c_read_byte(bktr,&(remote->data[2]), 0);
i2c_stop(bktr);
return;
}
#endif /* defined(BKTR_USE_FREEBSD_SMBUS) */
#if defined( I2C_SOFTWARE_PROBE )
/*
* we are keeping this around for any parts that we need to probe
* but that CANNOT be probed via an i2c read.
* this is necessary because the hardware i2c mechanism
* cannot be programmed for 1 byte writes.
* currently there are no known i2c parts that we need to probe
* and that cannot be safely read.
*/
static int i2cProbe( bktr_ptr_t bktr, int addr );
#define BITD 40
#define EXTRA_START
/*
* probe for an I2C device at addr.
*/
static int
i2cProbe( bktr_ptr_t bktr, int addr )
{
int x, status;
/* the START */
#if defined( EXTRA_START )
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release clock */
#endif /* EXTRA_START */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* lower data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock */
/* write addr */
for ( x = 7; x >= 0; --x ) {
if ( addr & (1<<x) ) {
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* assert HI data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1);
DELAY( BITD ); /* release clock */
}
else {
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* assert LO data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2);
DELAY( BITD ); /* strobe clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0);
DELAY( BITD ); /* release clock */
}
}
/* look for an ACK */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* float data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* strobe clock */
status = INL(bktr, BKTR_I2C_DATA_CTL) & 1; /* read the ACK bit */
OUTL(bktr, BKTR_I2C_DATA_CTL, 1); DELAY( BITD ); /* release clock */
/* the STOP */
OUTL(bktr, BKTR_I2C_DATA_CTL, 0); DELAY( BITD ); /* lower clock & data */
OUTL(bktr, BKTR_I2C_DATA_CTL, 2); DELAY( BITD ); /* release clock */
OUTL(bktr, BKTR_I2C_DATA_CTL, 3); DELAY( BITD ); /* release data */
return( status );
}
#undef EXTRA_START
#undef BITD
#endif /* I2C_SOFTWARE_PROBE */
#define ABSENT (-1)
#endif /* FreeBSD, BSDI, NetBSD, OpenBSD */
Index: head/sys/dev/hwpmc/hwpmc_logging.c
===================================================================
--- head/sys/dev/hwpmc/hwpmc_logging.c (revision 225616)
+++ head/sys/dev/hwpmc/hwpmc_logging.c (revision 225617)
@@ -1,1025 +1,1025 @@
/*-
* Copyright (c) 2005-2007 Joseph Koshy
* Copyright (c) 2007 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* Logging code for hwpmc(4)
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/pmc.h>
#include <sys/pmckern.h>
#include <sys/pmclog.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
/*
* Sysctl tunables
*/
SYSCTL_DECL(_kern_hwpmc);
/*
* kern.hwpmc.logbuffersize -- size of the per-cpu owner buffers.
*/
static int pmclog_buffer_size = PMC_LOG_BUFFER_SIZE;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "logbuffersize", &pmclog_buffer_size);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, logbuffersize, CTLFLAG_TUN|CTLFLAG_RD,
&pmclog_buffer_size, 0, "size of log buffers in kilobytes");
/*
* kern.hwpmc.nbuffer -- number of global log buffers
*/
static int pmc_nlogbuffers = PMC_NLOGBUFFERS;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nbuffers", &pmc_nlogbuffers);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, nbuffers, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_nlogbuffers, 0, "number of global log buffers");
/*
* Global log buffer list and associated spin lock.
*/
TAILQ_HEAD(, pmclog_buffer) pmc_bufferlist =
TAILQ_HEAD_INITIALIZER(pmc_bufferlist);
static struct mtx pmc_bufferlist_mtx; /* spin lock */
static struct mtx pmc_kthread_mtx; /* sleep lock */
#define PMCLOG_INIT_BUFFER_DESCRIPTOR(D) do { \
const int __roundup = roundup(sizeof(*D), \
sizeof(uint32_t)); \
(D)->plb_fence = ((char *) (D)) + \
1024*pmclog_buffer_size; \
(D)->plb_base = (D)->plb_ptr = ((char *) (D)) + \
__roundup; \
} while (0)
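/*
 * A buffer descriptor lives at the front of its own storage: plb_base
 * and plb_ptr start just past the (uint32_t-rounded) descriptor and
 * plb_fence marks the end of the pmclog_buffer_size KB region.
 */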
/*
* Log file record constructors.
*/
#define _PMCLOG_TO_HEADER(T,L) \
((PMCLOG_HEADER_MAGIC << 24) | \
(PMCLOG_TYPE_ ## T << 16) | \
((L) & 0xFFFF))
/* reserve LEN bytes of space and initialize the entry header */
#define _PMCLOG_RESERVE(PO,TYPE,LEN,ACTION) do { \
uint32_t *_le; \
int _len = roundup((LEN), sizeof(uint32_t)); \
if ((_le = pmclog_reserve((PO), _len)) == NULL) { \
ACTION; \
} \
*_le = _PMCLOG_TO_HEADER(TYPE,_len); \
_le += 3 /* skip over timestamp */
#define PMCLOG_RESERVE(P,T,L) _PMCLOG_RESERVE(P,T,L,return)
#define PMCLOG_RESERVE_WITH_ERROR(P,T,L) _PMCLOG_RESERVE(P,T,L, \
error=ENOMEM;goto error)
#define PMCLOG_EMIT32(V) do { *_le++ = (V); } while (0)
#define PMCLOG_EMIT64(V) do { \
*_le++ = (uint32_t) ((V) & 0xFFFFFFFF); \
*_le++ = (uint32_t) (((V) >> 32) & 0xFFFFFFFF); \
} while (0)
/* Emit a string. Caution: does NOT update _le, so needs to be last */
#define PMCLOG_EMITSTRING(S,L) do { bcopy((S), _le, (L)); } while (0)
#define PMCLOG_DESPATCH(PO) \
pmclog_release((PO)); \
} while (0)
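/*
 * Typical usage of the record constructors (see the pmclog_process_*()
 * functions below):
 *
 *	PMCLOG_RESERVE(po, CLOSELOG, sizeof(struct pmclog_closelog));
 *	PMCLOG_EMIT32(value);
 *	PMCLOG_DESPATCH(po);
 *
 * PMCLOG_RESERVE() opens the "do {" block that PMCLOG_DESPATCH()
 * closes with "} while (0)", so the two must always be paired within
 * the same scope.
 */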
/*
* Assertions about the log file format.
*/
CTASSERT(sizeof(struct pmclog_callchain) == 6*4 +
PMC_CALLCHAIN_DEPTH_MAX*sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_closelog) == 3*4);
CTASSERT(sizeof(struct pmclog_dropnotify) == 3*4);
CTASSERT(sizeof(struct pmclog_map_in) == PATH_MAX +
4*4 + sizeof(uintfptr_t));
CTASSERT(offsetof(struct pmclog_map_in,pl_pathname) ==
4*4 + sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_map_out) == 4*4 + 2*sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_pcsample) == 6*4 + sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_pmcallocate) == 6*4);
CTASSERT(sizeof(struct pmclog_pmcattach) == 5*4 + PATH_MAX);
CTASSERT(offsetof(struct pmclog_pmcattach,pl_pathname) == 5*4);
CTASSERT(sizeof(struct pmclog_pmcdetach) == 5*4);
CTASSERT(sizeof(struct pmclog_proccsw) == 5*4 + 8);
CTASSERT(sizeof(struct pmclog_procexec) == 5*4 + PATH_MAX +
sizeof(uintfptr_t));
CTASSERT(offsetof(struct pmclog_procexec,pl_pathname) == 5*4 +
sizeof(uintfptr_t));
CTASSERT(sizeof(struct pmclog_procexit) == 5*4 + 8);
CTASSERT(sizeof(struct pmclog_procfork) == 5*4);
CTASSERT(sizeof(struct pmclog_sysexit) == 4*4);
CTASSERT(sizeof(struct pmclog_userdata) == 4*4);
/*
* Log buffer structure
*/
struct pmclog_buffer {
TAILQ_ENTRY(pmclog_buffer) plb_next;
char *plb_base;
char *plb_ptr;
char *plb_fence;
};
/*
* Prototypes
*/
static int pmclog_get_buffer(struct pmc_owner *po);
static void pmclog_loop(void *arg);
static void pmclog_release(struct pmc_owner *po);
static uint32_t *pmclog_reserve(struct pmc_owner *po, int length);
static void pmclog_schedule_io(struct pmc_owner *po);
static void pmclog_stop_kthread(struct pmc_owner *po);
/*
* Helper functions
*/
/*
* Get a log buffer
*/
static int
pmclog_get_buffer(struct pmc_owner *po)
{
struct pmclog_buffer *plb;
mtx_assert(&po->po_mtx, MA_OWNED);
KASSERT(po->po_curbuf == NULL,
("[pmclog,%d] po=%p current buffer still valid", __LINE__, po));
mtx_lock_spin(&pmc_bufferlist_mtx);
if ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL)
TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
PMCDBG(LOG,GTB,1, "po=%p plb=%p", po, plb);
#ifdef DEBUG
if (plb)
KASSERT(plb->plb_ptr == plb->plb_base &&
plb->plb_base < plb->plb_fence,
("[pmclog,%d] po=%p buffer invariants: ptr=%p "
"base=%p fence=%p", __LINE__, po, plb->plb_ptr,
plb->plb_base, plb->plb_fence));
#endif
po->po_curbuf = plb;
/* update stats */
atomic_add_int(&pmc_stats.pm_buffer_requests, 1);
if (plb == NULL)
atomic_add_int(&pmc_stats.pm_buffer_requests_failed, 1);
return (plb ? 0 : ENOMEM);
}
/*
* Log handler loop.
*
* This function is executed by each pmc owner's helper thread.
*/
static void
pmclog_loop(void *arg)
{
int error, last_buffer;
struct pmc_owner *po;
struct pmclog_buffer *lb;
struct proc *p;
struct ucred *ownercred;
struct ucred *mycred;
struct thread *td;
struct uio auio;
struct iovec aiov;
size_t nbytes;
po = (struct pmc_owner *) arg;
p = po->po_owner;
td = curthread;
mycred = td->td_ucred;
last_buffer = 0;
PROC_LOCK(p);
ownercred = crhold(p->p_ucred);
PROC_UNLOCK(p);
PMCDBG(LOG,INI,1, "po=%p kt=%p", po, po->po_kthread);
KASSERT(po->po_kthread == curthread->td_proc,
("[pmclog,%d] proc mismatch po=%p po/kt=%p curproc=%p", __LINE__,
po, po->po_kthread, curthread->td_proc));
lb = NULL;
/*
* Loop waiting for I/O requests to be added to the owner
* struct's queue. The loop is exited when the log file
* is deconfigured.
*/
mtx_lock(&pmc_kthread_mtx);
for (;;) {
/* check if we've been asked to exit */
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
break;
if (lb == NULL) { /* look for a fresh buffer to write */
mtx_lock_spin(&po->po_mtx);
if ((lb = TAILQ_FIRST(&po->po_logbuffers)) == NULL) {
mtx_unlock_spin(&po->po_mtx);
(void) msleep(po, &pmc_kthread_mtx, PWAIT,
"pmcloop", 0);
continue;
}
TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next);
if (po->po_flags & PMC_PO_SHUTDOWN)
last_buffer = TAILQ_EMPTY(&po->po_logbuffers);
mtx_unlock_spin(&po->po_mtx);
}
mtx_unlock(&pmc_kthread_mtx);
/* process the request */
PMCDBG(LOG,WRI,2, "po=%p base=%p ptr=%p", po,
lb->plb_base, lb->plb_ptr);
/* change our thread's credentials before issuing the I/O */
aiov.iov_base = lb->plb_base;
aiov.iov_len = nbytes = lb->plb_ptr - lb->plb_base;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = -1;
auio.uio_resid = nbytes;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
/* switch thread credentials -- see kern_ktrace.c */
td->td_ucred = ownercred;
error = fo_write(po->po_file, &auio, ownercred, 0, td);
td->td_ucred = mycred;
if (error) {
/* XXX some errors are recoverable */
/* send a SIGIO to the owner and exit */
PROC_LOCK(p);
- psignal(p, SIGIO);
+ kern_psignal(p, SIGIO);
PROC_UNLOCK(p);
mtx_lock(&pmc_kthread_mtx);
po->po_error = error; /* save for flush log */
PMCDBG(LOG,WRI,2, "po=%p error=%d", po, error);
break;
}
if (last_buffer) {
/*
* Close the file to get PMCLOG_EOF error
* in pmclog(3).
*/
fo_close(po->po_file, curthread);
}
mtx_lock(&pmc_kthread_mtx);
/* put the used buffer back into the global pool */
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
lb = NULL;
}
po->po_kthread = NULL;
mtx_unlock(&pmc_kthread_mtx);
/* return the current I/O buffer to the global pool */
if (lb) {
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
}
/*
* Exit this thread, signalling the waiter
*/
crfree(ownercred);
kproc_exit(0);
}
/*
* Release a log entry and schedule an I/O if needed.
*/
static void
pmclog_release(struct pmc_owner *po)
{
KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base,
("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base));
KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence));
/* schedule an I/O if we've filled a buffer */
if (po->po_curbuf->plb_ptr >= po->po_curbuf->plb_fence)
pmclog_schedule_io(po);
mtx_unlock_spin(&po->po_mtx);
PMCDBG(LOG,REL,1, "po=%p", po);
}
/*
* Attempt to reserve 'length' bytes of space in an owner's log
* buffer. The function returns a pointer to 'length' bytes of space
* if there was enough space or returns NULL if no space was
* available. Non-null returns do so with the po mutex locked. The
* caller must invoke pmclog_release() on the pmc owner structure
* when done.
*/
static uint32_t *
pmclog_reserve(struct pmc_owner *po, int length)
{
uintptr_t newptr, oldptr;
uint32_t *lh;
struct timespec ts;
PMCDBG(LOG,ALL,1, "po=%p len=%d", po, length);
KASSERT(length % sizeof(uint32_t) == 0,
("[pmclog,%d] length not a multiple of word size", __LINE__));
mtx_lock_spin(&po->po_mtx);
/* No more data when shutdown in progress. */
if (po->po_flags & PMC_PO_SHUTDOWN) {
mtx_unlock_spin(&po->po_mtx);
return (NULL);
}
if (po->po_curbuf == NULL)
if (pmclog_get_buffer(po) != 0) {
mtx_unlock_spin(&po->po_mtx);
return (NULL);
}
KASSERT(po->po_curbuf != NULL,
("[pmclog,%d] po=%p no current buffer", __LINE__, po));
KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base &&
po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p",
__LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base,
po->po_curbuf->plb_fence));
oldptr = (uintptr_t) po->po_curbuf->plb_ptr;
newptr = oldptr + length;
KASSERT(oldptr != (uintptr_t) NULL,
("[pmclog,%d] po=%p Null log buffer pointer", __LINE__, po));
/*
* If we have space in the current buffer, return a pointer to
* available space with the PO structure locked.
*/
if (newptr <= (uintptr_t) po->po_curbuf->plb_fence) {
po->po_curbuf->plb_ptr = (char *) newptr;
goto done;
}
/*
* Otherwise, schedule the current buffer for output and get a
* fresh buffer.
*/
pmclog_schedule_io(po);
if (pmclog_get_buffer(po) != 0) {
mtx_unlock_spin(&po->po_mtx);
return (NULL);
}
KASSERT(po->po_curbuf != NULL,
("[pmclog,%d] po=%p no current buffer", __LINE__, po));
KASSERT(po->po_curbuf->plb_ptr != NULL,
("[pmclog,%d] null return from pmc_get_log_buffer", __LINE__));
KASSERT(po->po_curbuf->plb_ptr == po->po_curbuf->plb_base &&
po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] po=%p buffer invariants: ptr=%p base=%p fence=%p",
__LINE__, po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base,
po->po_curbuf->plb_fence));
oldptr = (uintptr_t) po->po_curbuf->plb_ptr;
done:
lh = (uint32_t *) oldptr;
lh++; /* skip header */
getnanotime(&ts); /* fill in the timestamp */
*lh++ = ts.tv_sec & 0xFFFFFFFF;
*lh++ = ts.tv_nsec & 0xFFFFFFF;
return ((uint32_t *) oldptr);
}
/*
* Schedule an I/O.
*
* Transfer the current buffer to the helper kthread.
*/
static void
pmclog_schedule_io(struct pmc_owner *po)
{
KASSERT(po->po_curbuf != NULL,
("[pmclog,%d] schedule_io with null buffer po=%p", __LINE__, po));
KASSERT(po->po_curbuf->plb_ptr >= po->po_curbuf->plb_base,
("[pmclog,%d] buffer invariants po=%p ptr=%p base=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_base));
KASSERT(po->po_curbuf->plb_ptr <= po->po_curbuf->plb_fence,
("[pmclog,%d] buffer invariants po=%p ptr=%p fenc=%p", __LINE__,
po, po->po_curbuf->plb_ptr, po->po_curbuf->plb_fence));
PMCDBG(LOG,SIO, 1, "po=%p", po);
mtx_assert(&po->po_mtx, MA_OWNED);
/*
* Add the current buffer to the tail of the buffer list and
* wakeup the helper.
*/
TAILQ_INSERT_TAIL(&po->po_logbuffers, po->po_curbuf, plb_next);
po->po_curbuf = NULL;
wakeup_one(po);
}
/*
* Stop the helper kthread.
*/
static void
pmclog_stop_kthread(struct pmc_owner *po)
{
/*
* Close the file to force the thread out of fo_write,
* unset flag, wakeup the helper thread,
* wait for it to exit
*/
if (po->po_file != NULL)
fo_close(po->po_file, curthread);
mtx_lock(&pmc_kthread_mtx);
po->po_flags &= ~PMC_PO_OWNS_LOGFILE;
wakeup_one(po);
if (po->po_kthread)
msleep(po->po_kthread, &pmc_kthread_mtx, PPAUSE, "pmckstp", 0);
mtx_unlock(&pmc_kthread_mtx);
}
/*
* Public functions
*/
/*
* Configure a log file for pmc owner 'po'.
*
* Parameter 'logfd' is a file handle referencing an open file in the
* owner process. This file needs to have been opened for writing.
*/
int
pmclog_configure_log(struct pmc_mdep *md, struct pmc_owner *po, int logfd)
{
int error;
struct proc *p;
/*
* As long as it is possible to get a LOR between pmc_sx lock and
* proctree/allproc sx locks used for adding a new process, assure
* the former is not held here.
*/
sx_assert(&pmc_sx, SA_UNLOCKED);
PMCDBG(LOG,CFG,1, "config po=%p logfd=%d", po, logfd);
p = po->po_owner;
/* return EBUSY if a log file was already present */
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
return (EBUSY);
KASSERT(po->po_kthread == NULL,
("[pmclog,%d] po=%p kthread (%p) already present", __LINE__, po,
po->po_kthread));
KASSERT(po->po_file == NULL,
("[pmclog,%d] po=%p file (%p) already present", __LINE__, po,
po->po_file));
/* get a reference to the file state */
error = fget_write(curthread, logfd, CAP_WRITE, &po->po_file);
if (error)
goto error;
/* mark process as owning a log file */
po->po_flags |= PMC_PO_OWNS_LOGFILE;
error = kproc_create(pmclog_loop, po, &po->po_kthread,
RFHIGHPID, 0, "hwpmc: proc(%d)", p->p_pid);
if (error)
goto error;
/* mark process as using HWPMCs */
PROC_LOCK(p);
p->p_flag |= P_HWPMC;
PROC_UNLOCK(p);
/* create a log initialization entry */
PMCLOG_RESERVE_WITH_ERROR(po, INITIALIZE,
sizeof(struct pmclog_initialize));
PMCLOG_EMIT32(PMC_VERSION);
PMCLOG_EMIT32(md->pmd_cputype);
PMCLOG_DESPATCH(po);
return (0);
error:
/* shutdown the thread */
if (po->po_kthread)
pmclog_stop_kthread(po);
KASSERT(po->po_kthread == NULL, ("[pmclog,%d] po=%p kthread not "
"stopped", __LINE__, po));
if (po->po_file)
(void) fdrop(po->po_file, curthread);
po->po_file = NULL; /* clear file and error state */
po->po_error = 0;
return (error);
}
/*
* De-configure a log file. This will throw away any buffers queued
* for this owner process.
*/
int
pmclog_deconfigure_log(struct pmc_owner *po)
{
int error;
struct pmclog_buffer *lb;
PMCDBG(LOG,CFG,1, "de-config po=%p", po);
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
return (EINVAL);
KASSERT(po->po_sscount == 0,
("[pmclog,%d] po=%p still owning SS PMCs", __LINE__, po));
KASSERT(po->po_file != NULL,
("[pmclog,%d] po=%p no log file", __LINE__, po));
/* stop the kthread, this will reset the 'OWNS_LOGFILE' flag */
if (po->po_kthread)
pmclog_stop_kthread(po);
KASSERT(po->po_kthread == NULL,
("[pmclog,%d] po=%p kthread not stopped", __LINE__, po));
/* return all queued log buffers to the global pool */
while ((lb = TAILQ_FIRST(&po->po_logbuffers)) != NULL) {
TAILQ_REMOVE(&po->po_logbuffers, lb, plb_next);
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
}
/* return the 'current' buffer to the global pool */
if ((lb = po->po_curbuf) != NULL) {
PMCLOG_INIT_BUFFER_DESCRIPTOR(lb);
mtx_lock_spin(&pmc_bufferlist_mtx);
TAILQ_INSERT_HEAD(&pmc_bufferlist, lb, plb_next);
mtx_unlock_spin(&pmc_bufferlist_mtx);
}
/* drop a reference to the fd */
error = fdrop(po->po_file, curthread);
po->po_file = NULL;
po->po_error = 0;
return (error);
}
/*
* Flush a process' log buffer.
*/
int
pmclog_flush(struct pmc_owner *po)
{
int error;
PMCDBG(LOG,FLS,1, "po=%p", po);
/*
* If there is a pending error recorded by the logger thread,
* return that.
*/
if (po->po_error)
return (po->po_error);
error = 0;
/*
* Check that we do have an active log file.
*/
mtx_lock(&pmc_kthread_mtx);
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) {
error = EINVAL;
goto error;
}
/*
* Schedule the current buffer if any.
*/
mtx_lock_spin(&po->po_mtx);
if (po->po_curbuf)
pmclog_schedule_io(po);
mtx_unlock_spin(&po->po_mtx);
/*
* Initiate shutdown: no new data queued,
* thread will close file on last block.
*/
po->po_flags |= PMC_PO_SHUTDOWN;
error:
mtx_unlock(&pmc_kthread_mtx);
return (error);
}
void
pmclog_process_callchain(struct pmc *pm, struct pmc_sample *ps)
{
int n, recordlen;
uint32_t flags;
struct pmc_owner *po;
PMCDBG(LOG,SAM,1,"pm=%p pid=%d n=%d", pm, ps->ps_pid,
ps->ps_nsamples);
recordlen = offsetof(struct pmclog_callchain, pl_pc) +
ps->ps_nsamples * sizeof(uintfptr_t);
po = pm->pm_owner;
flags = PMC_CALLCHAIN_TO_CPUFLAGS(ps->ps_cpu,ps->ps_flags);
PMCLOG_RESERVE(po, CALLCHAIN, recordlen);
PMCLOG_EMIT32(ps->ps_pid);
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(flags);
for (n = 0; n < ps->ps_nsamples; n++)
PMCLOG_EMITADDR(ps->ps_pc[n]);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_closelog(struct pmc_owner *po)
{
PMCLOG_RESERVE(po,CLOSELOG,sizeof(struct pmclog_closelog));
PMCLOG_DESPATCH(po);
}
void
pmclog_process_dropnotify(struct pmc_owner *po)
{
PMCLOG_RESERVE(po,DROPNOTIFY,sizeof(struct pmclog_dropnotify));
PMCLOG_DESPATCH(po);
}
void
pmclog_process_map_in(struct pmc_owner *po, pid_t pid, uintfptr_t start,
const char *path)
{
int pathlen, recordlen;
KASSERT(path != NULL, ("[pmclog,%d] map-in, null path", __LINE__));
pathlen = strlen(path) + 1; /* #bytes for path name */
recordlen = offsetof(struct pmclog_map_in, pl_pathname) +
pathlen;
PMCLOG_RESERVE(po, MAP_IN, recordlen);
PMCLOG_EMIT32(pid);
PMCLOG_EMITADDR(start);
PMCLOG_EMITSTRING(path,pathlen);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_map_out(struct pmc_owner *po, pid_t pid, uintfptr_t start,
uintfptr_t end)
{
KASSERT(start <= end, ("[pmclog,%d] start > end", __LINE__));
PMCLOG_RESERVE(po, MAP_OUT, sizeof(struct pmclog_map_out));
PMCLOG_EMIT32(pid);
PMCLOG_EMITADDR(start);
PMCLOG_EMITADDR(end);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_pmcallocate(struct pmc *pm)
{
struct pmc_owner *po;
po = pm->pm_owner;
PMCDBG(LOG,ALL,1, "pm=%p", pm);
PMCLOG_RESERVE(po, PMCALLOCATE, sizeof(struct pmclog_pmcallocate));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(pm->pm_event);
PMCLOG_EMIT32(pm->pm_flags);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_pmcattach(struct pmc *pm, pid_t pid, char *path)
{
int pathlen, recordlen;
struct pmc_owner *po;
PMCDBG(LOG,ATT,1,"pm=%p pid=%d", pm, pid);
po = pm->pm_owner;
pathlen = strlen(path) + 1; /* #bytes for the string */
recordlen = offsetof(struct pmclog_pmcattach, pl_pathname) + pathlen;
PMCLOG_RESERVE(po, PMCATTACH, recordlen);
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(pid);
PMCLOG_EMITSTRING(path, pathlen);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_pmcdetach(struct pmc *pm, pid_t pid)
{
struct pmc_owner *po;
PMCDBG(LOG,ATT,1,"!pm=%p pid=%d", pm, pid);
po = pm->pm_owner;
PMCLOG_RESERVE(po, PMCDETACH, sizeof(struct pmclog_pmcdetach));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT32(pid);
PMCLOG_DESPATCH(po);
}
/*
* Log a context switch event to the log file.
*/
void
pmclog_process_proccsw(struct pmc *pm, struct pmc_process *pp, pmc_value_t v)
{
struct pmc_owner *po;
KASSERT(pm->pm_flags & PMC_F_LOG_PROCCSW,
("[pmclog,%d] log-process-csw called gratuitously", __LINE__));
PMCDBG(LOG,SWO,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid,
v);
po = pm->pm_owner;
PMCLOG_RESERVE(po, PROCCSW, sizeof(struct pmclog_proccsw));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT64(v);
PMCLOG_EMIT32(pp->pp_proc->p_pid);
PMCLOG_DESPATCH(po);
}
void
pmclog_process_procexec(struct pmc_owner *po, pmc_id_t pmid, pid_t pid,
uintfptr_t startaddr, char *path)
{
int pathlen, recordlen;
PMCDBG(LOG,EXC,1,"po=%p pid=%d path=\"%s\"", po, pid, path);
pathlen = strlen(path) + 1; /* #bytes for the path */
recordlen = offsetof(struct pmclog_procexec, pl_pathname) + pathlen;
PMCLOG_RESERVE(po, PROCEXEC, recordlen);
PMCLOG_EMIT32(pid);
PMCLOG_EMITADDR(startaddr);
PMCLOG_EMIT32(pmid);
PMCLOG_EMITSTRING(path,pathlen);
PMCLOG_DESPATCH(po);
}
/*
* Log a process exit event (and accumulated pmc value) to the log file.
*/
void
pmclog_process_procexit(struct pmc *pm, struct pmc_process *pp)
{
int ri;
struct pmc_owner *po;
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(LOG,EXT,1,"pm=%p pid=%d v=%jx", pm, pp->pp_proc->p_pid,
pp->pp_pmcs[ri].pp_pmcval);
po = pm->pm_owner;
PMCLOG_RESERVE(po, PROCEXIT, sizeof(struct pmclog_procexit));
PMCLOG_EMIT32(pm->pm_id);
PMCLOG_EMIT64(pp->pp_pmcs[ri].pp_pmcval);
PMCLOG_EMIT32(pp->pp_proc->p_pid);
PMCLOG_DESPATCH(po);
}
/*
* Log a fork event.
*/
void
pmclog_process_procfork(struct pmc_owner *po, pid_t oldpid, pid_t newpid)
{
PMCLOG_RESERVE(po, PROCFORK, sizeof(struct pmclog_procfork));
PMCLOG_EMIT32(oldpid);
PMCLOG_EMIT32(newpid);
PMCLOG_DESPATCH(po);
}
/*
* Log a process exit event of the form suitable for system-wide PMCs.
*/
void
pmclog_process_sysexit(struct pmc_owner *po, pid_t pid)
{
PMCLOG_RESERVE(po, SYSEXIT, sizeof(struct pmclog_sysexit));
PMCLOG_EMIT32(pid);
PMCLOG_DESPATCH(po);
}
/*
* Write a user log entry.
*/
int
pmclog_process_userlog(struct pmc_owner *po, struct pmc_op_writelog *wl)
{
int error;
PMCDBG(LOG,WRI,1, "writelog po=%p ud=0x%x", po, wl->pm_userdata);
error = 0;
PMCLOG_RESERVE_WITH_ERROR(po, USERDATA,
sizeof(struct pmclog_userdata));
PMCLOG_EMIT32(wl->pm_userdata);
PMCLOG_DESPATCH(po);
error:
return (error);
}
/*
* Initialization.
*
* Create a pool of log buffers and initialize mutexes.
*/
void
pmclog_initialize()
{
int n;
struct pmclog_buffer *plb;
if (pmclog_buffer_size <= 0) {
(void) printf("hwpmc: tunable logbuffersize=%d must be "
"greater than zero.\n", pmclog_buffer_size);
pmclog_buffer_size = PMC_LOG_BUFFER_SIZE;
}
if (pmc_nlogbuffers <= 0) {
(void) printf("hwpmc: tunable nlogbuffers=%d must be greater "
"than zero.\n", pmc_nlogbuffers);
pmc_nlogbuffers = PMC_NLOGBUFFERS;
}
/* create global pool of log buffers */
for (n = 0; n < pmc_nlogbuffers; n++) {
plb = malloc(1024 * pmclog_buffer_size, M_PMC,
M_WAITOK|M_ZERO);
PMCLOG_INIT_BUFFER_DESCRIPTOR(plb);
TAILQ_INSERT_HEAD(&pmc_bufferlist, plb, plb_next);
}
mtx_init(&pmc_bufferlist_mtx, "pmc-buffer-list", "pmc-leaf",
MTX_SPIN);
mtx_init(&pmc_kthread_mtx, "pmc-kthread", "pmc-sleep", MTX_DEF);
}
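The pool created here wires down roughly pmc_nlogbuffers * pmclog_buffer_size kilobytes of kernel memory, since each buffer is allocated as 1024 * pmclog_buffer_size bytes. As an illustrative example (the numbers are not the defaults, which come from PMC_NLOGBUFFERS and PMC_LOG_BUFFER_SIZE defined elsewhere), setting the tunables to 64 buffers of 16 KB each would reserve 1 MB for log records.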
/*
* Shutdown logging.
*
* Destroy mutexes and release memory back to the free pool.
*/
void
pmclog_shutdown()
{
struct pmclog_buffer *plb;
mtx_destroy(&pmc_kthread_mtx);
mtx_destroy(&pmc_bufferlist_mtx);
while ((plb = TAILQ_FIRST(&pmc_bufferlist)) != NULL) {
TAILQ_REMOVE(&pmc_bufferlist, plb, plb_next);
free(plb, M_PMC);
}
}
Index: head/sys/dev/hwpmc/hwpmc_mod.c
===================================================================
--- head/sys/dev/hwpmc/hwpmc_mod.c (revision 225616)
+++ head/sys/dev/hwpmc/hwpmc_mod.c (revision 225617)
@@ -1,4949 +1,4949 @@
/*-
* Copyright (c) 2003-2008 Joseph Koshy
* Copyright (c) 2007 The FreeBSD Foundation
* All rights reserved.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/pmc.h>
#include <sys/pmckern.h>
#include <sys/pmclog.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/linker.h> /* needs to be after <sys/malloc.h> */
#include <machine/atomic.h>
#include <machine/md_var.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
/*
* Types
*/
enum pmc_flags {
PMC_FLAG_NONE = 0x00, /* do nothing */
PMC_FLAG_REMOVE = 0x01, /* atomically remove entry from hash */
PMC_FLAG_ALLOCATE = 0x02, /* add entry to hash if not found */
};
/*
* The offset in sysent where the syscall is allocated.
*/
static int pmc_syscall_num = NO_SYSCALL;
struct pmc_cpu **pmc_pcpu; /* per-cpu state */
pmc_value_t *pmc_pcpu_saved; /* saved PMC values: CSW handling */
#define PMC_PCPU_SAVED(C,R) pmc_pcpu_saved[(R) + md->pmd_npmc*(C)]
struct mtx_pool *pmc_mtxpool;
static int *pmc_pmcdisp; /* PMC row dispositions */
#define PMC_ROW_DISP_IS_FREE(R) (pmc_pmcdisp[(R)] == 0)
#define PMC_ROW_DISP_IS_THREAD(R) (pmc_pmcdisp[(R)] > 0)
#define PMC_ROW_DISP_IS_STANDALONE(R) (pmc_pmcdisp[(R)] < 0)
#define PMC_MARK_ROW_FREE(R) do { \
pmc_pmcdisp[(R)] = 0; \
} while (0)
#define PMC_MARK_ROW_STANDALONE(R) do { \
KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
atomic_add_int(&pmc_pmcdisp[(R)], -1); \
KASSERT(pmc_pmcdisp[(R)] >= (-pmc_cpu_max_active()), \
("[pmc,%d] row disposition error", __LINE__)); \
} while (0)
#define PMC_UNMARK_ROW_STANDALONE(R) do { \
atomic_add_int(&pmc_pmcdisp[(R)], 1); \
KASSERT(pmc_pmcdisp[(R)] <= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
} while (0)
#define PMC_MARK_ROW_THREAD(R) do { \
KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
atomic_add_int(&pmc_pmcdisp[(R)], 1); \
} while (0)
#define PMC_UNMARK_ROW_THREAD(R) do { \
atomic_add_int(&pmc_pmcdisp[(R)], -1); \
KASSERT(pmc_pmcdisp[(R)] >= 0, ("[pmc,%d] row disposition error", \
__LINE__)); \
} while (0)
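The disposition array encodes three states in the sign of each counter: a value of zero means row R is free, a positive value counts the process-virtual (thread-mode) PMCs currently using the row, and a negative value counts system-wide (standalone) claims, bounded below by -pmc_cpu_max_active(). The KASSERTs in the macros above keep the two modes from ever mixing on one row; for example, a row with pmc_pmcdisp[R] == 3 has three thread-mode users and cannot be marked standalone until all three are released.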
/* various event handlers */
static eventhandler_tag pmc_exit_tag, pmc_fork_tag;
/* Module statistics */
struct pmc_op_getdriverstats pmc_stats;
/* Machine/processor dependent operations */
static struct pmc_mdep *md;
/*
* Hash tables mapping owner processes and target threads to PMCs.
*/
struct mtx pmc_processhash_mtx; /* spin mutex */
static u_long pmc_processhashmask;
static LIST_HEAD(pmc_processhash, pmc_process) *pmc_processhash;
/*
* Hash table of PMC owner descriptors. This table is protected by
* the shared PMC "sx" lock.
*/
static u_long pmc_ownerhashmask;
static LIST_HEAD(pmc_ownerhash, pmc_owner) *pmc_ownerhash;
/*
* List of PMC owners with system-wide sampling PMCs.
*/
static LIST_HEAD(, pmc_owner) pmc_ss_owners;
/*
* A map of row indices to classdep structures.
*/
static struct pmc_classdep **pmc_rowindex_to_classdep;
/*
* Prototypes
*/
#ifdef DEBUG
static int pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS);
static int pmc_debugflags_parse(char *newstr, char *fence);
#endif
static int load(struct module *module, int cmd, void *arg);
static int pmc_attach_process(struct proc *p, struct pmc *pm);
static struct pmc *pmc_allocate_pmc_descriptor(void);
static struct pmc_owner *pmc_allocate_owner_descriptor(struct proc *p);
static int pmc_attach_one_process(struct proc *p, struct pmc *pm);
static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri,
int cpu);
static int pmc_can_attach(struct pmc *pm, struct proc *p);
static void pmc_capture_user_callchain(int cpu, struct trapframe *tf);
static void pmc_cleanup(void);
static int pmc_detach_process(struct proc *p, struct pmc *pm);
static int pmc_detach_one_process(struct proc *p, struct pmc *pm,
int flags);
static void pmc_destroy_owner_descriptor(struct pmc_owner *po);
static struct pmc_owner *pmc_find_owner_descriptor(struct proc *p);
static int pmc_find_pmc(pmc_id_t pmcid, struct pmc **pm);
static struct pmc *pmc_find_pmc_descriptor_in_process(struct pmc_owner *po,
pmc_id_t pmc);
static struct pmc_process *pmc_find_process_descriptor(struct proc *p,
uint32_t mode);
static void pmc_force_context_switch(void);
static void pmc_link_target_process(struct pmc *pm,
struct pmc_process *pp);
static void pmc_log_all_process_mappings(struct pmc_owner *po);
static void pmc_log_kernel_mappings(struct pmc *pm);
static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p);
static void pmc_maybe_remove_owner(struct pmc_owner *po);
static void pmc_process_csw_in(struct thread *td);
static void pmc_process_csw_out(struct thread *td);
static void pmc_process_exit(void *arg, struct proc *p);
static void pmc_process_fork(void *arg, struct proc *p1,
struct proc *p2, int n);
static void pmc_process_samples(int cpu);
static void pmc_release_pmc_descriptor(struct pmc *pmc);
static void pmc_remove_owner(struct pmc_owner *po);
static void pmc_remove_process_descriptor(struct pmc_process *pp);
static void pmc_restore_cpu_binding(struct pmc_binding *pb);
static void pmc_save_cpu_binding(struct pmc_binding *pb);
static void pmc_select_cpu(int cpu);
static int pmc_start(struct pmc *pm);
static int pmc_stop(struct pmc *pm);
static int pmc_syscall_handler(struct thread *td, void *syscall_args);
static void pmc_unlink_target_process(struct pmc *pmc,
struct pmc_process *pp);
/*
* Kernel tunables and sysctl(8) interface.
*/
SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters");
static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "callchaindepth", &pmc_callchaindepth);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_callchaindepth, 0, "depth of call chain records");
#ifdef DEBUG
struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS;
char pmc_debugstr[PMC_DEBUG_STRSIZE];
TUNABLE_STR(PMC_SYSCTL_NAME_PREFIX "debugflags", pmc_debugstr,
sizeof(pmc_debugstr));
SYSCTL_PROC(_kern_hwpmc, OID_AUTO, debugflags,
CTLTYPE_STRING|CTLFLAG_RW|CTLFLAG_TUN,
0, 0, pmc_debugflags_sysctl_handler, "A", "debug flags");
#endif
/*
* kern.hwpmc.hashsize -- determines the number of rows in the hash
* tables used to look up target processes and owner processes
*/
static int pmc_hashsize = PMC_HASH_SIZE;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "hashsize", &pmc_hashsize);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_hashsize, 0, "rows in hash tables");
/*
* kern.hwpmc.nsamples --- number of PC samples/callchain stacks per CPU
*/
static int pmc_nsamples = PMC_NSAMPLES;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nsamples", &pmc_nsamples);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_nsamples, 0, "number of PC samples per CPU");
/*
* kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool.
*/
static int pmc_mtxpool_size = PMC_MTXPOOL_SIZE;
TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "mtxpoolsize", &pmc_mtxpool_size);
SYSCTL_INT(_kern_hwpmc, OID_AUTO, mtxpoolsize, CTLFLAG_TUN|CTLFLAG_RD,
&pmc_mtxpool_size, 0, "size of spin mutex pool");
/*
* security.bsd.unprivileged_syspmcs -- allow non-root processes to
* allocate system-wide PMCs.
*
* Allowing unprivileged processes to allocate system PMCs is convenient
* if system-wide measurements need to be taken concurrently with other
* per-process measurements. This feature is turned off by default.
*/
static int pmc_unprivileged_syspmcs = 0;
TUNABLE_INT("security.bsd.unprivileged_syspmcs", &pmc_unprivileged_syspmcs);
SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_syspmcs, CTLFLAG_RW,
&pmc_unprivileged_syspmcs, 0,
"allow unprivileged process to allocate system PMCs");
/*
* Hash function. Discard the lower 2 bits of the pointer since
* these are always zero for our uses. The hash multiplier is
* round((2^LONG_BIT) * ((sqrt(5)-1)/2)).
*/
#if LONG_BIT == 64
#define _PMC_HM 11400714819323198486u
#elif LONG_BIT == 32
#define _PMC_HM 2654435769u
#else
#error Must know the size of 'long' to compile
#endif
#define PMC_HASH_PTR(P,M) ((((unsigned long) (P) >> 2) * _PMC_HM) & (M))
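A small stand-alone illustration of the multiplicative (Fibonacci) hash above, assuming LONG_BIT == 64; the 256-row mask and the sample pointers are arbitrary, chosen only to show how nearby pointers scatter across rows.
#include <stdio.h>

#define _PMC_HM	11400714819323198486u	/* 64-bit golden-ratio multiplier */
#define PMC_HASH_PTR(P,M) ((((unsigned long) (P) >> 2) * _PMC_HM) & (M))

int
main(void)
{
	unsigned long mask = 255;	/* a 256-row table, mask = rows - 1 */
	int dummy[4];
	int i;

	/* Nearby addresses land on well-separated rows. */
	for (i = 0; i < 4; i++)
		printf("%p -> row %lu\n", (void *)&dummy[i],
		    PMC_HASH_PTR(&dummy[i], mask));
	return (0);
}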
/*
* Syscall structures
*/
/* The `sysent' for the new syscall */
static struct sysent pmc_sysent = {
2, /* sy_narg */
pmc_syscall_handler /* sy_call */
};
static struct syscall_module_data pmc_syscall_mod = {
load,
NULL,
&pmc_syscall_num,
&pmc_sysent,
{ 0, NULL }
};
static moduledata_t pmc_mod = {
PMC_MODULE_NAME,
syscall_module_handler,
&pmc_syscall_mod
};
DECLARE_MODULE(pmc, pmc_mod, SI_SUB_SMP, SI_ORDER_ANY);
MODULE_VERSION(pmc, PMC_VERSION);
#ifdef DEBUG
enum pmc_dbgparse_state {
PMCDS_WS, /* in whitespace */
PMCDS_MAJOR, /* seen a major keyword */
PMCDS_MINOR
};
static int
pmc_debugflags_parse(char *newstr, char *fence)
{
char c, *p, *q;
struct pmc_debugflags *tmpflags;
int error, found, *newbits, tmp;
size_t kwlen;
tmpflags = malloc(sizeof(*tmpflags), M_PMC, M_WAITOK|M_ZERO);
p = newstr;
error = 0;
for (; p < fence && (c = *p); p++) {
/* skip white space */
if (c == ' ' || c == '\t')
continue;
/* look for a keyword followed by "=" */
for (q = p; p < fence && (c = *p) && c != '='; p++)
;
if (c != '=') {
error = EINVAL;
goto done;
}
kwlen = p - q;
newbits = NULL;
/* lookup flag group name */
#define DBG_SET_FLAG_MAJ(S,F) \
if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \
newbits = &tmpflags->pdb_ ## F;
DBG_SET_FLAG_MAJ("cpu", CPU);
DBG_SET_FLAG_MAJ("csw", CSW);
DBG_SET_FLAG_MAJ("logging", LOG);
DBG_SET_FLAG_MAJ("module", MOD);
DBG_SET_FLAG_MAJ("md", MDP);
DBG_SET_FLAG_MAJ("owner", OWN);
DBG_SET_FLAG_MAJ("pmc", PMC);
DBG_SET_FLAG_MAJ("process", PRC);
DBG_SET_FLAG_MAJ("sampling", SAM);
if (newbits == NULL) {
error = EINVAL;
goto done;
}
p++; /* skip the '=' */
/* Now parse the individual flags */
tmp = 0;
newflag:
for (q = p; p < fence && (c = *p); p++)
if (c == ' ' || c == '\t' || c == ',')
break;
/* p == fence or c == ws or c == "," or c == 0 */
if ((kwlen = p - q) == 0) {
*newbits = tmp;
continue;
}
found = 0;
#define DBG_SET_FLAG_MIN(S,F) \
if (kwlen == sizeof(S)-1 && strncmp(q, S, kwlen) == 0) \
tmp |= found = (1 << PMC_DEBUG_MIN_ ## F)
/* a '*' denotes all possible flags in the group */
if (kwlen == 1 && *q == '*')
tmp = found = ~0;
/* look for individual flag names */
DBG_SET_FLAG_MIN("allocaterow", ALR);
DBG_SET_FLAG_MIN("allocate", ALL);
DBG_SET_FLAG_MIN("attach", ATT);
DBG_SET_FLAG_MIN("bind", BND);
DBG_SET_FLAG_MIN("config", CFG);
DBG_SET_FLAG_MIN("exec", EXC);
DBG_SET_FLAG_MIN("exit", EXT);
DBG_SET_FLAG_MIN("find", FND);
DBG_SET_FLAG_MIN("flush", FLS);
DBG_SET_FLAG_MIN("fork", FRK);
DBG_SET_FLAG_MIN("getbuf", GTB);
DBG_SET_FLAG_MIN("hook", PMH);
DBG_SET_FLAG_MIN("init", INI);
DBG_SET_FLAG_MIN("intr", INT);
DBG_SET_FLAG_MIN("linktarget", TLK);
DBG_SET_FLAG_MIN("mayberemove", OMR);
DBG_SET_FLAG_MIN("ops", OPS);
DBG_SET_FLAG_MIN("read", REA);
DBG_SET_FLAG_MIN("register", REG);
DBG_SET_FLAG_MIN("release", REL);
DBG_SET_FLAG_MIN("remove", ORM);
DBG_SET_FLAG_MIN("sample", SAM);
DBG_SET_FLAG_MIN("scheduleio", SIO);
DBG_SET_FLAG_MIN("select", SEL);
DBG_SET_FLAG_MIN("signal", SIG);
DBG_SET_FLAG_MIN("swi", SWI);
DBG_SET_FLAG_MIN("swo", SWO);
DBG_SET_FLAG_MIN("start", STA);
DBG_SET_FLAG_MIN("stop", STO);
DBG_SET_FLAG_MIN("syscall", PMS);
DBG_SET_FLAG_MIN("unlinktarget", TUL);
DBG_SET_FLAG_MIN("write", WRI);
if (found == 0) {
/* unrecognized flag name */
error = EINVAL;
goto done;
}
if (c == 0 || c == ' ' || c == '\t') { /* end of flag group */
*newbits = tmp;
continue;
}
p++;
goto newflag;
}
/* save the new flag set */
bcopy(tmpflags, &pmc_debugflags, sizeof(pmc_debugflags));
done:
free(tmpflags, M_PMC);
return error;
}
static int
pmc_debugflags_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
char *fence, *newstr;
int error;
unsigned int n;
(void) arg1; (void) arg2; /* unused parameters */
n = sizeof(pmc_debugstr);
newstr = malloc(n, M_PMC, M_WAITOK|M_ZERO);
(void) strlcpy(newstr, pmc_debugstr, n);
error = sysctl_handle_string(oidp, newstr, n, req);
/* if there is a new string, parse and copy it */
if (error == 0 && req->newptr != NULL) {
fence = newstr + (n < req->newlen ? n : req->newlen + 1);
if ((error = pmc_debugflags_parse(newstr, fence)) == 0)
(void) strlcpy(pmc_debugstr, newstr,
sizeof(pmc_debugstr));
}
free(newstr, M_PMC);
return error;
}
#endif
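Putting the two functions above together: on a DEBUG kernel, the flag string accepted via the kern.hwpmc.debugflags sysctl (or the matching loader tunable) is a space-separated list of <group>=<flag>[,<flag>...] clauses, where the group names are the DBG_SET_FLAG_MAJ keywords (cpu, csw, logging, module, md, owner, pmc, process, sampling) and '*' selects every flag in a group. For example, sysctl kern.hwpmc.debugflags="logging=scheduleio,flush process=attach,exit" would trace buffer scheduling, log flushes and process attach/exit handling; the particular combination is illustrative only.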
/*
* Map a row index to a classdep structure and return the adjusted row
* index for the PMC class index.
*/
static struct pmc_classdep *
pmc_ri_to_classdep(struct pmc_mdep *md, int ri, int *adjri)
{
struct pmc_classdep *pcd;
(void) md;
KASSERT(ri >= 0 && ri < md->pmd_npmc,
("[pmc,%d] illegal row-index %d", __LINE__, ri));
pcd = pmc_rowindex_to_classdep[ri];
KASSERT(pcd != NULL,
("[pmc,%d] ri %d null pcd", __LINE__, ri));
*adjri = ri - pcd->pcd_ri;
KASSERT(*adjri >= 0 && *adjri < pcd->pcd_num,
("[pmc,%d] adjusted row-index %d", __LINE__, *adjri));
return (pcd);
}
/*
* Concurrency Control
*
* The driver manages the following data structures:
*
* - target process descriptors, one per target process
* - owner process descriptors (and attached lists), one per owner process
* - lookup hash tables for owner and target processes
* - PMC descriptors (and attached lists)
* - per-cpu hardware state
* - the 'hook' variable through which the kernel calls into
* this module
* - the machine hardware state (managed by the MD layer)
*
* These data structures are accessed from:
*
* - thread context-switch code
* - interrupt handlers (possibly on multiple cpus)
* - kernel threads on multiple cpus running on behalf of user
* processes doing system calls
* - this driver's private kernel threads
*
* = Locks and Locking strategy =
*
* The driver uses four locking strategies for its operation:
*
* - The global SX lock "pmc_sx" is used to protect internal
* data structures.
*
* Calls into the module by syscall() start with this lock being
* held in exclusive mode. Depending on the requested operation,
* the lock may be downgraded to 'shared' mode to allow more
* concurrent readers into the module. Calls into the module from
* other parts of the kernel acquire the lock in shared mode.
*
* This SX lock is held in exclusive mode for any operations that
* modify the linkages between the driver's internal data structures.
*
* The 'pmc_hook' function pointer is also protected by this lock.
* It is only examined with the sx lock held in exclusive mode. The
* kernel module is allowed to be unloaded only with the sx lock held
* in exclusive mode. In normal syscall handling, after acquiring the
* pmc_sx lock we first check that 'pmc_hook' is non-null before
* proceeding. This prevents races between the thread unloading the module
* and other threads seeking to use the module.
*
* - Lookups of target process structures and owner process structures
* cannot use the global "pmc_sx" SX lock because these lookups need
* to happen during context switches and in other critical sections
* where sleeping is not allowed. We protect these lookup tables
* with their own private spin-mutexes, "pmc_processhash_mtx" and
* "pmc_ownerhash_mtx".
*
* - Interrupt handlers work in a lock free manner. At interrupt
* time, handlers look at the PMC pointer (phw->phw_pmc) configured
* when the PMC was started. If this pointer is NULL, the interrupt
* is ignored after updating driver statistics. We ensure that this
* pointer is set (using an atomic operation if necessary) before the
* PMC hardware is started. Conversely, this pointer is unset atomically
* only after the PMC hardware is stopped.
*
* We ensure that everything needed for the operation of an
* interrupt handler is available without it needing to acquire any
* locks. We also ensure that a PMC's software state is destroyed only
* after the PMC is taken off hardware (on all CPUs).
*
* - Context-switch handling with process-private PMCs needs more
* care.
*
* A given process may be the target of multiple PMCs. For example,
* PMCATTACH and PMCDETACH may be requested by a process on one CPU
* while the target process is running on another. A PMC could also
* be getting released because its owner is exiting. We tackle
* these situations in the following manner:
*
* - each target process structure 'pmc_process' has an array
* of 'struct pmc *' pointers, one for each hardware PMC.
*
* - At context switch IN time, each "target" PMC in RUNNING state
* gets started on hardware and a pointer to each PMC is copied into
* the per-cpu phw array. The 'runcount' for the PMC is
* incremented.
*
* - At context switch OUT time, all process-virtual PMCs are stopped
* on hardware. The saved value is added to the PMCs value field
* only if the PMC is in a non-deleted state (the PMCs state could
* have changed during the current time slice).
*
* Note that since in-between a switch IN on a processor and a switch
* OUT, the PMC could have been released on another CPU. Therefore
* context switch OUT always looks at the hardware state to turn
* OFF PMCs and will update a PMC's saved value only if reachable
* from the target process record.
*
* - OP PMCRELEASE could be called on a PMC at any time (the PMC could
* be attached to many processes at the time of the call and could
* be active on multiple CPUs).
*
* We prevent further scheduling of the PMC by marking it as in
* state 'DELETED'. If the runcount of the PMC is non-zero then
* this PMC is currently running on a CPU somewhere. The thread
* doing the PMCRELEASE operation waits by repeatedly doing a
* pause() till the runcount comes to zero.
*
* The contents of a PMC descriptor (struct pmc) are protected using
* a spin-mutex. In order to save space, we use a mutex pool.
*
* In terms of lock types used by witness(4), we use:
* - Type "pmc-sx", used by the global SX lock.
* - Type "pmc-sleep", for sleep mutexes used by logger threads.
* - Type "pmc-per-proc", for protecting PMC owner descriptors.
* - Type "pmc-leaf", used for all other spin mutexes.
*/
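The syscall-entry discipline described above can be summarized by the following schematic. It is a sketch of the locking pattern only, not the actual pmc_syscall_handler(); the is_read_op predicate is hypothetical and the operation dispatch is elided.
static int
pmc_syscall_locking_sketch(struct thread *td, void *uap)
{
	int error = 0, is_read_op = 0;	/* hypothetical op classification */

	(void)td; (void)uap;
	sx_xlock(&pmc_sx);
	if (pmc_hook == NULL) {		/* module is being unloaded */
		sx_xunlock(&pmc_sx);
		return (ENOSYS);
	}
	if (is_read_op) {
		sx_downgrade(&pmc_sx);	/* admit concurrent readers */
		/* ... perform the read-only operation ... */
		sx_sunlock(&pmc_sx);
	} else {
		/* ... modify the driver's internal linkages ... */
		sx_xunlock(&pmc_sx);
	}
	return (error);
}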
/*
* save the cpu binding of the current kthread
*/
static void
pmc_save_cpu_binding(struct pmc_binding *pb)
{
PMCDBG(CPU,BND,2, "%s", "save-cpu");
thread_lock(curthread);
pb->pb_bound = sched_is_bound(curthread);
pb->pb_cpu = curthread->td_oncpu;
thread_unlock(curthread);
PMCDBG(CPU,BND,2, "save-cpu cpu=%d", pb->pb_cpu);
}
/*
* restore the cpu binding of the current thread
*/
static void
pmc_restore_cpu_binding(struct pmc_binding *pb)
{
PMCDBG(CPU,BND,2, "restore-cpu curcpu=%d restore=%d",
curthread->td_oncpu, pb->pb_cpu);
thread_lock(curthread);
if (pb->pb_bound)
sched_bind(curthread, pb->pb_cpu);
else
sched_unbind(curthread);
thread_unlock(curthread);
PMCDBG(CPU,BND,2, "%s", "restore-cpu done");
}
/*
* move execution over to the specified cpu and bind it there.
*/
static void
pmc_select_cpu(int cpu)
{
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d] bad cpu number %d", __LINE__, cpu));
/* Never move to an inactive CPU. */
KASSERT(pmc_cpu_is_active(cpu), ("[pmc,%d] selecting inactive "
"CPU %d", __LINE__, cpu));
PMCDBG(CPU,SEL,2, "select-cpu cpu=%d", cpu);
thread_lock(curthread);
sched_bind(curthread, cpu);
thread_unlock(curthread);
KASSERT(curthread->td_oncpu == cpu,
("[pmc,%d] CPU not bound [cpu=%d, curr=%d]", __LINE__,
cpu, curthread->td_oncpu));
PMCDBG(CPU,SEL,2, "select-cpu cpu=%d ok", cpu);
}
/*
* Force a context switch.
*
* We do this by pause'ing for 1 tick -- invoking mi_switch() is not
* guaranteed to force a context switch.
*/
static void
pmc_force_context_switch(void)
{
pause("pmcctx", 1);
}
/*
* Get the file name for an executable. This is a simple wrapper
* around vn_fullpath(9).
*/
static void
pmc_getfilename(struct vnode *v, char **fullpath, char **freepath)
{
*fullpath = "unknown";
*freepath = NULL;
vn_fullpath(curthread, v, fullpath, freepath);
}
/*
* remove a process owning PMCs
*/
void
pmc_remove_owner(struct pmc_owner *po)
{
struct pmc *pm, *tmp;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(OWN,ORM,1, "remove-owner po=%p", po);
/* Remove descriptor from the owner hash table */
LIST_REMOVE(po, po_next);
/* release all owned PMC descriptors */
LIST_FOREACH_SAFE(pm, &po->po_pmcs, pm_next, tmp) {
PMCDBG(OWN,ORM,2, "pmc=%p", pm);
KASSERT(pm->pm_owner == po,
("[pmc,%d] owner %p != po %p", __LINE__, pm->pm_owner, po));
pmc_release_pmc_descriptor(pm); /* will unlink from the list */
}
KASSERT(po->po_sscount == 0,
("[pmc,%d] SS count not zero", __LINE__));
KASSERT(LIST_EMPTY(&po->po_pmcs),
("[pmc,%d] PMC list not empty", __LINE__));
/* de-configure the log file if present */
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_deconfigure_log(po);
}
/*
* remove an owner process record if all conditions are met.
*/
static void
pmc_maybe_remove_owner(struct pmc_owner *po)
{
PMCDBG(OWN,OMR,1, "maybe-remove-owner po=%p", po);
/*
* Remove owner record if
* - this process does not own any PMCs
* - this process does not own a log file (PMC_PO_OWNS_LOGFILE is clear)
*/
if (LIST_EMPTY(&po->po_pmcs) &&
((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)) {
pmc_remove_owner(po);
pmc_destroy_owner_descriptor(po);
}
}
/*
* Add an association between a target process and a PMC.
*/
static void
pmc_link_target_process(struct pmc *pm, struct pmc_process *pp)
{
int ri;
struct pmc_target *pt;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm != NULL && pp != NULL,
("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] Attaching a non-process-virtual pmc=%p to pid=%d",
__LINE__, pm, pp->pp_proc->p_pid));
KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= ((int) md->pmd_npmc - 1),
("[pmc,%d] Illegal reference count %d for process record %p",
__LINE__, pp->pp_refcnt, (void *) pp));
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(PRC,TLK,1, "link-target pmc=%p ri=%d pmc-process=%p",
pm, ri, pp);
#ifdef DEBUG
LIST_FOREACH(pt, &pm->pm_targets, pt_next)
if (pt->pt_process == pp)
KASSERT(0, ("[pmc,%d] pp %p already in pmc %p targets",
__LINE__, pp, pm));
#endif
pt = malloc(sizeof(struct pmc_target), M_PMC, M_WAITOK|M_ZERO);
pt->pt_process = pp;
LIST_INSERT_HEAD(&pm->pm_targets, pt, pt_next);
atomic_store_rel_ptr((uintptr_t *)&pp->pp_pmcs[ri].pp_pmc,
(uintptr_t)pm);
if (pm->pm_owner->po_owner == pp->pp_proc)
pm->pm_flags |= PMC_F_ATTACHED_TO_OWNER;
/*
* Initialize the per-process values at this row index.
*/
pp->pp_pmcs[ri].pp_pmcval = PMC_TO_MODE(pm) == PMC_MODE_TS ?
pm->pm_sc.pm_reloadcount : 0;
pp->pp_refcnt++;
}
/*
* Removes the association between a target process and a PMC.
*/
static void
pmc_unlink_target_process(struct pmc *pm, struct pmc_process *pp)
{
int ri;
struct proc *p;
struct pmc_target *ptgt;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm != NULL && pp != NULL,
("[pmc,%d] Null pm %p or pp %p", __LINE__, pm, pp));
KASSERT(pp->pp_refcnt >= 1 && pp->pp_refcnt <= (int) md->pmd_npmc,
("[pmc,%d] Illegal ref count %d on process record %p",
__LINE__, pp->pp_refcnt, (void *) pp));
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(PRC,TUL,1, "unlink-target pmc=%p ri=%d pmc-process=%p",
pm, ri, pp);
KASSERT(pp->pp_pmcs[ri].pp_pmc == pm,
("[pmc,%d] PMC ri %d mismatch pmc %p pp->[ri] %p", __LINE__,
ri, pm, pp->pp_pmcs[ri].pp_pmc));
pp->pp_pmcs[ri].pp_pmc = NULL;
pp->pp_pmcs[ri].pp_pmcval = (pmc_value_t) 0;
/* Remove owner-specific flags */
if (pm->pm_owner->po_owner == pp->pp_proc) {
pp->pp_flags &= ~PMC_PP_ENABLE_MSR_ACCESS;
pm->pm_flags &= ~PMC_F_ATTACHED_TO_OWNER;
}
pp->pp_refcnt--;
/* Remove the target process from the PMC structure */
LIST_FOREACH(ptgt, &pm->pm_targets, pt_next)
if (ptgt->pt_process == pp)
break;
KASSERT(ptgt != NULL, ("[pmc,%d] process %p (pp: %p) not found "
"in pmc %p", __LINE__, pp->pp_proc, pp, pm));
LIST_REMOVE(ptgt, pt_next);
free(ptgt, M_PMC);
/* if the PMC now lacks targets, send the owner a SIGIO */
if (LIST_EMPTY(&pm->pm_targets)) {
p = pm->pm_owner->po_owner;
PROC_LOCK(p);
- psignal(p, SIGIO);
+ kern_psignal(p, SIGIO);
PROC_UNLOCK(p);
PMCDBG(PRC,SIG,2, "signalling proc=%p signal=%d", p,
SIGIO);
}
}
/*
* Check if PMC 'pm' may be attached to target process 't'.
*/
static int
pmc_can_attach(struct pmc *pm, struct proc *t)
{
struct proc *o; /* pmc owner */
struct ucred *oc, *tc; /* owner, target credentials */
int decline_attach, i;
/*
* A PMC's owner can always attach that PMC to itself.
*/
if ((o = pm->pm_owner->po_owner) == t)
return 0;
PROC_LOCK(o);
oc = o->p_ucred;
crhold(oc);
PROC_UNLOCK(o);
PROC_LOCK(t);
tc = t->p_ucred;
crhold(tc);
PROC_UNLOCK(t);
/*
* The effective uid of the PMC owner should match at least one
* of the {effective,real,saved} uids of the target process.
*/
decline_attach = oc->cr_uid != tc->cr_uid &&
oc->cr_uid != tc->cr_svuid &&
oc->cr_uid != tc->cr_ruid;
/*
* Every one of the target's group ids must be in the owner's
* group list.
*/
for (i = 0; !decline_attach && i < tc->cr_ngroups; i++)
decline_attach = !groupmember(tc->cr_groups[i], oc);
/* check the real and saved gids too */
if (decline_attach == 0)
decline_attach = !groupmember(tc->cr_rgid, oc) ||
!groupmember(tc->cr_svgid, oc);
crfree(tc);
crfree(oc);
return !decline_attach;
}
/*
* Attach a process to a PMC.
*/
static int
pmc_attach_one_process(struct proc *p, struct pmc *pm)
{
int ri;
char *fullpath, *freepath;
struct pmc_process *pp;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PRC,ATT,2, "attach-one pm=%p ri=%d proc=%p (%d, %s)", pm,
PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
/*
* Locate the process descriptor corresponding to process 'p',
* allocating space as needed.
*
* Verify that rowindex 'pm_rowindex' is free in the process
* descriptor.
*
* If not, allocate space for a descriptor and link the
* process descriptor and PMC.
*/
ri = PMC_TO_ROWINDEX(pm);
if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_ALLOCATE)) == NULL)
return ENOMEM;
if (pp->pp_pmcs[ri].pp_pmc == pm) /* already present at slot [ri] */
return EEXIST;
if (pp->pp_pmcs[ri].pp_pmc != NULL)
return EBUSY;
pmc_link_target_process(pm, pp);
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) &&
(pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) == 0)
pm->pm_flags |= PMC_F_NEEDS_LOGFILE;
pm->pm_flags |= PMC_F_ATTACH_DONE; /* mark as attached */
/* issue an attach event to a configured log file */
if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE) {
pmc_getfilename(p->p_textvp, &fullpath, &freepath);
if (p->p_flag & P_KTHREAD) {
fullpath = kernelname;
freepath = NULL;
} else
pmclog_process_pmcattach(pm, p->p_pid, fullpath);
if (freepath)
free(freepath, M_TEMP);
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmc_log_process_mappings(pm->pm_owner, p);
}
/* mark process as using HWPMCs */
PROC_LOCK(p);
p->p_flag |= P_HWPMC;
PROC_UNLOCK(p);
return 0;
}
/*
* Attach a process and optionally its children
*/
static int
pmc_attach_process(struct proc *p, struct pmc *pm)
{
int error;
struct proc *top;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PRC,ATT,1, "attach pm=%p ri=%d proc=%p (%d, %s)", pm,
PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
/*
* If this PMC successfully allowed a GETMSR operation
* in the past, disallow further ATTACHes.
*/
if ((pm->pm_flags & PMC_PP_ENABLE_MSR_ACCESS) != 0)
return EPERM;
if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
return pmc_attach_one_process(p, pm);
/*
* Traverse all child processes, attaching them to
* this PMC.
*/
sx_slock(&proctree_lock);
top = p;
for (;;) {
if ((error = pmc_attach_one_process(p, pm)) != 0)
break;
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
goto done;
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
}
if (error)
(void) pmc_detach_process(top, pm);
done:
sx_sunlock(&proctree_lock);
return error;
}
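The loop above is an iterative pre-order walk of the process tree using the first-child (p_children), next-sibling (p_sibling) and parent (p_pptr) links, avoiding recursion in the kernel; the same idiom reappears in pmc_detach_process() and pmc_log_all_process_mappings() below. A self-contained illustration of the traversal over a small user-space tree follows; the node structure and names are hypothetical.
#include <stdio.h>

struct node {
	const char *name;
	struct node *child;	/* LIST_FIRST(&p->p_children) analogue */
	struct node *sibling;	/* LIST_NEXT(p, p_sibling) analogue */
	struct node *parent;	/* p->p_pptr analogue */
};

static void
visit_subtree(struct node *top)
{
	struct node *n = top;

	for (;;) {
		printf("visit %s\n", n->name);	/* attach/detach/log step */
		if (n->child != NULL)
			n = n->child;
		else for (;;) {
			if (n == top)
				return;
			if (n->sibling != NULL) {
				n = n->sibling;
				break;
			}
			n = n->parent;
		}
	}
}

int
main(void)
{
	struct node a = { "a" }, b = { "b" }, c = { "c" }, d = { "d" };

	a.child = &b; b.parent = &a; b.sibling = &c; c.parent = &a;
	b.child = &d; d.parent = &b;
	visit_subtree(&a);
	return (0);
}
The walk prints a, b, d, c: each node is visited before its children, and the inner loop climbs back toward 'top' until an unvisited sibling is found.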
/*
* Detach a process from a PMC. If there are no other PMCs tracking
* this process, remove the process structure from its hash table. If
* 'flags' contains PMC_FLAG_REMOVE, then free the process structure.
*/
static int
pmc_detach_one_process(struct proc *p, struct pmc *pm, int flags)
{
int ri;
struct pmc_process *pp;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm != NULL,
("[pmc,%d] null pm pointer", __LINE__));
ri = PMC_TO_ROWINDEX(pm);
PMCDBG(PRC,ATT,2, "detach-one pm=%p ri=%d proc=%p (%d, %s) flags=0x%x",
pm, ri, p, p->p_pid, p->p_comm, flags);
if ((pp = pmc_find_process_descriptor(p, 0)) == NULL)
return ESRCH;
if (pp->pp_pmcs[ri].pp_pmc != pm)
return EINVAL;
pmc_unlink_target_process(pm, pp);
/* Issue a detach entry if a log file is configured */
if (pm->pm_owner->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_pmcdetach(pm, p->p_pid);
/*
* If there are no PMCs targeting this process, we remove its
* descriptor from the target hash table and unset the P_HWPMC
* flag in the struct proc.
*/
KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc,
("[pmc,%d] Illegal refcnt %d for process struct %p",
__LINE__, pp->pp_refcnt, pp));
if (pp->pp_refcnt != 0) /* still a target of some PMC */
return 0;
pmc_remove_process_descriptor(pp);
if (flags & PMC_FLAG_REMOVE)
free(pp, M_PMC);
PROC_LOCK(p);
p->p_flag &= ~P_HWPMC;
PROC_UNLOCK(p);
return 0;
}
/*
* Detach a process and optionally its descendants from a PMC.
*/
static int
pmc_detach_process(struct proc *p, struct pmc *pm)
{
struct proc *top;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PRC,ATT,1, "detach pm=%p ri=%d proc=%p (%d, %s)", pm,
PMC_TO_ROWINDEX(pm), p, p->p_pid, p->p_comm);
if ((pm->pm_flags & PMC_F_DESCENDANTS) == 0)
return pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
/*
* Traverse all children, detaching them from this PMC. We
* ignore errors since we could be detaching a PMC from a
* partially attached proc tree.
*/
sx_slock(&proctree_lock);
top = p;
for (;;) {
(void) pmc_detach_one_process(p, pm, PMC_FLAG_REMOVE);
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
goto done;
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
}
done:
sx_sunlock(&proctree_lock);
if (LIST_EMPTY(&pm->pm_targets))
pm->pm_flags &= ~PMC_F_ATTACH_DONE;
return 0;
}
/*
* Thread context switch IN
*/
static void
pmc_process_csw_in(struct thread *td)
{
int cpu;
unsigned int adjri, ri;
struct pmc *pm;
struct proc *p;
struct pmc_cpu *pc;
struct pmc_hw *phw;
pmc_value_t newvalue;
struct pmc_process *pp;
struct pmc_classdep *pcd;
p = td->td_proc;
if ((pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE)) == NULL)
return;
KASSERT(pp->pp_proc == td->td_proc,
("[pmc,%d] not my thread state", __LINE__));
critical_enter(); /* no preemption from this point */
cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
PMCDBG(CSW,SWI,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
p->p_pid, p->p_comm, pp);
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d] wierd CPU id %d", __LINE__, cpu));
pc = pmc_pcpu[cpu];
for (ri = 0; ri < md->pmd_npmc; ri++) {
if ((pm = pp->pp_pmcs[ri].pp_pmc) == NULL)
continue;
KASSERT(PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] Target PMC in non-virtual mode (%d)",
__LINE__, PMC_TO_MODE(pm)));
KASSERT(PMC_TO_ROWINDEX(pm) == ri,
("[pmc,%d] Row index mismatch pmc %d != ri %d",
__LINE__, PMC_TO_ROWINDEX(pm), ri));
/*
* Only PMCs that are marked as 'RUNNING' need
* be placed on hardware.
*/
if (pm->pm_state != PMC_STATE_RUNNING)
continue;
/* increment PMC runcount */
atomic_add_rel_int(&pm->pm_runcount, 1);
/* configure the HWPMC we are going to use. */
pcd = pmc_ri_to_classdep(md, ri, &adjri);
pcd->pcd_config_pmc(cpu, adjri, pm);
phw = pc->pc_hwpmcs[ri];
KASSERT(phw != NULL,
("[pmc,%d] null hw pointer", __LINE__));
KASSERT(phw->phw_pmc == pm,
("[pmc,%d] hw->pmc %p != pmc %p", __LINE__,
phw->phw_pmc, pm));
/*
* Write out saved value and start the PMC.
*
* Sampling PMCs use a per-process value, while
* counting mode PMCs use a per-pmc value that is
* inherited across descendants.
*/
if (PMC_TO_MODE(pm) == PMC_MODE_TS) {
mtx_pool_lock_spin(pmc_mtxpool, pm);
newvalue = PMC_PCPU_SAVED(cpu,ri) =
pp->pp_pmcs[ri].pp_pmcval;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
} else {
KASSERT(PMC_TO_MODE(pm) == PMC_MODE_TC,
("[pmc,%d] illegal mode=%d", __LINE__,
PMC_TO_MODE(pm)));
mtx_pool_lock_spin(pmc_mtxpool, pm);
newvalue = PMC_PCPU_SAVED(cpu, ri) =
pm->pm_gv.pm_savedvalue;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
}
PMCDBG(CSW,SWI,1,"cpu=%d ri=%d new=%jd", cpu, ri, newvalue);
pcd->pcd_write_pmc(cpu, adjri, newvalue);
pcd->pcd_start_pmc(cpu, adjri);
}
/*
* perform any other architecture/cpu dependent thread
* switch-in actions.
*/
(void) (*md->pmd_switch_in)(pc, pp);
critical_exit();
}
/*
* Thread context switch OUT.
*/
static void
pmc_process_csw_out(struct thread *td)
{
int cpu;
int64_t tmp;
struct pmc *pm;
struct proc *p;
enum pmc_mode mode;
struct pmc_cpu *pc;
pmc_value_t newvalue;
unsigned int adjri, ri;
struct pmc_process *pp;
struct pmc_classdep *pcd;
/*
* Locate our process descriptor; this may be NULL if
* this process is exiting and we have already removed
* the process from the target process table.
*
* Note that due to kernel preemption, multiple
* context switches may happen while the process is
* exiting.
*
* Note also that if the target process cannot be
* found we still need to deconfigure any PMCs that
* are currently running on hardware.
*/
p = td->td_proc;
pp = pmc_find_process_descriptor(p, PMC_FLAG_NONE);
/*
* save PMCs
*/
critical_enter();
cpu = PCPU_GET(cpuid); /* td->td_oncpu is invalid */
PMCDBG(CSW,SWO,1, "cpu=%d proc=%p (%d, %s) pp=%p", cpu, p,
p->p_pid, p->p_comm, pp);
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d wierd CPU id %d", __LINE__, cpu));
pc = pmc_pcpu[cpu];
/*
* When a PMC gets unlinked from a target PMC, it will
* be removed from the target's pp_pmc[] array.
*
* However, on a MP system, the target could have been
* executing on another CPU at the time of the unlink.
* So, at context switch OUT time, we need to look at
* the hardware to determine if a PMC is scheduled on
* it.
*/
for (ri = 0; ri < md->pmd_npmc; ri++) {
pcd = pmc_ri_to_classdep(md, ri, &adjri);
pm = NULL;
(void) (*pcd->pcd_get_config)(cpu, adjri, &pm);
if (pm == NULL) /* nothing at this row index */
continue;
mode = PMC_TO_MODE(pm);
if (!PMC_IS_VIRTUAL_MODE(mode))
continue; /* not a process virtual PMC */
KASSERT(PMC_TO_ROWINDEX(pm) == ri,
("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
__LINE__, PMC_TO_ROWINDEX(pm), ri));
/* Stop hardware if not already stopped */
if (pm->pm_stalled == 0)
pcd->pcd_stop_pmc(cpu, adjri);
/* reduce this PMC's runcount */
atomic_subtract_rel_int(&pm->pm_runcount, 1);
/*
* If this PMC is associated with this process,
* save the reading.
*/
if (pp != NULL && pp->pp_pmcs[ri].pp_pmc != NULL) {
KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
("[pmc,%d] pm %p != pp_pmcs[%d] %p", __LINE__,
pm, ri, pp->pp_pmcs[ri].pp_pmc));
KASSERT(pp->pp_refcnt > 0,
("[pmc,%d] pp refcnt = %d", __LINE__,
pp->pp_refcnt));
pcd->pcd_read_pmc(cpu, adjri, &newvalue);
tmp = newvalue - PMC_PCPU_SAVED(cpu,ri);
PMCDBG(CSW,SWO,1,"cpu=%d ri=%d tmp=%jd", cpu, ri,
tmp);
if (mode == PMC_MODE_TS) {
/*
* For sampling process-virtual PMCs,
* we expect the count to be
* decreasing as the 'value'
* programmed into the PMC is the
* number of events to be seen till
* the next sampling interrupt.
*/
if (tmp < 0)
tmp += pm->pm_sc.pm_reloadcount;
mtx_pool_lock_spin(pmc_mtxpool, pm);
pp->pp_pmcs[ri].pp_pmcval -= tmp;
if ((int64_t) pp->pp_pmcs[ri].pp_pmcval < 0)
pp->pp_pmcs[ri].pp_pmcval +=
pm->pm_sc.pm_reloadcount;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
} else {
/*
* For counting process-virtual PMCs,
* we expect the count to be
* increasing monotonically, modulo a 64
* bit wraparound.
*/
KASSERT((int64_t) tmp >= 0,
("[pmc,%d] negative increment cpu=%d "
"ri=%d newvalue=%jx saved=%jx "
"incr=%jx", __LINE__, cpu, ri,
newvalue, PMC_PCPU_SAVED(cpu,ri), tmp));
mtx_pool_lock_spin(pmc_mtxpool, pm);
pm->pm_gv.pm_savedvalue += tmp;
pp->pp_pmcs[ri].pp_pmcval += tmp;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
if (pm->pm_flags & PMC_F_LOG_PROCCSW)
pmclog_process_proccsw(pm, pp, tmp);
}
}
/* mark hardware as free */
pcd->pcd_config_pmc(cpu, adjri, NULL);
}
/*
* perform any other architecture/cpu dependent thread
* switch out functions.
*/
(void) (*md->pmd_switch_out)(pc, pp);
critical_exit();
}
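A worked example may help (the numbers are illustrative only): for a counting-mode (TC) PMC whose saved per-CPU value was 5,000,000 at switch-in, a hardware reading of 5,042,000 at switch-out gives tmp = 42,000, which is added to both the PMC's global pm_savedvalue and the process's pp_pmcval (and, if PMC_F_LOG_PROCCSW is set, logged as a PROCCSW record). For a sampling-mode (TS) PMC the delta is first brought back into the range [0, pm_reloadcount) and then subtracted from pp_pmcval, which is likewise wrapped, so that pp_pmcval always records how many events remain before the next sampling interrupt when the process is switched back in.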
/*
* Log a KLD operation.
*/
static void
pmc_process_kld_load(struct pmckern_map_in *pkm)
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_LOCKED);
/*
* Notify owners of system sampling PMCs about KLD operations.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_in(po, (pid_t) -1, pkm->pm_address,
(char *) pkm->pm_file);
/*
* TODO: Notify owners of (all) process-sampling PMCs too.
*/
return;
}
static void
pmc_process_kld_unload(struct pmckern_map_out *pkm)
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_LOCKED);
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, (pid_t) -1,
pkm->pm_address, pkm->pm_address + pkm->pm_size);
/*
* TODO: Notify owners of process-sampling PMCs.
*/
}
/*
* A mapping change for a process.
*/
static void
pmc_process_mmap(struct thread *td, struct pmckern_map_in *pkm)
{
int ri;
pid_t pid;
char *fullpath, *freepath;
const struct pmc *pm;
struct pmc_owner *po;
const struct pmc_process *pp;
freepath = fullpath = NULL;
pmc_getfilename((struct vnode *) pkm->pm_file, &fullpath, &freepath);
pid = td->td_proc->p_pid;
/* Inform owners of all system-wide sampling PMCs. */
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_in(po, pid, pkm->pm_address, fullpath);
if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
goto done;
/*
* Inform sampling PMC owners tracking this process.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmclog_process_map_in(pm->pm_owner,
pid, pkm->pm_address, fullpath);
done:
if (freepath)
free(freepath, M_TEMP);
}
/*
* Log an munmap request.
*/
static void
pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm)
{
int ri;
pid_t pid;
struct pmc_owner *po;
const struct pmc *pm;
const struct pmc_process *pp;
pid = td->td_proc->p_pid;
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_map_out(po, pid, pkm->pm_address,
pkm->pm_address + pkm->pm_size);
if ((pp = pmc_find_process_descriptor(td->td_proc, 0)) == NULL)
return;
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL &&
PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmclog_process_map_out(pm->pm_owner, pid,
pkm->pm_address, pkm->pm_address + pkm->pm_size);
}
/*
* Log mapping information about the kernel.
*/
static void
pmc_log_kernel_mappings(struct pmc *pm)
{
struct pmc_owner *po;
struct pmckern_map_in *km, *kmbase;
sx_assert(&pmc_sx, SX_LOCKED);
KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] non-sampling PMC (%p) desires mapping information",
__LINE__, (void *) pm));
po = pm->pm_owner;
if (po->po_flags & PMC_PO_INITIAL_MAPPINGS_DONE)
return;
/*
* Log the current set of kernel modules.
*/
kmbase = linker_hwpmc_list_objects();
for (km = kmbase; km->pm_file != NULL; km++) {
PMCDBG(LOG,REG,1,"%s %p", (char *) km->pm_file,
(void *) km->pm_address);
pmclog_process_map_in(po, (pid_t) -1, km->pm_address,
km->pm_file);
}
free(kmbase, M_LINKER);
po->po_flags |= PMC_PO_INITIAL_MAPPINGS_DONE;
}
/*
* Log the mappings for a single process.
*/
static void
pmc_log_process_mappings(struct pmc_owner *po, struct proc *p)
{
int locked;
vm_map_t map;
struct vnode *vp;
struct vmspace *vm;
vm_map_entry_t entry;
vm_offset_t last_end;
u_int last_timestamp;
struct vnode *last_vp;
vm_offset_t start_addr;
vm_object_t obj, lobj, tobj;
char *fullpath, *freepath;
last_vp = NULL;
last_end = (vm_offset_t) 0;
fullpath = freepath = NULL;
if ((vm = vmspace_acquire_ref(p)) == NULL)
return;
map = &vm->vm_map;
vm_map_lock_read(map);
for (entry = map->header.next; entry != &map->header; entry = entry->next) {
if (entry == NULL) {
PMCDBG(LOG,OPS,2, "hwpmc: vm_map entry unexpectedly "
"NULL! pid=%d vm_map=%p\n", p->p_pid, map);
break;
}
/*
* We only care about executable map entries.
*/
if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) ||
!(entry->protection & VM_PROT_EXECUTE) ||
(entry->object.vm_object == NULL)) {
continue;
}
obj = entry->object.vm_object;
VM_OBJECT_LOCK(obj);
/*
* Walk the backing_object list to find the base
* (non-shadowed) vm_object.
*/
for (lobj = tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
}
/*
* At this point lobj is the base vm_object and it is locked.
*/
if (lobj == NULL) {
PMCDBG(LOG,OPS,2, "hwpmc: lobj unexpectedly NULL! pid=%d "
"vm_map=%p vm_obj=%p\n", p->p_pid, map, obj);
VM_OBJECT_UNLOCK(obj);
continue;
}
if (lobj->type != OBJT_VNODE || lobj->handle == NULL) {
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
continue;
}
/*
* Skip contiguous regions that point to the same
* vnode, so we don't emit redundant MAP-IN
* directives.
*/
if (entry->start == last_end && lobj->handle == last_vp) {
last_end = entry->end;
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
continue;
}
/*
* We don't want to keep the proc's vm_map or this
* vm_object locked while we walk the pathname, since
* vn_fullpath() can sleep. However, if we drop the
* lock, it's possible for concurrent activity to
* modify the vm_map list. To protect against this,
* we save the vm_map timestamp before we release the
* lock, and check it after we reacquire the lock
* below.
*/
start_addr = entry->start;
last_end = entry->end;
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
vp = lobj->handle;
vref(vp);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
freepath = NULL;
pmc_getfilename(vp, &fullpath, &freepath);
last_vp = vp;
locked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(locked);
vp = NULL;
pmclog_process_map_in(po, p->p_pid, start_addr, fullpath);
if (freepath)
free(freepath, M_TEMP);
vm_map_lock_read(map);
/*
* If our saved timestamp doesn't match, this means
* that the vm_map was modified out from under us and
* we can't trust our current "entry" pointer. Do a
* new lookup for this entry. If there is no entry
* for this address range, vm_map_lookup_entry() will
* return the previous one, so we always want to go to
* entry->next on the next loop iteration.
*
* There is an edge condition here that can occur if
* there is no entry at or before this address. In
* this situation, vm_map_lookup_entry returns
* &map->header, which would cause our loop to abort
* without processing the rest of the map. However,
* in practice this will never happen for process
* vm_map. This is because the executable's text
* segment is the first mapping in the proc's address
* space, and this mapping is never removed until the
* process exits, so there will always be a non-header
* entry at or before the requested address for
* vm_map_lookup_entry to return.
*/
if (map->timestamp != last_timestamp)
vm_map_lookup_entry(map, last_end - 1, &entry);
}
vm_map_unlock_read(map);
vmspace_free(vm);
return;
}
/*
* Log mappings for all processes in the system.
*/
static void
pmc_log_all_process_mappings(struct pmc_owner *po)
{
struct proc *p, *top;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((p = pfind(1)) == NULL)
panic("[pmc,%d] Cannot find init", __LINE__);
PROC_UNLOCK(p);
sx_slock(&proctree_lock);
top = p;
for (;;) {
pmc_log_process_mappings(po, p);
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
goto done;
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
}
done:
sx_sunlock(&proctree_lock);
}
/*
* The 'hook' invoked from the kernel proper
*/
#ifdef DEBUG
const char *pmc_hooknames[] = {
/* these strings correspond to PMC_FN_* in <sys/pmckern.h> */
"",
"EXEC",
"CSW-IN",
"CSW-OUT",
"SAMPLE",
"KLDLOAD",
"KLDUNLOAD",
"MMAP",
"MUNMAP",
"CALLCHAIN"
};
#endif
static int
pmc_hook_handler(struct thread *td, int function, void *arg)
{
PMCDBG(MOD,PMH,1, "hook td=%p func=%d \"%s\" arg=%p", td, function,
pmc_hooknames[function], arg);
switch (function)
{
/*
* Process exec()
*/
case PMC_FN_PROCESS_EXEC:
{
char *fullpath, *freepath;
unsigned int ri;
int is_using_hwpmcs;
struct pmc *pm;
struct proc *p;
struct pmc_owner *po;
struct pmc_process *pp;
struct pmckern_procexec *pk;
sx_assert(&pmc_sx, SX_XLOCKED);
p = td->td_proc;
pmc_getfilename(p->p_textvp, &fullpath, &freepath);
pk = (struct pmckern_procexec *) arg;
/* Inform owners of SS mode PMCs of the exec event. */
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procexec(po, PMC_ID_INVALID,
p->p_pid, pk->pm_entryaddr, fullpath);
PROC_LOCK(p);
is_using_hwpmcs = p->p_flag & P_HWPMC;
PROC_UNLOCK(p);
if (!is_using_hwpmcs) {
if (freepath)
free(freepath, M_TEMP);
break;
}
/*
* PMCs are not inherited across an exec(): remove any
* PMCs that this process is the owner of.
*/
if ((po = pmc_find_owner_descriptor(p)) != NULL) {
pmc_remove_owner(po);
pmc_destroy_owner_descriptor(po);
}
/*
* If the process being exec'ed is not the target of any
* PMC, we are done.
*/
if ((pp = pmc_find_process_descriptor(p, 0)) == NULL) {
if (freepath)
free(freepath, M_TEMP);
break;
}
/*
* Log the exec event to all monitoring owners. Skip
* owners who have already received the event because
* they had system sampling PMCs active.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
po = pm->pm_owner;
if (po->po_sscount == 0 &&
po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procexec(po, pm->pm_id,
p->p_pid, pk->pm_entryaddr,
fullpath);
}
if (freepath)
free(freepath, M_TEMP);
PMCDBG(PRC,EXC,1, "exec proc=%p (%d, %s) cred-changed=%d",
p, p->p_pid, p->p_comm, pk->pm_credentialschanged);
if (pk->pm_credentialschanged == 0) /* no change */
break;
/*
* If the newly exec()'ed process has a different credential
* than before, allow it to be the target of a PMC only if
* the PMC's owner has sufficient privilege.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL)
if (pmc_can_attach(pm, td->td_proc) != 0)
pmc_detach_one_process(td->td_proc,
pm, PMC_FLAG_NONE);
KASSERT(pp->pp_refcnt >= 0 && pp->pp_refcnt <= (int) md->pmd_npmc,
("[pmc,%d] Illegal ref count %d on pp %p", __LINE__,
pp->pp_refcnt, pp));
/*
* If this process is no longer the target of any
* PMCs, we can remove the process entry and free
* up space.
*/
if (pp->pp_refcnt == 0) {
pmc_remove_process_descriptor(pp);
free(pp, M_PMC);
break;
}
}
break;
case PMC_FN_CSW_IN:
pmc_process_csw_in(td);
break;
case PMC_FN_CSW_OUT:
pmc_process_csw_out(td);
break;
/*
* Process accumulated PC samples.
*
* This function is expected to be called by hardclock() for
* each CPU that has accumulated PC samples.
*
* This function is to be executed on the CPU whose samples
* are being processed.
*/
case PMC_FN_DO_SAMPLES:
/*
* Clear the cpu specific bit in the CPU mask before
* doing the rest of the processing. If the NMI handler
* gets invoked after the "atomic_clear_int()" call
* below but before "pmc_process_samples()" gets
* around to processing the interrupt, then we will
* come back here at the next hardclock() tick (and
* may find nothing to do if "pmc_process_samples()"
* had already processed the interrupt). We don't
* lose the interrupt sample.
*/
CPU_CLR_ATOMIC(PCPU_GET(cpuid), &pmc_cpumask);
pmc_process_samples(PCPU_GET(cpuid));
break;
case PMC_FN_KLD_LOAD:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_kld_load((struct pmckern_map_in *) arg);
break;
case PMC_FN_KLD_UNLOAD:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_kld_unload((struct pmckern_map_out *) arg);
break;
case PMC_FN_MMAP:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_mmap(td, (struct pmckern_map_in *) arg);
break;
case PMC_FN_MUNMAP:
sx_assert(&pmc_sx, SX_LOCKED);
pmc_process_munmap(td, (struct pmckern_map_out *) arg);
break;
case PMC_FN_USER_CALLCHAIN:
/*
* Record a call chain.
*/
KASSERT(td == curthread, ("[pmc,%d] td != curthread",
__LINE__));
pmc_capture_user_callchain(PCPU_GET(cpuid),
(struct trapframe *) arg);
td->td_pflags &= ~TDP_CALLCHAIN;
break;
default:
#ifdef DEBUG
KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function));
#endif
break;
}
return 0;
}
/*
* allocate a 'struct pmc_owner' descriptor in the owner hash table.
*/
static struct pmc_owner *
pmc_allocate_owner_descriptor(struct proc *p)
{
uint32_t hindex;
struct pmc_owner *po;
struct pmc_ownerhash *poh;
hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
poh = &pmc_ownerhash[hindex];
/* allocate space for the owner descriptor */
po = malloc(sizeof(struct pmc_owner), M_PMC, M_WAITOK|M_ZERO);
po->po_sscount = po->po_error = po->po_flags = po->po_logprocmaps = 0;
po->po_file = NULL;
po->po_owner = p;
po->po_kthread = NULL;
LIST_INIT(&po->po_pmcs);
LIST_INSERT_HEAD(poh, po, po_next); /* insert into hash table */
TAILQ_INIT(&po->po_logbuffers);
mtx_init(&po->po_mtx, "pmc-owner-mtx", "pmc-per-proc", MTX_SPIN);
PMCDBG(OWN,ALL,1, "allocate-owner proc=%p (%d, %s) pmc-owner=%p",
p, p->p_pid, p->p_comm, po);
return po;
}
static void
pmc_destroy_owner_descriptor(struct pmc_owner *po)
{
PMCDBG(OWN,REL,1, "destroy-owner po=%p proc=%p (%d, %s)",
po, po->po_owner, po->po_owner->p_pid, po->po_owner->p_comm);
mtx_destroy(&po->po_mtx);
free(po, M_PMC);
}
/*
* find the descriptor corresponding to process 'p', adding or removing it
* as specified by 'mode'.
*/
static struct pmc_process *
pmc_find_process_descriptor(struct proc *p, uint32_t mode)
{
uint32_t hindex;
struct pmc_process *pp, *ppnew;
struct pmc_processhash *pph;
hindex = PMC_HASH_PTR(p, pmc_processhashmask);
pph = &pmc_processhash[hindex];
ppnew = NULL;
/*
* Pre-allocate memory in the FIND_ALLOCATE case since we
* cannot call malloc(9) once we hold a spin lock.
*/
if (mode & PMC_FLAG_ALLOCATE)
ppnew = malloc(sizeof(struct pmc_process) + md->pmd_npmc *
sizeof(struct pmc_targetstate), M_PMC, M_WAITOK|M_ZERO);
mtx_lock_spin(&pmc_processhash_mtx);
LIST_FOREACH(pp, pph, pp_next)
if (pp->pp_proc == p)
break;
if ((mode & PMC_FLAG_REMOVE) && pp != NULL)
LIST_REMOVE(pp, pp_next);
if ((mode & PMC_FLAG_ALLOCATE) && pp == NULL &&
ppnew != NULL) {
ppnew->pp_proc = p;
LIST_INSERT_HEAD(pph, ppnew, pp_next);
pp = ppnew;
ppnew = NULL;
}
mtx_unlock_spin(&pmc_processhash_mtx);
if (pp != NULL && ppnew != NULL)
free(ppnew, M_PMC);
return pp;
}
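/*
* The lookup above illustrates the usual pattern for spin-lock
* protected hash tables: pre-allocate with M_WAITOK before taking
* the lock, insert only if no entry was found, and free the unused
* allocation after dropping the lock.  A minimal sketch of the
* pattern (hypothetical names, illustration only):
*
*	new = malloc(sizeof(*new), M_TEMP, M_WAITOK | M_ZERO);
*	mtx_lock_spin(&hash_mtx);
*	if ((found = lookup(key)) == NULL) {
*		insert_into_hash(new, key);
*		found = new;
*		new = NULL;
*	}
*	mtx_unlock_spin(&hash_mtx);
*	if (new != NULL)
*		free(new, M_TEMP);
*/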
/*
* remove a process descriptor from the process hash table.
*/
static void
pmc_remove_process_descriptor(struct pmc_process *pp)
{
KASSERT(pp->pp_refcnt == 0,
("[pmc,%d] Removing process descriptor %p with count %d",
__LINE__, pp, pp->pp_refcnt));
mtx_lock_spin(&pmc_processhash_mtx);
LIST_REMOVE(pp, pp_next);
mtx_unlock_spin(&pmc_processhash_mtx);
}
/*
* find an owner descriptor corresponding to proc 'p'
*/
static struct pmc_owner *
pmc_find_owner_descriptor(struct proc *p)
{
uint32_t hindex;
struct pmc_owner *po;
struct pmc_ownerhash *poh;
hindex = PMC_HASH_PTR(p, pmc_ownerhashmask);
poh = &pmc_ownerhash[hindex];
po = NULL;
LIST_FOREACH(po, poh, po_next)
if (po->po_owner == p)
break;
PMCDBG(OWN,FND,1, "find-owner proc=%p (%d, %s) hindex=0x%x -> "
"pmc-owner=%p", p, p->p_pid, p->p_comm, hindex, po);
return po;
}
/*
* pmc_allocate_pmc_descriptor
*
* Allocate a pmc descriptor and initialize its
* fields.
*/
static struct pmc *
pmc_allocate_pmc_descriptor(void)
{
struct pmc *pmc;
pmc = malloc(sizeof(struct pmc), M_PMC, M_WAITOK|M_ZERO);
if (pmc != NULL) {
pmc->pm_owner = NULL;
LIST_INIT(&pmc->pm_targets);
}
PMCDBG(PMC,ALL,1, "allocate-pmc -> pmc=%p", pmc);
return pmc;
}
/*
* Destroy a pmc descriptor.
*/
static void
pmc_destroy_pmc_descriptor(struct pmc *pm)
{
(void) pm;
#ifdef DEBUG
KASSERT(pm->pm_state == PMC_STATE_DELETED ||
pm->pm_state == PMC_STATE_FREE,
("[pmc,%d] destroying non-deleted PMC", __LINE__));
KASSERT(LIST_EMPTY(&pm->pm_targets),
("[pmc,%d] destroying pmc with targets", __LINE__));
KASSERT(pm->pm_owner == NULL,
("[pmc,%d] destroying pmc attached to an owner", __LINE__));
KASSERT(pm->pm_runcount == 0,
("[pmc,%d] pmc has non-zero run count %d", __LINE__,
pm->pm_runcount));
#endif
}
static void
pmc_wait_for_pmc_idle(struct pmc *pm)
{
#ifdef DEBUG
volatile int maxloop;
maxloop = 100 * pmc_cpu_max();
#endif
/*
* Loop (with a forced context switch) till the PMC's runcount
* comes down to zero.
*/
while (atomic_load_acq_32(&pm->pm_runcount) > 0) {
#ifdef DEBUG
maxloop--;
KASSERT(maxloop > 0,
("[pmc,%d] (ri%d, rc%d) waiting too long for "
"pmc to be free", __LINE__,
PMC_TO_ROWINDEX(pm), pm->pm_runcount));
#endif
pmc_force_context_switch();
}
}
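/*
* The acquire load of 'pm_runcount' above pairs with the
* release-ordered decrements (atomic_subtract_rel_int()) performed
* elsewhere in this file, for example when a sample is retired in
* pmc_process_samples() or when an exiting process is switched off
* the hardware, so that once the count is observed to be zero, all
* prior uses of the PMC by other CPUs are visible to this thread.
*/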
/*
* This function does the following things:
*
* - detaches the PMC from hardware
* - unlinks all target threads that were attached to it
* - removes the PMC from its owner's list
* - destroys the PMC's private mutex
*
* Once this function completes, the given pmc pointer can be safely
* FREE'd by the caller.
*/
static void
pmc_release_pmc_descriptor(struct pmc *pm)
{
enum pmc_mode mode;
struct pmc_hw *phw;
u_int adjri, ri, cpu;
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_process *pp;
struct pmc_classdep *pcd;
struct pmc_target *ptgt, *tmp;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(pm, ("[pmc,%d] null pmc", __LINE__));
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
mode = PMC_TO_MODE(pm);
PMCDBG(PMC,REL,1, "release-pmc pmc=%p ri=%d mode=%d", pm, ri,
mode);
/*
* First, we take the PMC off hardware.
*/
cpu = 0;
if (PMC_IS_SYSTEM_MODE(mode)) {
/*
* A system mode PMC runs on a specific CPU. Switch
* to this CPU and turn hardware off.
*/
pmc_save_cpu_binding(&pb);
cpu = PMC_TO_CPU(pm);
pmc_select_cpu(cpu);
/* switch off non-stalled CPUs */
if (pm->pm_state == PMC_STATE_RUNNING &&
pm->pm_stalled == 0) {
phw = pmc_pcpu[cpu]->pc_hwpmcs[ri];
KASSERT(phw->phw_pmc == pm,
("[pmc, %d] pmc ptr ri(%d) hw(%p) pm(%p)",
__LINE__, ri, phw->phw_pmc, pm));
PMCDBG(PMC,REL,2, "stopping cpu=%d ri=%d", cpu, ri);
critical_enter();
pcd->pcd_stop_pmc(cpu, adjri);
critical_exit();
}
PMCDBG(PMC,REL,2, "decfg cpu=%d ri=%d", cpu, ri);
critical_enter();
pcd->pcd_config_pmc(cpu, adjri, NULL);
critical_exit();
/* adjust the global and process count of SS mode PMCs */
if (mode == PMC_MODE_SS && pm->pm_state == PMC_STATE_RUNNING) {
po = pm->pm_owner;
po->po_sscount--;
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
LIST_REMOVE(po, po_ssnext);
}
}
pm->pm_state = PMC_STATE_DELETED;
pmc_restore_cpu_binding(&pb);
/*
* We could have references to this PMC structure in
* the per-cpu sample queues. Wait for the queue to
* drain.
*/
pmc_wait_for_pmc_idle(pm);
} else if (PMC_IS_VIRTUAL_MODE(mode)) {
/*
* A virtual PMC could be running on multiple CPUs at
* a given instant.
*
* By marking its state as DELETED, we ensure that
* this PMC is never further scheduled on hardware.
*
* Then we wait till all CPUs are done with this PMC.
*/
pm->pm_state = PMC_STATE_DELETED;
/* Wait for the PMC's runcount to come to zero. */
pmc_wait_for_pmc_idle(pm);
/*
* At this point the PMC is off all CPUs and cannot be
* freshly scheduled onto a CPU. It is now safe to
* unlink all targets from this PMC. If a
* process-record's refcount falls to zero, we remove
* it from the hash table. The module-wide SX lock
* protects us from races.
*/
LIST_FOREACH_SAFE(ptgt, &pm->pm_targets, pt_next, tmp) {
pp = ptgt->pt_process;
pmc_unlink_target_process(pm, pp); /* frees 'ptgt' */
PMCDBG(PMC,REL,3, "pp->refcnt=%d", pp->pp_refcnt);
/*
* If the target process record shows that no
* PMCs are attached to it, reclaim its space.
*/
if (pp->pp_refcnt == 0) {
pmc_remove_process_descriptor(pp);
free(pp, M_PMC);
}
}
cpu = curthread->td_oncpu; /* setup cpu for pmd_release() */
}
/*
* Release any MD resources
*/
(void) pcd->pcd_release_pmc(cpu, adjri, pm);
/*
* Update row disposition
*/
if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm)))
PMC_UNMARK_ROW_STANDALONE(ri);
else
PMC_UNMARK_ROW_THREAD(ri);
/* unlink from the owner's list */
if (pm->pm_owner) {
LIST_REMOVE(pm, pm_next);
pm->pm_owner = NULL;
}
pmc_destroy_pmc_descriptor(pm);
}
/*
* Register an owner and a pmc.
*/
static int
pmc_register_owner(struct proc *p, struct pmc *pmc)
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((po = pmc_find_owner_descriptor(p)) == NULL)
if ((po = pmc_allocate_owner_descriptor(p)) == NULL)
return ENOMEM;
KASSERT(pmc->pm_owner == NULL,
("[pmc,%d] attempting to own an initialized PMC", __LINE__));
pmc->pm_owner = po;
LIST_INSERT_HEAD(&po->po_pmcs, pmc, pm_next);
PROC_LOCK(p);
p->p_flag |= P_HWPMC;
PROC_UNLOCK(p);
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_pmcallocate(pmc);
PMCDBG(PMC,REG,1, "register-owner pmc-owner=%p pmc=%p",
po, pmc);
return 0;
}
/*
* Return the current row disposition:
* == 0 => FREE
* > 0 => PROCESS MODE
* < 0 => SYSTEM MODE
*/
int
pmc_getrowdisp(int ri)
{
return pmc_pmcdisp[ri];
}
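/*
* A sketch of how a caller might interpret the returned value; the
* sign encodes the row's current use (illustrative only):
*
*	int disp = pmc_getrowdisp(ri);
*	if (disp == 0)
*		printf("row %d is free\n", ri);
*	else if (disp > 0)
*		printf("row %d is in process (thread) mode\n", ri);
*	else
*		printf("row %d is in system (standalone) mode\n", ri);
*/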
/*
* Check if a PMC at row index 'ri' can be allocated to the current
* process.
*
* Allocation can fail if:
* - the current process is already being profiled by a PMC at index 'ri',
* attached to it via OP_PMCATTACH.
* - the current process has already allocated a PMC at index 'ri'
* via OP_ALLOCATE.
*/
static int
pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu)
{
enum pmc_mode mode;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_process *pp;
PMCDBG(PMC,ALR,1, "can-allocate-rowindex proc=%p (%d, %s) ri=%d "
"cpu=%d", p, p->p_pid, p->p_comm, ri, cpu);
/*
* We shouldn't have already allocated a process-mode PMC at
* row index 'ri'.
*
* We shouldn't have allocated a system-wide PMC on the same
* CPU and same RI.
*/
if ((po = pmc_find_owner_descriptor(p)) != NULL)
LIST_FOREACH(pm, &po->po_pmcs, pm_next) {
if (PMC_TO_ROWINDEX(pm) == ri) {
mode = PMC_TO_MODE(pm);
if (PMC_IS_VIRTUAL_MODE(mode))
return EEXIST;
if (PMC_IS_SYSTEM_MODE(mode) &&
(int) PMC_TO_CPU(pm) == cpu)
return EEXIST;
}
}
/*
* We also shouldn't be the target of any PMC at this index
* since otherwise a PMC_ATTACH to ourselves will fail.
*/
if ((pp = pmc_find_process_descriptor(p, 0)) != NULL)
if (pp->pp_pmcs[ri].pp_pmc)
return EEXIST;
PMCDBG(PMC,ALR,2, "can-allocate-rowindex proc=%p (%d, %s) ri=%d ok",
p, p->p_pid, p->p_comm, ri);
return 0;
}
/*
* Check if a given PMC at row index 'ri' can be currently used in
* mode 'mode'.
*/
static int
pmc_can_allocate_row(int ri, enum pmc_mode mode)
{
enum pmc_disp disp;
sx_assert(&pmc_sx, SX_XLOCKED);
PMCDBG(PMC,ALR,1, "can-allocate-row ri=%d mode=%d", ri, mode);
if (PMC_IS_SYSTEM_MODE(mode))
disp = PMC_DISP_STANDALONE;
else
disp = PMC_DISP_THREAD;
/*
* check disposition for PMC row 'ri':
*
* Expected disposition Row-disposition Result
*
* STANDALONE STANDALONE or FREE proceed
* STANDALONE THREAD fail
* THREAD THREAD or FREE proceed
* THREAD STANDALONE fail
*/
if (!PMC_ROW_DISP_IS_FREE(ri) &&
!(disp == PMC_DISP_THREAD && PMC_ROW_DISP_IS_THREAD(ri)) &&
!(disp == PMC_DISP_STANDALONE && PMC_ROW_DISP_IS_STANDALONE(ri)))
return EBUSY;
/*
* All OK
*/
PMCDBG(PMC,ALR,2, "can-allocate-row ri=%d mode=%d ok", ri, mode);
return 0;
}
/*
* Find a PMC descriptor with user handle 'pmcid' belonging to owner 'po'.
*/
static struct pmc *
pmc_find_pmc_descriptor_in_process(struct pmc_owner *po, pmc_id_t pmcid)
{
struct pmc *pm;
KASSERT(PMC_ID_TO_ROWINDEX(pmcid) < md->pmd_npmc,
("[pmc,%d] Illegal pmc index %d (max %d)", __LINE__,
PMC_ID_TO_ROWINDEX(pmcid), md->pmd_npmc));
LIST_FOREACH(pm, &po->po_pmcs, pm_next)
if (pm->pm_id == pmcid)
return pm;
return NULL;
}
static int
pmc_find_pmc(pmc_id_t pmcid, struct pmc **pmc)
{
struct pmc *pm;
struct pmc_owner *po;
PMCDBG(PMC,FND,1, "find-pmc id=%d", pmcid);
if ((po = pmc_find_owner_descriptor(curthread->td_proc)) == NULL)
return ESRCH;
if ((pm = pmc_find_pmc_descriptor_in_process(po, pmcid)) == NULL)
return EINVAL;
PMCDBG(PMC,FND,2, "find-pmc id=%d -> pmc=%p", pmcid, pm);
*pmc = pm;
return 0;
}
/*
* Start a PMC.
*/
static int
pmc_start(struct pmc *pm)
{
enum pmc_mode mode;
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_classdep *pcd;
int adjri, error, cpu, ri;
KASSERT(pm != NULL,
("[pmc,%d] null pm", __LINE__));
mode = PMC_TO_MODE(pm);
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
error = 0;
PMCDBG(PMC,OPS,1, "start pmc=%p mode=%d ri=%d", pm, mode, ri);
po = pm->pm_owner;
/*
* Disallow PMCSTART if a logfile is required but has not been
* configured yet.
*/
if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) &&
(po->po_flags & PMC_PO_OWNS_LOGFILE) == 0)
return (EDOOFUS); /* programming error */
/*
* If this is a sampling mode PMC, log mapping information for
* the kernel modules that are currently loaded.
*/
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pmc_log_kernel_mappings(pm);
if (PMC_IS_VIRTUAL_MODE(mode)) {
/*
* If a PMCATTACH has never been done on this PMC,
* attach it to its owner process.
*/
if (LIST_EMPTY(&pm->pm_targets))
error = (pm->pm_flags & PMC_F_ATTACH_DONE) ? ESRCH :
pmc_attach_process(po->po_owner, pm);
/*
* If the PMC is attached to its owner, then force a context
* switch to ensure that the MD state gets set correctly.
*/
if (error == 0) {
pm->pm_state = PMC_STATE_RUNNING;
if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER)
pmc_force_context_switch();
}
return (error);
}
/*
* A system-wide PMC.
*
* Add the owner to the global list if this is a system-wide
* sampling PMC.
*/
if (mode == PMC_MODE_SS) {
if (po->po_sscount == 0) {
LIST_INSERT_HEAD(&pmc_ss_owners, po, po_ssnext);
atomic_add_rel_int(&pmc_ss_count, 1);
PMCDBG(PMC,OPS,1, "po=%p in global list", po);
}
po->po_sscount++;
/*
* Log mapping information for all existing processes in the
* system. Subsequent mappings are logged as they happen;
* see pmc_process_mmap().
*/
if (po->po_logprocmaps == 0) {
pmc_log_all_process_mappings(po);
po->po_logprocmaps = 1;
}
}
/*
* Move to the CPU associated with this
* PMC, and start the hardware.
*/
pmc_save_cpu_binding(&pb);
cpu = PMC_TO_CPU(pm);
if (!pmc_cpu_is_active(cpu))
return (ENXIO);
pmc_select_cpu(cpu);
/*
* global PMCs are configured at allocation time
* so write out the initial value and start the PMC.
*/
pm->pm_state = PMC_STATE_RUNNING;
critical_enter();
if ((error = pcd->pcd_write_pmc(cpu, adjri,
PMC_IS_SAMPLING_MODE(mode) ?
pm->pm_sc.pm_reloadcount :
pm->pm_sc.pm_initial)) == 0)
error = pcd->pcd_start_pmc(cpu, adjri);
critical_exit();
pmc_restore_cpu_binding(&pb);
return (error);
}
/*
* Stop a PMC.
*/
static int
pmc_stop(struct pmc *pm)
{
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_classdep *pcd;
int adjri, cpu, error, ri;
KASSERT(pm != NULL, ("[pmc,%d] null pmc", __LINE__));
PMCDBG(PMC,OPS,1, "stop pmc=%p mode=%d ri=%d", pm,
PMC_TO_MODE(pm), PMC_TO_ROWINDEX(pm));
pm->pm_state = PMC_STATE_STOPPED;
/*
* If the PMC is a virtual mode one, changing the state to
* non-RUNNING is enough to ensure that the PMC never gets
* scheduled.
*
* If this PMC is currently running on a CPU, then it will be
* handled correctly at the time its target process is context
* switched out.
*/
if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
return 0;
/*
* A system-mode PMC. Move to the CPU associated with
* this PMC, and stop the hardware. We update the
* 'initial count' so that a subsequent PMCSTART will
* resume counting from the current hardware count.
*/
pmc_save_cpu_binding(&pb);
cpu = PMC_TO_CPU(pm);
KASSERT(cpu >= 0 && cpu < pmc_cpu_max(),
("[pmc,%d] illegal cpu=%d", __LINE__, cpu));
if (!pmc_cpu_is_active(cpu))
return ENXIO;
pmc_select_cpu(cpu);
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
critical_enter();
if ((error = pcd->pcd_stop_pmc(cpu, adjri)) == 0)
error = pcd->pcd_read_pmc(cpu, adjri, &pm->pm_sc.pm_initial);
critical_exit();
pmc_restore_cpu_binding(&pb);
po = pm->pm_owner;
/* remove this owner from the global list of SS PMC owners */
if (PMC_TO_MODE(pm) == PMC_MODE_SS) {
po->po_sscount--;
if (po->po_sscount == 0) {
atomic_subtract_rel_int(&pmc_ss_count, 1);
LIST_REMOVE(po, po_ssnext);
PMCDBG(PMC,OPS,2,"po=%p removed from global list", po);
}
}
return (error);
}
#ifdef DEBUG
static const char *pmc_op_to_name[] = {
#undef __PMC_OP
#define __PMC_OP(N, D) #N ,
__PMC_OPS()
NULL
};
#endif
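/*
* The table above uses the "X-macro" idiom: the pmc headers define
* __PMC_OPS() as a list of __PMC_OP(NAME, DESCRIPTION) entries, so
* redefining __PMC_OP() before expanding the list stamps out one
* string per operation.  A generic sketch of the idiom (hypothetical
* names, illustration only):
*
*	#define	COLORS()	COLOR(RED) COLOR(GREEN) COLOR(BLUE)
*	#define	COLOR(N)	#N ,
*	static const char *color_names[] = { COLORS() };
*
* which expands to { "RED" , "GREEN" , "BLUE" , }.
*/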
/*
* The syscall interface
*/
#define PMC_GET_SX_XLOCK(...) do { \
sx_xlock(&pmc_sx); \
if (pmc_hook == NULL) { \
sx_xunlock(&pmc_sx); \
return __VA_ARGS__; \
} \
} while (0)
#define PMC_DOWNGRADE_SX() do { \
sx_downgrade(&pmc_sx); \
is_sx_downgraded = 1; \
} while (0)
static int
pmc_syscall_handler(struct thread *td, void *syscall_args)
{
int error, is_sx_downgraded, is_sx_locked, op;
struct pmc_syscall_args *c;
void *arg;
PMC_GET_SX_XLOCK(ENOSYS);
DROP_GIANT();
is_sx_downgraded = 0;
is_sx_locked = 1;
c = (struct pmc_syscall_args *) syscall_args;
op = c->pmop_code;
arg = c->pmop_data;
PMCDBG(MOD,PMS,1, "syscall op=%d \"%s\" arg=%p", op,
pmc_op_to_name[op], arg);
error = 0;
atomic_add_int(&pmc_stats.pm_syscalls, 1);
switch(op)
{
/*
* Configure a log file.
*
* XXX This OP will be reworked.
*/
case PMC_OP_CONFIGURELOG:
{
struct proc *p;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_op_configurelog cl;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((error = copyin(arg, &cl, sizeof(cl))) != 0)
break;
/* mark this process as owning a log file */
p = td->td_proc;
if ((po = pmc_find_owner_descriptor(p)) == NULL)
if ((po = pmc_allocate_owner_descriptor(p)) == NULL) {
error = ENOMEM;
break;
}
/*
* If a valid fd was passed in, try to configure it.
* Otherwise, if 'fd' was less than zero and a log file
* was already configured, flush its buffers and
* de-configure it.
*/
if (cl.pm_logfd >= 0) {
sx_xunlock(&pmc_sx);
is_sx_locked = 0;
error = pmclog_configure_log(md, po, cl.pm_logfd);
} else if (po->po_flags & PMC_PO_OWNS_LOGFILE) {
pmclog_process_closelog(po);
error = pmclog_flush(po);
if (error == 0) {
LIST_FOREACH(pm, &po->po_pmcs, pm_next)
if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
pm->pm_state == PMC_STATE_RUNNING)
pmc_stop(pm);
error = pmclog_deconfigure_log(po);
}
} else
error = EINVAL;
if (error)
break;
}
break;
/*
* Flush a log file.
*/
case PMC_OP_FLUSHLOG:
{
struct pmc_owner *po;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
error = EINVAL;
break;
}
error = pmclog_flush(po);
}
break;
/*
* Retrieve hardware configuration.
*/
case PMC_OP_GETCPUINFO: /* CPU information */
{
struct pmc_op_getcpuinfo gci;
struct pmc_classinfo *pci;
struct pmc_classdep *pcd;
int cl;
gci.pm_cputype = md->pmd_cputype;
gci.pm_ncpu = pmc_cpu_max();
gci.pm_npmc = md->pmd_npmc;
gci.pm_nclass = md->pmd_nclass;
pci = gci.pm_classes;
pcd = md->pmd_classdep;
for (cl = 0; cl < md->pmd_nclass; cl++, pci++, pcd++) {
pci->pm_caps = pcd->pcd_caps;
pci->pm_class = pcd->pcd_class;
pci->pm_width = pcd->pcd_width;
pci->pm_num = pcd->pcd_num;
}
error = copyout(&gci, arg, sizeof(gci));
}
break;
/*
* Get module statistics
*/
case PMC_OP_GETDRIVERSTATS:
{
struct pmc_op_getdriverstats gms;
bcopy(&pmc_stats, &gms, sizeof(gms));
error = copyout(&gms, arg, sizeof(gms));
}
break;
/*
* Retrieve module version number
*/
case PMC_OP_GETMODULEVERSION:
{
uint32_t cv, modv;
/* retrieve the client's idea of the ABI version */
if ((error = copyin(arg, &cv, sizeof(uint32_t))) != 0)
break;
/* don't service clients newer than our driver */
modv = PMC_VERSION;
if ((cv & 0xFFFF0000) > (modv & 0xFFFF0000)) {
error = EPROGMISMATCH;
break;
}
error = copyout(&modv, arg, sizeof(int));
}
break;
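/*
* Only the major number (the upper 16 bits) is compared above: a
* client whose compiled-in major version is newer than the module's
* gets EPROGMISMATCH, while an older or equal major version is
* serviced and handed back the module's full version word.  For
* example (values illustrative only), cv = 0x04000000 against
* modv = 0x03000000 fails, whereas cv = 0x03000005 succeeds.
*/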
/*
* Retrieve the state of all the PMCs on a given
* CPU.
*/
case PMC_OP_GETPMCINFO:
{
int ari;
struct pmc *pm;
size_t pmcinfo_size;
uint32_t cpu, n, npmc;
struct pmc_owner *po;
struct pmc_binding pb;
struct pmc_classdep *pcd;
struct pmc_info *p, *pmcinfo;
struct pmc_op_getpmcinfo *gpi;
PMC_DOWNGRADE_SX();
gpi = (struct pmc_op_getpmcinfo *) arg;
if ((error = copyin(&gpi->pm_cpu, &cpu, sizeof(cpu))) != 0)
break;
if (cpu >= pmc_cpu_max()) {
error = EINVAL;
break;
}
if (!pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
/* switch to CPU 'cpu' */
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
npmc = md->pmd_npmc;
pmcinfo_size = npmc * sizeof(struct pmc_info);
pmcinfo = malloc(pmcinfo_size, M_PMC, M_WAITOK);
p = pmcinfo;
for (n = 0; n < md->pmd_npmc; n++, p++) {
pcd = pmc_ri_to_classdep(md, n, &ari);
KASSERT(pcd != NULL,
("[pmc,%d] null pcd ri=%d", __LINE__, n));
if ((error = pcd->pcd_describe(cpu, ari, p, &pm)) != 0)
break;
if (PMC_ROW_DISP_IS_STANDALONE(n))
p->pm_rowdisp = PMC_DISP_STANDALONE;
else if (PMC_ROW_DISP_IS_THREAD(n))
p->pm_rowdisp = PMC_DISP_THREAD;
else
p->pm_rowdisp = PMC_DISP_FREE;
p->pm_ownerpid = -1;
if (pm == NULL) /* no PMC associated */
continue;
po = pm->pm_owner;
KASSERT(po->po_owner != NULL,
("[pmc,%d] pmc_owner had a null proc pointer",
__LINE__));
p->pm_ownerpid = po->po_owner->p_pid;
p->pm_mode = PMC_TO_MODE(pm);
p->pm_event = pm->pm_event;
p->pm_flags = pm->pm_flags;
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
p->pm_reloadcount =
pm->pm_sc.pm_reloadcount;
}
pmc_restore_cpu_binding(&pb);
/* now copy out the PMC info collected */
if (error == 0)
error = copyout(pmcinfo, &gpi->pm_pmcs, pmcinfo_size);
free(pmcinfo, M_PMC);
}
break;
/*
* Set the administrative state of a PMC, i.e., whether
* the PMC is to be used or not.
*/
case PMC_OP_PMCADMIN:
{
int cpu, ri;
enum pmc_state request;
struct pmc_cpu *pc;
struct pmc_hw *phw;
struct pmc_op_pmcadmin pma;
struct pmc_binding pb;
sx_assert(&pmc_sx, SX_XLOCKED);
KASSERT(td == curthread,
("[pmc,%d] td != curthread", __LINE__));
error = priv_check(td, PRIV_PMC_MANAGE);
if (error)
break;
if ((error = copyin(arg, &pma, sizeof(pma))) != 0)
break;
cpu = pma.pm_cpu;
if (cpu < 0 || cpu >= (int) pmc_cpu_max()) {
error = EINVAL;
break;
}
if (!pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
request = pma.pm_state;
if (request != PMC_STATE_DISABLED &&
request != PMC_STATE_FREE) {
error = EINVAL;
break;
}
ri = pma.pm_pmc; /* pmc id == row index */
if (ri < 0 || ri >= (int) md->pmd_npmc) {
error = EINVAL;
break;
}
/*
* We can't disable a PMC with a row-index allocated
* for process virtual PMCs.
*/
if (PMC_ROW_DISP_IS_THREAD(ri) &&
request == PMC_STATE_DISABLED) {
error = EBUSY;
break;
}
/*
* otherwise, this PMC on this CPU is either free or
* in system-wide mode.
*/
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
pc = pmc_pcpu[cpu];
phw = pc->pc_hwpmcs[ri];
/*
* XXX do we need some kind of 'forced' disable?
*/
if (phw->phw_pmc == NULL) {
if (request == PMC_STATE_DISABLED &&
(phw->phw_state & PMC_PHW_FLAG_IS_ENABLED)) {
phw->phw_state &= ~PMC_PHW_FLAG_IS_ENABLED;
PMC_MARK_ROW_STANDALONE(ri);
} else if (request == PMC_STATE_FREE &&
(phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0) {
phw->phw_state |= PMC_PHW_FLAG_IS_ENABLED;
PMC_UNMARK_ROW_STANDALONE(ri);
}
/* other cases are a no-op */
} else
error = EBUSY;
pmc_restore_cpu_binding(&pb);
}
break;
/*
* Allocate a PMC.
*/
case PMC_OP_PMCALLOCATE:
{
int adjri, n;
u_int cpu;
uint32_t caps;
struct pmc *pmc;
enum pmc_mode mode;
struct pmc_hw *phw;
struct pmc_binding pb;
struct pmc_classdep *pcd;
struct pmc_op_pmcallocate pa;
if ((error = copyin(arg, &pa, sizeof(pa))) != 0)
break;
caps = pa.pm_caps;
mode = pa.pm_mode;
cpu = pa.pm_cpu;
if ((mode != PMC_MODE_SS && mode != PMC_MODE_SC &&
mode != PMC_MODE_TS && mode != PMC_MODE_TC) ||
(cpu != (u_int) PMC_CPU_ANY && cpu >= pmc_cpu_max())) {
error = EINVAL;
break;
}
/*
* Virtual PMCs should only ask for a default CPU.
* System mode PMCs need to specify a non-default CPU.
*/
if ((PMC_IS_VIRTUAL_MODE(mode) && cpu != (u_int) PMC_CPU_ANY) ||
(PMC_IS_SYSTEM_MODE(mode) && cpu == (u_int) PMC_CPU_ANY)) {
error = EINVAL;
break;
}
/*
* Check that an inactive CPU is not being asked for.
*/
if (PMC_IS_SYSTEM_MODE(mode) && !pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
/*
* Refuse an allocation for a system-wide PMC if this
* process has been jailed, or if this process lacks
* super-user credentials and the sysctl tunable
* 'security.bsd.unprivileged_syspmcs' is zero.
*/
if (PMC_IS_SYSTEM_MODE(mode)) {
if (jailed(curthread->td_ucred)) {
error = EPERM;
break;
}
if (!pmc_unprivileged_syspmcs) {
error = priv_check(curthread,
PRIV_PMC_SYSTEM);
if (error)
break;
}
}
/*
* Look for valid values for 'pm_flags'
*/
if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW |
PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN)) != 0) {
error = EINVAL;
break;
}
/* process logging options are not allowed for system PMCs */
if (PMC_IS_SYSTEM_MODE(mode) && (pa.pm_flags &
(PMC_F_LOG_PROCCSW | PMC_F_LOG_PROCEXIT))) {
error = EINVAL;
break;
}
/*
* All sampling mode PMCs need to be able to interrupt the
* CPU.
*/
if (PMC_IS_SAMPLING_MODE(mode))
caps |= PMC_CAP_INTERRUPT;
/* A valid class specifier should have been passed in. */
for (n = 0; n < md->pmd_nclass; n++)
if (md->pmd_classdep[n].pcd_class == pa.pm_class)
break;
if (n == md->pmd_nclass) {
error = EINVAL;
break;
}
/* The requested PMC capabilities should be feasible. */
if ((md->pmd_classdep[n].pcd_caps & caps) != caps) {
error = EOPNOTSUPP;
break;
}
PMCDBG(PMC,ALL,2, "event=%d caps=0x%x mode=%d cpu=%d",
pa.pm_ev, caps, mode, cpu);
pmc = pmc_allocate_pmc_descriptor();
pmc->pm_id = PMC_ID_MAKE_ID(cpu,pa.pm_mode,pa.pm_class,
PMC_ID_INVALID);
pmc->pm_event = pa.pm_ev;
pmc->pm_state = PMC_STATE_FREE;
pmc->pm_caps = caps;
pmc->pm_flags = pa.pm_flags;
/* switch thread to CPU 'cpu' */
pmc_save_cpu_binding(&pb);
#define PMC_IS_SHAREABLE_PMC(cpu, n) \
(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_state & \
PMC_PHW_FLAG_IS_SHAREABLE)
#define PMC_IS_UNALLOCATED(cpu, n) \
(pmc_pcpu[(cpu)]->pc_hwpmcs[(n)]->phw_pmc == NULL)
if (PMC_IS_SYSTEM_MODE(mode)) {
pmc_select_cpu(cpu);
for (n = 0; n < (int) md->pmd_npmc; n++) {
pcd = pmc_ri_to_classdep(md, n, &adjri);
if (pmc_can_allocate_row(n, mode) == 0 &&
pmc_can_allocate_rowindex(
curthread->td_proc, n, cpu) == 0 &&
(PMC_IS_UNALLOCATED(cpu, n) ||
PMC_IS_SHAREABLE_PMC(cpu, n)) &&
pcd->pcd_allocate_pmc(cpu, adjri, pmc,
&pa) == 0)
break;
}
} else {
/* Process virtual mode */
for (n = 0; n < (int) md->pmd_npmc; n++) {
pcd = pmc_ri_to_classdep(md, n, &adjri);
if (pmc_can_allocate_row(n, mode) == 0 &&
pmc_can_allocate_rowindex(
curthread->td_proc, n,
PMC_CPU_ANY) == 0 &&
pcd->pcd_allocate_pmc(curthread->td_oncpu,
adjri, pmc, &pa) == 0)
break;
}
}
#undef PMC_IS_UNALLOCATED
#undef PMC_IS_SHAREABLE_PMC
pmc_restore_cpu_binding(&pb);
if (n == (int) md->pmd_npmc) {
pmc_destroy_pmc_descriptor(pmc);
free(pmc, M_PMC);
pmc = NULL;
error = EINVAL;
break;
}
/* Fill in the correct value in the ID field */
pmc->pm_id = PMC_ID_MAKE_ID(cpu,mode,pa.pm_class,n);
PMCDBG(PMC,ALL,2, "ev=%d class=%d mode=%d n=%d -> pmcid=%x",
pmc->pm_event, pa.pm_class, mode, n, pmc->pm_id);
/* Process mode PMCs with logging enabled need log files */
if (pmc->pm_flags & (PMC_F_LOG_PROCEXIT | PMC_F_LOG_PROCCSW))
pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
/* All system mode sampling PMCs require a log file */
if (PMC_IS_SAMPLING_MODE(mode) && PMC_IS_SYSTEM_MODE(mode))
pmc->pm_flags |= PMC_F_NEEDS_LOGFILE;
/*
* Configure global PMCs immediately
*/
if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pmc))) {
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
phw = pmc_pcpu[cpu]->pc_hwpmcs[n];
pcd = pmc_ri_to_classdep(md, n, &adjri);
if ((phw->phw_state & PMC_PHW_FLAG_IS_ENABLED) == 0 ||
(error = pcd->pcd_config_pmc(cpu, adjri, pmc)) != 0) {
(void) pcd->pcd_release_pmc(cpu, adjri, pmc);
pmc_destroy_pmc_descriptor(pmc);
free(pmc, M_PMC);
pmc = NULL;
pmc_restore_cpu_binding(&pb);
error = EPERM;
break;
}
pmc_restore_cpu_binding(&pb);
}
pmc->pm_state = PMC_STATE_ALLOCATED;
/*
* mark row disposition
*/
if (PMC_IS_SYSTEM_MODE(mode))
PMC_MARK_ROW_STANDALONE(n);
else
PMC_MARK_ROW_THREAD(n);
/*
* Register this PMC with the current thread as its owner.
*/
if ((error =
pmc_register_owner(curthread->td_proc, pmc)) != 0) {
pmc_release_pmc_descriptor(pmc);
free(pmc, M_PMC);
pmc = NULL;
break;
}
/*
* Return the allocated index.
*/
pa.pm_pmcid = pmc->pm_id;
error = copyout(&pa, arg, sizeof(pa));
}
break;
/*
* Attach a PMC to a process.
*/
case PMC_OP_PMCATTACH:
{
struct pmc *pm;
struct proc *p;
struct pmc_op_pmcattach a;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((error = copyin(arg, &a, sizeof(a))) != 0)
break;
if (a.pm_pid < 0) {
error = EINVAL;
break;
} else if (a.pm_pid == 0)
a.pm_pid = td->td_proc->p_pid;
if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
break;
if (PMC_IS_SYSTEM_MODE(PMC_TO_MODE(pm))) {
error = EINVAL;
break;
}
/* PMCs may be (re)attached only when allocated or stopped */
if (pm->pm_state == PMC_STATE_RUNNING) {
error = EBUSY;
break;
} else if (pm->pm_state != PMC_STATE_ALLOCATED &&
pm->pm_state != PMC_STATE_STOPPED) {
error = EINVAL;
break;
}
/* lookup pid */
if ((p = pfind(a.pm_pid)) == NULL) {
error = ESRCH;
break;
}
/*
* Ignore processes that are in the process of exiting.
*/
if (p->p_flag & P_WEXIT) {
error = ESRCH;
PROC_UNLOCK(p); /* pfind() returns a locked process */
break;
}
/*
* we are allowed to attach a PMC to a process if
* we can debug it.
*/
error = p_candebug(curthread, p);
PROC_UNLOCK(p);
if (error == 0)
error = pmc_attach_process(p, pm);
}
break;
/*
* Detach an attached PMC from a process.
*/
case PMC_OP_PMCDETACH:
{
struct pmc *pm;
struct proc *p;
struct pmc_op_pmcattach a;
if ((error = copyin(arg, &a, sizeof(a))) != 0)
break;
if (a.pm_pid < 0) {
error = EINVAL;
break;
} else if (a.pm_pid == 0)
a.pm_pid = td->td_proc->p_pid;
if ((error = pmc_find_pmc(a.pm_pmc, &pm)) != 0)
break;
if ((p = pfind(a.pm_pid)) == NULL) {
error = ESRCH;
break;
}
/*
* Treat processes that are in the process of exiting
* as if they were not present.
*/
if (p->p_flag & P_WEXIT)
error = ESRCH;
PROC_UNLOCK(p); /* pfind() returns a locked process */
if (error == 0)
error = pmc_detach_process(p, pm);
}
break;
/*
* Retrieve the MSR number associated with the counter
* 'pmc_id'. This allows processes to directly use RDPMC
* instructions to read their PMCs, without the overhead of a
* system call.
*/
case PMC_OP_PMCGETMSR:
{
int adjri, ri;
struct pmc *pm;
struct pmc_target *pt;
struct pmc_op_getmsr gm;
struct pmc_classdep *pcd;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &gm, sizeof(gm))) != 0)
break;
if ((error = pmc_find_pmc(gm.pm_pmcid, &pm)) != 0)
break;
/*
* The allocated PMC has to be a process virtual PMC,
* i.e., of type MODE_T[CS]. Global PMCs can only be
* read using the PMCREAD operation since they may be
* allocated on a different CPU than the one we could
* be running on at the time of the RDPMC instruction.
*
* The GETMSR operation is not allowed for PMCs that
* are inherited across processes.
*/
if (!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)) ||
(pm->pm_flags & PMC_F_DESCENDANTS)) {
error = EINVAL;
break;
}
/*
* It only makes sense to use a RDPMC (or its
* equivalent instruction on non-x86 architectures) on
* a process that has allocated and attached a PMC to
* itself. Conversely the PMC is only allowed to have
* one process attached to it -- its owner.
*/
if ((pt = LIST_FIRST(&pm->pm_targets)) == NULL ||
LIST_NEXT(pt, pt_next) != NULL ||
pt->pt_process->pp_proc != pm->pm_owner->po_owner) {
error = EINVAL;
break;
}
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
/* PMC class has no 'GETMSR' support */
if (pcd->pcd_get_msr == NULL) {
error = ENOSYS;
break;
}
if ((error = (*pcd->pcd_get_msr)(adjri, &gm.pm_msr)) < 0)
break;
if ((error = copyout(&gm, arg, sizeof(gm))) < 0)
break;
/*
* Mark our process as using MSRs. Update machine
* state using a forced context switch.
*/
pt->pt_process->pp_flags |= PMC_PP_ENABLE_MSR_ACCESS;
pmc_force_context_switch();
}
break;
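/*
* A hypothetical userland sketch of what GETMSR enables on x86
* (illustration only; the names below are not part of this module
* and applications would normally use libpmc(3) rather than issuing
* the instruction themselves):
*
*	uint32_t lo, hi;
*	uint64_t count;
*
*	(issue PMC_OP_PMCGETMSR to obtain gm.pm_msr, then)
*	__asm __volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (gm.pm_msr));
*	count = ((uint64_t)hi << 32) | lo;
*/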
/*
* Release an allocated PMC
*/
case PMC_OP_PMCRELEASE:
{
pmc_id_t pmcid;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_op_simple sp;
/*
* Find PMC pointer for the named PMC.
*
* Use pmc_release_pmc_descriptor() to switch off the
* PMC, remove all its target threads, and remove the
* PMC from its owner's list.
*
* Remove the owner record if this is the last PMC
* owned.
*
* Free up space.
*/
if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
break;
pmcid = sp.pm_pmcid;
if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
break;
po = pm->pm_owner;
pmc_release_pmc_descriptor(pm);
pmc_maybe_remove_owner(po);
free(pm, M_PMC);
}
break;
/*
* Read and/or write a PMC.
*/
case PMC_OP_PMCRW:
{
int adjri;
struct pmc *pm;
uint32_t cpu, ri;
pmc_value_t oldvalue;
struct pmc_binding pb;
struct pmc_op_pmcrw prw;
struct pmc_classdep *pcd;
struct pmc_op_pmcrw *pprw;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &prw, sizeof(prw))) != 0)
break;
ri = 0;
PMCDBG(PMC,OPS,1, "rw id=%d flags=0x%x", prw.pm_pmcid,
prw.pm_flags);
/* must have at least one flag set */
if ((prw.pm_flags & (PMC_F_OLDVALUE|PMC_F_NEWVALUE)) == 0) {
error = EINVAL;
break;
}
/* locate pmc descriptor */
if ((error = pmc_find_pmc(prw.pm_pmcid, &pm)) != 0)
break;
/* Can't read a PMC that hasn't been started. */
if (pm->pm_state != PMC_STATE_ALLOCATED &&
pm->pm_state != PMC_STATE_STOPPED &&
pm->pm_state != PMC_STATE_RUNNING) {
error = EINVAL;
break;
}
/* writing a new value is allowed only for 'STOPPED' pmcs */
if (pm->pm_state == PMC_STATE_RUNNING &&
(prw.pm_flags & PMC_F_NEWVALUE)) {
error = EBUSY;
break;
}
if (PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm))) {
/*
* If this PMC is attached to its owner (i.e.,
* the process requesting this operation) and
* is running, then attempt to get an
* up-to-date reading from hardware for a READ.
* Writes are only allowed when the PMC is
* stopped, so only update the saved value
* field.
*
* If the PMC is not running, or is not
* attached to its owner, read/write to the
* savedvalue field.
*/
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
mtx_pool_lock_spin(pmc_mtxpool, pm);
cpu = curthread->td_oncpu;
if (prw.pm_flags & PMC_F_OLDVALUE) {
if ((pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) &&
(pm->pm_state == PMC_STATE_RUNNING))
error = (*pcd->pcd_read_pmc)(cpu, adjri,
&oldvalue);
else
oldvalue = pm->pm_gv.pm_savedvalue;
}
if (prw.pm_flags & PMC_F_NEWVALUE)
pm->pm_gv.pm_savedvalue = prw.pm_value;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
} else { /* System mode PMCs */
cpu = PMC_TO_CPU(pm);
ri = PMC_TO_ROWINDEX(pm);
pcd = pmc_ri_to_classdep(md, ri, &adjri);
if (!pmc_cpu_is_active(cpu)) {
error = ENXIO;
break;
}
/* move this thread to CPU 'cpu' */
pmc_save_cpu_binding(&pb);
pmc_select_cpu(cpu);
critical_enter();
/* save old value */
if (prw.pm_flags & PMC_F_OLDVALUE)
if ((error = (*pcd->pcd_read_pmc)(cpu, adjri,
&oldvalue)))
goto error;
/* write out new value */
if (prw.pm_flags & PMC_F_NEWVALUE)
error = (*pcd->pcd_write_pmc)(cpu, adjri,
prw.pm_value);
error:
critical_exit();
pmc_restore_cpu_binding(&pb);
if (error)
break;
}
pprw = (struct pmc_op_pmcrw *) arg;
#ifdef DEBUG
if (prw.pm_flags & PMC_F_NEWVALUE)
PMCDBG(PMC,OPS,2, "rw id=%d new %jx -> old %jx",
ri, prw.pm_value, oldvalue);
else if (prw.pm_flags & PMC_F_OLDVALUE)
PMCDBG(PMC,OPS,2, "rw id=%d -> old %jx", ri, oldvalue);
#endif
/* return old value if requested */
if (prw.pm_flags & PMC_F_OLDVALUE)
if ((error = copyout(&oldvalue, &pprw->pm_value,
sizeof(prw.pm_value))))
break;
}
break;
/*
* Set the sampling rate for a sampling mode PMC and the
* initial count for a counting mode PMC.
*/
case PMC_OP_PMCSETCOUNT:
{
struct pmc *pm;
struct pmc_op_pmcsetcount sc;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &sc, sizeof(sc))) != 0)
break;
if ((error = pmc_find_pmc(sc.pm_pmcid, &pm)) != 0)
break;
if (pm->pm_state == PMC_STATE_RUNNING) {
error = EBUSY;
break;
}
if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)))
pm->pm_sc.pm_reloadcount = sc.pm_count;
else
pm->pm_sc.pm_initial = sc.pm_count;
}
break;
/*
* Start a PMC.
*/
case PMC_OP_PMCSTART:
{
pmc_id_t pmcid;
struct pmc *pm;
struct pmc_op_simple sp;
sx_assert(&pmc_sx, SX_XLOCKED);
if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
break;
pmcid = sp.pm_pmcid;
if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
break;
KASSERT(pmcid == pm->pm_id,
("[pmc,%d] pmcid %x != id %x", __LINE__,
pm->pm_id, pmcid));
if (pm->pm_state == PMC_STATE_RUNNING) /* already running */
break;
else if (pm->pm_state != PMC_STATE_STOPPED &&
pm->pm_state != PMC_STATE_ALLOCATED) {
error = EINVAL;
break;
}
error = pmc_start(pm);
}
break;
/*
* Stop a PMC.
*/
case PMC_OP_PMCSTOP:
{
pmc_id_t pmcid;
struct pmc *pm;
struct pmc_op_simple sp;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &sp, sizeof(sp))) != 0)
break;
pmcid = sp.pm_pmcid;
/*
* Mark the PMC as inactive and invoke the MD stop
* routines if needed.
*/
if ((error = pmc_find_pmc(pmcid, &pm)) != 0)
break;
KASSERT(pmcid == pm->pm_id,
("[pmc,%d] pmc id %x != pmcid %x", __LINE__,
pm->pm_id, pmcid));
if (pm->pm_state == PMC_STATE_STOPPED) /* already stopped */
break;
else if (pm->pm_state != PMC_STATE_RUNNING) {
error = EINVAL;
break;
}
error = pmc_stop(pm);
}
break;
/*
* Write a user supplied value to the log file.
*/
case PMC_OP_WRITELOG:
{
struct pmc_op_writelog wl;
struct pmc_owner *po;
PMC_DOWNGRADE_SX();
if ((error = copyin(arg, &wl, sizeof(wl))) != 0)
break;
if ((po = pmc_find_owner_descriptor(td->td_proc)) == NULL) {
error = EINVAL;
break;
}
if ((po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) {
error = EINVAL;
break;
}
error = pmclog_process_userlog(po, &wl);
}
break;
default:
error = EINVAL;
break;
}
if (is_sx_locked != 0) {
if (is_sx_downgraded)
sx_sunlock(&pmc_sx);
else
sx_xunlock(&pmc_sx);
}
if (error)
atomic_add_int(&pmc_stats.pm_syscall_errors, 1);
PICKUP_GIANT();
return error;
}
/*
* Helper functions
*/
/*
* Mark the thread as needing callchain capture and post an AST. The
* actual callchain capture will be done in a context where it is safe
* to take page faults.
*/
static void
pmc_post_callchain_callback(void)
{
struct thread *td;
td = curthread;
/*
* If there are multiple PMCs for the same interrupt, ignore this new posting.
*/
if (td->td_pflags & TDP_CALLCHAIN)
return;
/*
* Mark this thread as needing callchain capture.
* `td->td_pflags' will be safe to touch because this thread
* was in user space when it was interrupted.
*/
td->td_pflags |= TDP_CALLCHAIN;
/*
* Don't let this thread migrate between CPUs until callchain
* capture completes.
*/
sched_pin();
return;
}
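/*
* The sched_pin() above is balanced by the sched_unpin() at the top
* of pmc_capture_user_callchain(), which runs from ast() on the same
* CPU once it is safe to take page faults.
*/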
/*
* Interrupt processing.
*
* Find a free slot in the per-cpu array of samples and capture the
* current callchain there. If a sample was successfully added, a bit
* is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook
* needs to be invoked from the clock handler.
*
* This function is meant to be called from an NMI handler. It cannot
* use any of the locking primitives supplied by the OS.
*/
int
pmc_process_interrupt(int cpu, struct pmc *pm, struct trapframe *tf,
int inuserspace)
{
int error, callchaindepth;
struct thread *td;
struct pmc_sample *ps;
struct pmc_samplebuffer *psb;
error = 0;
/*
* Grab a free slot in this CPU's sample buffer.
*/
psb = pmc_pcpu[cpu]->pc_sb;
ps = psb->ps_write;
if (ps->ps_nsamples) { /* in use, reader hasn't caught up */
pm->pm_stalled = 1;
atomic_add_int(&pmc_stats.pm_intr_bufferfull, 1);
PMCDBG(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d",
cpu, pm, (void *) tf, inuserspace,
(int) (psb->ps_write - psb->ps_samples),
(int) (psb->ps_read - psb->ps_samples));
error = ENOMEM;
goto done;
}
/* Fill in entry. */
PMCDBG(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm,
(void *) tf, inuserspace,
(int) (psb->ps_write - psb->ps_samples),
(int) (psb->ps_read - psb->ps_samples));
KASSERT(pm->pm_runcount >= 0,
("[pmc,%d] pm=%p runcount %d", __LINE__, (void *) pm,
pm->pm_runcount));
atomic_add_rel_int(&pm->pm_runcount, 1); /* hold onto PMC */
ps->ps_pmc = pm;
if ((td = curthread) && td->td_proc)
ps->ps_pid = td->td_proc->p_pid;
else
ps->ps_pid = -1;
ps->ps_cpu = cpu;
ps->ps_td = td;
ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0;
callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ?
pmc_callchaindepth : 1;
if (callchaindepth == 1)
ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf);
else {
/*
* Kernel stack traversals can be done immediately,
* while we defer to an AST for user space traversals.
*/
if (!inuserspace)
callchaindepth =
pmc_save_kernel_callchain(ps->ps_pc,
callchaindepth, tf);
else {
pmc_post_callchain_callback();
callchaindepth = PMC_SAMPLE_INUSE;
}
}
ps->ps_nsamples = callchaindepth; /* mark entry as in use */
/* increment write pointer, modulo ring buffer size */
ps++;
if (ps == psb->ps_fence)
psb->ps_write = psb->ps_samples;
else
psb->ps_write = ps;
done:
/* mark CPU as needing processing */
CPU_SET_ATOMIC(cpu, &pmc_cpumask);
return (error);
}
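/*
* In effect the per-CPU sample buffer is a single-producer /
* single-consumer ring: 'ps_write' only advances here, 'ps_read'
* only advances in pmc_process_samples(), and both wrap at
* 'ps_fence'.  A slot is free when its 'ps_nsamples' is zero (the
* consumer resets it), so a non-zero value at the write pointer
* means the ring is full and the sample is dropped with the PMC
* marked stalled, as handled at the top of the function above.
*/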
/*
* Capture a user call chain. This function will be called from ast()
* before control returns to userland and before the process gets
* rescheduled.
*/
static void
pmc_capture_user_callchain(int cpu, struct trapframe *tf)
{
int i;
struct pmc *pm;
struct thread *td;
struct pmc_sample *ps;
struct pmc_samplebuffer *psb;
#ifdef INVARIANTS
int ncallchains;
#endif
sched_unpin(); /* Can migrate safely now. */
psb = pmc_pcpu[cpu]->pc_sb;
td = curthread;
KASSERT(td->td_pflags & TDP_CALLCHAIN,
("[pmc,%d] Retrieving callchain for thread that doesn't want it",
__LINE__));
#ifdef INVARIANTS
ncallchains = 0;
#endif
/*
* Iterate through all deferred callchain requests.
*/
ps = psb->ps_samples;
for (i = 0; i < pmc_nsamples; i++, ps++) {
if (ps->ps_nsamples != PMC_SAMPLE_INUSE)
continue;
if (ps->ps_td != td)
continue;
KASSERT(ps->ps_cpu == cpu,
("[pmc,%d] cpu mismatch ps_cpu=%d pcpu=%d", __LINE__,
ps->ps_cpu, PCPU_GET(cpuid)));
pm = ps->ps_pmc;
KASSERT(pm->pm_flags & PMC_F_CALLCHAIN,
("[pmc,%d] Retrieving callchain for PMC that doesn't "
"want it", __LINE__));
KASSERT(pm->pm_runcount > 0,
("[pmc,%d] runcount %d", __LINE__, pm->pm_runcount));
/*
* Retrieve the callchain and mark the sample buffer
* as 'processable' by the timer tick sweep code.
*/
ps->ps_nsamples = pmc_save_user_callchain(ps->ps_pc,
pmc_callchaindepth, tf);
#ifdef INVARIANTS
ncallchains++;
#endif
}
KASSERT(ncallchains > 0,
("[pmc,%d] cpu %d didn't find a sample to collect", __LINE__,
cpu));
return;
}
/*
* Process saved PC samples.
*/
static void
pmc_process_samples(int cpu)
{
struct pmc *pm;
int adjri, n;
struct thread *td;
struct pmc_owner *po;
struct pmc_sample *ps;
struct pmc_classdep *pcd;
struct pmc_samplebuffer *psb;
KASSERT(PCPU_GET(cpuid) == cpu,
("[pmc,%d] not on the correct CPU pcpu=%d cpu=%d", __LINE__,
PCPU_GET(cpuid), cpu));
psb = pmc_pcpu[cpu]->pc_sb;
for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */
ps = psb->ps_read;
if (ps->ps_nsamples == PMC_SAMPLE_FREE)
break;
if (ps->ps_nsamples == PMC_SAMPLE_INUSE) {
/* Need a rescan at a later time. */
CPU_SET_ATOMIC(cpu, &pmc_cpumask);
break;
}
pm = ps->ps_pmc;
KASSERT(pm->pm_runcount > 0,
("[pmc,%d] pm=%p runcount %d", __LINE__, (void *) pm,
pm->pm_runcount));
po = pm->pm_owner;
KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)),
("[pmc,%d] pmc=%p non-sampling mode=%d", __LINE__,
pm, PMC_TO_MODE(pm)));
/* Ignore PMCs that have been switched off */
if (pm->pm_state != PMC_STATE_RUNNING)
goto entrydone;
PMCDBG(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu,
pm, ps->ps_nsamples, ps->ps_flags,
(int) (psb->ps_write - psb->ps_samples),
(int) (psb->ps_read - psb->ps_samples));
/*
* If this is a process-mode PMC that is attached to
* its owner, and if the PC is in user mode, update
* profiling statistics like timer-based profiling
* would have done.
*/
if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) {
if (ps->ps_flags & PMC_CC_F_USERSPACE) {
td = FIRST_THREAD_IN_PROC(po->po_owner);
addupc_intr(td, ps->ps_pc[0], 1);
}
goto entrydone;
}
/*
* Otherwise, this is either a sampling mode PMC that
* is attached to a different process than its owner,
* or a system-wide sampling PMC. Dispatch a log
* entry to the PMC's owner process.
*/
pmclog_process_callchain(pm, ps);
entrydone:
ps->ps_nsamples = 0; /* mark entry as free */
atomic_subtract_rel_int(&pm->pm_runcount, 1);
/* increment read pointer, modulo sample size */
if (++ps == psb->ps_fence)
psb->ps_read = psb->ps_samples;
else
psb->ps_read = ps;
}
atomic_add_int(&pmc_stats.pm_log_sweeps, 1);
/* Do not re-enable stalled PMCs if we failed to process any samples */
if (n == 0)
return;
/*
* Restart any stalled sampling PMCs on this CPU.
*
* If the NMI handler sets the pm_stalled field of a PMC after
* the check below, we'll end up processing the stalled PMC at
* the next hardclock tick.
*/
for (n = 0; n < md->pmd_npmc; n++) {
pcd = pmc_ri_to_classdep(md, n, &adjri);
KASSERT(pcd != NULL,
("[pmc,%d] null pcd ri=%d", __LINE__, n));
(void) (*pcd->pcd_get_config)(cpu,adjri,&pm);
if (pm == NULL || /* !cfg'ed */
pm->pm_state != PMC_STATE_RUNNING || /* !active */
!PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)) || /* !sampling */
pm->pm_stalled == 0) /* !stalled */
continue;
pm->pm_stalled = 0;
(*pcd->pcd_start_pmc)(cpu, adjri);
}
}
/*
* Event handlers.
*/
/*
* Handle a process exit.
*
* Remove this process from all hash tables. If this process
* owned any PMCs, turn off those PMCs and deallocate them,
* removing any associations with target processes.
*
* This function will be called by the last 'thread' of a
* process.
*
* XXX This eventhandler gets called early in the exit process.
* Consider using a 'hook' invocation from thread_exit() or equivalent
* spot. Another negative is that kse_exit doesn't seem to call
* exit1() [??].
*
*/
static void
pmc_process_exit(void *arg __unused, struct proc *p)
{
struct pmc *pm;
int adjri, cpu;
unsigned int ri;
int is_using_hwpmcs;
struct pmc_owner *po;
struct pmc_process *pp;
struct pmc_classdep *pcd;
pmc_value_t newvalue, tmp;
PROC_LOCK(p);
is_using_hwpmcs = p->p_flag & P_HWPMC;
PROC_UNLOCK(p);
/*
* Log a sysexit event to all SS PMC owners.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_sysexit(po, p->p_pid);
if (!is_using_hwpmcs)
return;
PMC_GET_SX_XLOCK();
PMCDBG(PRC,EXT,1,"process-exit proc=%p (%d, %s)", p, p->p_pid,
p->p_comm);
/*
* Since this code is invoked by the last thread in an exiting
* process, we would have context switched IN at some prior
* point. However, with PREEMPTION, kernel mode context
* switches may happen any time, so we want to disable a
* context switch OUT till we get any PMCs targeting this
* process off the hardware.
*
* We also need to atomically remove this process'
* entry from our target process hash table, using
* PMC_FLAG_REMOVE.
*/
PMCDBG(PRC,EXT,1, "process-exit proc=%p (%d, %s)", p, p->p_pid,
p->p_comm);
critical_enter(); /* no preemption */
cpu = curthread->td_oncpu;
if ((pp = pmc_find_process_descriptor(p,
PMC_FLAG_REMOVE)) != NULL) {
PMCDBG(PRC,EXT,2,
"process-exit proc=%p pmc-process=%p", p, pp);
/*
* The exiting process could be the target of
* some PMCs which will be running on the
* currently executing CPU.
*
* We need to turn these PMCs off like we
* would do at context switch OUT time.
*/
for (ri = 0; ri < md->pmd_npmc; ri++) {
/*
* Pick up the pmc pointer from hardware
* state similar to the CSW_OUT code.
*/
pm = NULL;
pcd = pmc_ri_to_classdep(md, ri, &adjri);
(void) (*pcd->pcd_get_config)(cpu, adjri, &pm);
PMCDBG(PRC,EXT,2, "ri=%d pm=%p", ri, pm);
if (pm == NULL ||
!PMC_IS_VIRTUAL_MODE(PMC_TO_MODE(pm)))
continue;
PMCDBG(PRC,EXT,2, "ppmcs[%d]=%p pm=%p "
"state=%d", ri, pp->pp_pmcs[ri].pp_pmc,
pm, pm->pm_state);
KASSERT(PMC_TO_ROWINDEX(pm) == ri,
("[pmc,%d] ri mismatch pmc(%d) ri(%d)",
__LINE__, PMC_TO_ROWINDEX(pm), ri));
KASSERT(pm == pp->pp_pmcs[ri].pp_pmc,
("[pmc,%d] pm %p != pp_pmcs[%d] %p",
__LINE__, pm, ri, pp->pp_pmcs[ri].pp_pmc));
(void) pcd->pcd_stop_pmc(cpu, adjri);
KASSERT(pm->pm_runcount > 0,
("[pmc,%d] bad runcount ri %d rc %d",
__LINE__, ri, pm->pm_runcount));
/* Stop hardware only if it is actually running */
if (pm->pm_state == PMC_STATE_RUNNING &&
pm->pm_stalled == 0) {
pcd->pcd_read_pmc(cpu, adjri, &newvalue);
tmp = newvalue -
PMC_PCPU_SAVED(cpu,ri);
mtx_pool_lock_spin(pmc_mtxpool, pm);
pm->pm_gv.pm_savedvalue += tmp;
pp->pp_pmcs[ri].pp_pmcval += tmp;
mtx_pool_unlock_spin(pmc_mtxpool, pm);
}
atomic_subtract_rel_int(&pm->pm_runcount,1);
KASSERT((int) pm->pm_runcount >= 0,
("[pmc,%d] runcount is %d", __LINE__, ri));
(void) pcd->pcd_config_pmc(cpu, adjri, NULL);
}
/*
* Inform the MD layer of this pseudo "context switch
* out"
*/
(void) md->pmd_switch_out(pmc_pcpu[cpu], pp);
critical_exit(); /* ok to be pre-empted now */
/*
* Unlink this process from the PMCs that are
* targeting it. This will send a signal to
* all PMC owners whose PMCs are orphaned.
*
* Log PMC value at exit time if requested.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = pp->pp_pmcs[ri].pp_pmc) != NULL) {
if (pm->pm_flags & PMC_F_NEEDS_LOGFILE &&
PMC_IS_COUNTING_MODE(PMC_TO_MODE(pm)))
pmclog_process_procexit(pm, pp);
pmc_unlink_target_process(pm, pp);
}
free(pp, M_PMC);
} else
critical_exit(); /* pp == NULL */
/*
* If the process owned PMCs, free them up and free up
* memory.
*/
if ((po = pmc_find_owner_descriptor(p)) != NULL) {
pmc_remove_owner(po);
pmc_destroy_owner_descriptor(po);
}
sx_xunlock(&pmc_sx);
}
/*
* Handle a process fork.
*
* If the parent process 'p1' is under HWPMC monitoring, then copy
* over any attached PMCs that have 'do_descendants' semantics.
*/
static void
pmc_process_fork(void *arg __unused, struct proc *p1, struct proc *newproc,
int flags)
{
int is_using_hwpmcs;
unsigned int ri;
uint32_t do_descendants;
struct pmc *pm;
struct pmc_owner *po;
struct pmc_process *ppnew, *ppold;
(void) flags; /* unused parameter */
PROC_LOCK(p1);
is_using_hwpmcs = p1->p_flag & P_HWPMC;
PROC_UNLOCK(p1);
/*
* If there are system-wide sampling PMCs active, we need to
* log all fork events to their owner's logs.
*/
LIST_FOREACH(po, &pmc_ss_owners, po_ssnext)
if (po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procfork(po, p1->p_pid, newproc->p_pid);
if (!is_using_hwpmcs)
return;
PMC_GET_SX_XLOCK();
PMCDBG(PMC,FRK,1, "process-fork proc=%p (%d, %s) -> %p", p1,
p1->p_pid, p1->p_comm, newproc);
/*
* If the parent process (curthread->td_proc) is a
* target of any PMCs, look for PMCs that are to be
* inherited, and link these into the new process
* descriptor.
*/
if ((ppold = pmc_find_process_descriptor(curthread->td_proc,
PMC_FLAG_NONE)) == NULL)
goto done; /* nothing to do */
do_descendants = 0;
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL)
do_descendants |= pm->pm_flags & PMC_F_DESCENDANTS;
if (do_descendants == 0) /* nothing to do */
goto done;
/* allocate a descriptor for the new process */
if ((ppnew = pmc_find_process_descriptor(newproc,
PMC_FLAG_ALLOCATE)) == NULL)
goto done;
/*
* Run through all PMCs that were targeting the old process
* and which specified F_DESCENDANTS and attach them to the
* new process.
*
* Log the fork event to all owners of PMCs attached to this
* process, if not already logged.
*/
for (ri = 0; ri < md->pmd_npmc; ri++)
if ((pm = ppold->pp_pmcs[ri].pp_pmc) != NULL &&
(pm->pm_flags & PMC_F_DESCENDANTS)) {
pmc_link_target_process(pm, ppnew);
po = pm->pm_owner;
if (po->po_sscount == 0 &&
po->po_flags & PMC_PO_OWNS_LOGFILE)
pmclog_process_procfork(po, p1->p_pid,
newproc->p_pid);
}
/*
* Now mark the new process as being tracked by this driver.
*/
PROC_LOCK(newproc);
newproc->p_flag |= P_HWPMC;
PROC_UNLOCK(newproc);
done:
sx_xunlock(&pmc_sx);
}
/*
* initialization
*/
static const char *pmc_name_of_pmcclass[] = {
#undef __PMC_CLASS
#define __PMC_CLASS(N) #N ,
__PMC_CLASSES()
};
static int
pmc_initialize(void)
{
int c, cpu, error, n, ri;
unsigned int maxcpu;
struct pmc_binding pb;
struct pmc_sample *ps;
struct pmc_classdep *pcd;
struct pmc_samplebuffer *sb;
md = NULL;
error = 0;
#ifdef DEBUG
/* parse debug flags first */
if (TUNABLE_STR_FETCH(PMC_SYSCTL_NAME_PREFIX "debugflags",
pmc_debugstr, sizeof(pmc_debugstr)))
pmc_debugflags_parse(pmc_debugstr,
pmc_debugstr+strlen(pmc_debugstr));
#endif
PMCDBG(MOD,INI,0, "PMC Initialize (version %x)", PMC_VERSION);
/* check kernel version */
if (pmc_kernel_version != PMC_VERSION) {
if (pmc_kernel_version == 0)
printf("hwpmc: this kernel has not been compiled with "
"'options HWPMC_HOOKS'.\n");
else
printf("hwpmc: kernel version (0x%x) does not match "
"module version (0x%x).\n", pmc_kernel_version,
PMC_VERSION);
return EPROGMISMATCH;
}
/*
* check sysctl parameters
*/
if (pmc_hashsize <= 0) {
(void) printf("hwpmc: tunable \"hashsize\"=%d must be "
"greater than zero.\n", pmc_hashsize);
pmc_hashsize = PMC_HASH_SIZE;
}
if (pmc_nsamples <= 0 || pmc_nsamples > 65535) {
(void) printf("hwpmc: tunable \"nsamples\"=%d out of "
"range.\n", pmc_nsamples);
pmc_nsamples = PMC_NSAMPLES;
}
if (pmc_callchaindepth <= 0 ||
pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) {
(void) printf("hwpmc: tunable \"callchaindepth\"=%d out of "
"range.\n", pmc_callchaindepth);
pmc_callchaindepth = PMC_CALLCHAIN_DEPTH;
}
md = pmc_md_initialize();
if (md == NULL)
return (ENOSYS);
KASSERT(md->pmd_nclass >= 1 && md->pmd_npmc >= 1,
("[pmc,%d] no classes or pmcs", __LINE__));
/* Compute the map from row-indices to classdep pointers. */
pmc_rowindex_to_classdep = malloc(sizeof(struct pmc_classdep *) *
md->pmd_npmc, M_PMC, M_WAITOK|M_ZERO);
for (n = 0; n < md->pmd_npmc; n++)
pmc_rowindex_to_classdep[n] = NULL;
for (ri = c = 0; c < md->pmd_nclass; c++) {
pcd = &md->pmd_classdep[c];
for (n = 0; n < pcd->pcd_num; n++, ri++)
pmc_rowindex_to_classdep[ri] = pcd;
}
KASSERT(ri == md->pmd_npmc,
("[pmc,%d] npmc miscomputed: ri=%d, md->npmc=%d", __LINE__,
ri, md->pmd_npmc));
maxcpu = pmc_cpu_max();
/* allocate space for the per-cpu array */
pmc_pcpu = malloc(maxcpu * sizeof(struct pmc_cpu *), M_PMC,
M_WAITOK|M_ZERO);
/* per-cpu 'saved values' for managing process-mode PMCs */
pmc_pcpu_saved = malloc(sizeof(pmc_value_t) * maxcpu * md->pmd_npmc,
M_PMC, M_WAITOK);
/* Perform CPU-dependent initialization. */
pmc_save_cpu_binding(&pb);
error = 0;
for (cpu = 0; error == 0 && cpu < maxcpu; cpu++) {
if (!pmc_cpu_is_active(cpu))
continue;
pmc_select_cpu(cpu);
pmc_pcpu[cpu] = malloc(sizeof(struct pmc_cpu) +
md->pmd_npmc * sizeof(struct pmc_hw *), M_PMC,
M_WAITOK|M_ZERO);
if (md->pmd_pcpu_init)
error = md->pmd_pcpu_init(md, cpu);
for (n = 0; error == 0 && n < md->pmd_nclass; n++)
error = md->pmd_classdep[n].pcd_pcpu_init(md, cpu);
}
pmc_restore_cpu_binding(&pb);
if (error)
return (error);
/* allocate space for the sample array */
for (cpu = 0; cpu < maxcpu; cpu++) {
if (!pmc_cpu_is_active(cpu))
continue;
sb = malloc(sizeof(struct pmc_samplebuffer) +
pmc_nsamples * sizeof(struct pmc_sample), M_PMC,
M_WAITOK|M_ZERO);
sb->ps_read = sb->ps_write = sb->ps_samples;
sb->ps_fence = sb->ps_samples + pmc_nsamples;
KASSERT(pmc_pcpu[cpu] != NULL,
("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu));
sb->ps_callchains = malloc(pmc_callchaindepth * pmc_nsamples *
sizeof(uintptr_t), M_PMC, M_WAITOK|M_ZERO);
for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++)
ps->ps_pc = sb->ps_callchains +
(n * pmc_callchaindepth);
pmc_pcpu[cpu]->pc_sb = sb;
}
/* allocate space for the row disposition array */
pmc_pmcdisp = malloc(sizeof(enum pmc_mode) * md->pmd_npmc,
M_PMC, M_WAITOK|M_ZERO);
KASSERT(pmc_pmcdisp != NULL,
("[pmc,%d] pmcdisp allocation returned NULL", __LINE__));
/* mark all PMCs as available */
for (n = 0; n < (int) md->pmd_npmc; n++)
PMC_MARK_ROW_FREE(n);
/* allocate thread hash tables */
pmc_ownerhash = hashinit(pmc_hashsize, M_PMC,
&pmc_ownerhashmask);
pmc_processhash = hashinit(pmc_hashsize, M_PMC,
&pmc_processhashmask);
mtx_init(&pmc_processhash_mtx, "pmc-process-hash", "pmc-leaf",
MTX_SPIN);
LIST_INIT(&pmc_ss_owners);
pmc_ss_count = 0;
/* allocate a pool of spin mutexes */
pmc_mtxpool = mtx_pool_create("pmc-leaf", pmc_mtxpool_size,
MTX_SPIN);
PMCDBG(MOD,INI,1, "pmc_ownerhash=%p, mask=0x%lx "
"targethash=%p mask=0x%lx", pmc_ownerhash, pmc_ownerhashmask,
pmc_processhash, pmc_processhashmask);
/* register process {exit,fork,exec} handlers */
pmc_exit_tag = EVENTHANDLER_REGISTER(process_exit,
pmc_process_exit, NULL, EVENTHANDLER_PRI_ANY);
pmc_fork_tag = EVENTHANDLER_REGISTER(process_fork,
pmc_process_fork, NULL, EVENTHANDLER_PRI_ANY);
/* initialize logging */
pmclog_initialize();
/* set hook functions */
pmc_intr = md->pmd_intr;
pmc_hook = pmc_hook_handler;
if (error == 0) {
printf(PMC_MODULE_NAME ":");
for (n = 0; n < (int) md->pmd_nclass; n++) {
pcd = &md->pmd_classdep[n];
printf(" %s/%d/%d/0x%b",
pmc_name_of_pmcclass[pcd->pcd_class],
pcd->pcd_num,
pcd->pcd_width,
pcd->pcd_caps,
"\20"
"\1INT\2USR\3SYS\4EDG\5THR"
"\6REA\7WRI\10INV\11QUA\12PRC"
"\13TAG\14CSC");
}
printf("\n");
}
return (error);
}
/* prepare to be unloaded */
static void
pmc_cleanup(void)
{
int c, cpu;
unsigned int maxcpu;
struct pmc_ownerhash *ph;
struct pmc_owner *po, *tmp;
struct pmc_binding pb;
#ifdef DEBUG
struct pmc_processhash *prh;
#endif
PMCDBG(MOD,INI,0, "%s", "cleanup");
/* switch off sampling */
CPU_ZERO(&pmc_cpumask);
pmc_intr = NULL;
sx_xlock(&pmc_sx);
if (pmc_hook == NULL) { /* being unloaded already */
sx_xunlock(&pmc_sx);
return;
}
pmc_hook = NULL; /* prevent new threads from entering module */
/* deregister event handlers */
EVENTHANDLER_DEREGISTER(process_fork, pmc_fork_tag);
EVENTHANDLER_DEREGISTER(process_exit, pmc_exit_tag);
/* send SIGBUS to all owner threads, free up allocations */
if (pmc_ownerhash)
for (ph = pmc_ownerhash;
ph <= &pmc_ownerhash[pmc_ownerhashmask];
ph++) {
LIST_FOREACH_SAFE(po, ph, po_next, tmp) {
pmc_remove_owner(po);
/* send SIGBUS to owner processes */
PMCDBG(MOD,INI,2, "cleanup signal proc=%p "
"(%d, %s)", po->po_owner,
po->po_owner->p_pid,
po->po_owner->p_comm);
PROC_LOCK(po->po_owner);
- psignal(po->po_owner, SIGBUS);
+ kern_psignal(po->po_owner, SIGBUS);
PROC_UNLOCK(po->po_owner);
pmc_destroy_owner_descriptor(po);
}
}
/* reclaim allocated data structures */
if (pmc_mtxpool)
mtx_pool_destroy(&pmc_mtxpool);
mtx_destroy(&pmc_processhash_mtx);
if (pmc_processhash) {
#ifdef DEBUG
struct pmc_process *pp;
PMCDBG(MOD,INI,3, "%s", "destroy process hash");
for (prh = pmc_processhash;
prh <= &pmc_processhash[pmc_processhashmask];
prh++)
LIST_FOREACH(pp, prh, pp_next)
PMCDBG(MOD,INI,3, "pid=%d", pp->pp_proc->p_pid);
#endif
hashdestroy(pmc_processhash, M_PMC, pmc_processhashmask);
pmc_processhash = NULL;
}
if (pmc_ownerhash) {
PMCDBG(MOD,INI,3, "%s", "destroy owner hash");
hashdestroy(pmc_ownerhash, M_PMC, pmc_ownerhashmask);
pmc_ownerhash = NULL;
}
KASSERT(LIST_EMPTY(&pmc_ss_owners),
("[pmc,%d] Global SS owner list not empty", __LINE__));
KASSERT(pmc_ss_count == 0,
("[pmc,%d] Global SS count not empty", __LINE__));
/* do processor and pmc-class dependent cleanup */
maxcpu = pmc_cpu_max();
PMCDBG(MOD,INI,3, "%s", "md cleanup");
if (md) {
pmc_save_cpu_binding(&pb);
for (cpu = 0; cpu < maxcpu; cpu++) {
PMCDBG(MOD,INI,1,"pmc-cleanup cpu=%d pcs=%p",
cpu, pmc_pcpu[cpu]);
if (!pmc_cpu_is_active(cpu) || pmc_pcpu[cpu] == NULL)
continue;
pmc_select_cpu(cpu);
for (c = 0; c < md->pmd_nclass; c++)
md->pmd_classdep[c].pcd_pcpu_fini(md, cpu);
if (md->pmd_pcpu_fini)
md->pmd_pcpu_fini(md, cpu);
}
pmc_md_finalize(md);
free(md, M_PMC);
md = NULL;
pmc_restore_cpu_binding(&pb);
}
/* Free per-cpu descriptors. */
for (cpu = 0; cpu < maxcpu; cpu++) {
if (!pmc_cpu_is_active(cpu))
continue;
KASSERT(pmc_pcpu[cpu]->pc_sb != NULL,
("[pmc,%d] Null cpu sample buffer cpu=%d", __LINE__,
cpu));
free(pmc_pcpu[cpu]->pc_sb->ps_callchains, M_PMC);
free(pmc_pcpu[cpu]->pc_sb, M_PMC);
free(pmc_pcpu[cpu], M_PMC);
}
free(pmc_pcpu, M_PMC);
pmc_pcpu = NULL;
free(pmc_pcpu_saved, M_PMC);
pmc_pcpu_saved = NULL;
if (pmc_pmcdisp) {
free(pmc_pmcdisp, M_PMC);
pmc_pmcdisp = NULL;
}
if (pmc_rowindex_to_classdep) {
free(pmc_rowindex_to_classdep, M_PMC);
pmc_rowindex_to_classdep = NULL;
}
pmclog_shutdown();
sx_xunlock(&pmc_sx); /* we are done */
}
/*
* The function called at load/unload.
*/
static int
load (struct module *module __unused, int cmd, void *arg __unused)
{
int error;
error = 0;
switch (cmd) {
case MOD_LOAD :
/* initialize the subsystem */
error = pmc_initialize();
if (error != 0)
break;
PMCDBG(MOD,INI,1, "syscall=%d maxcpu=%d",
pmc_syscall_num, pmc_cpu_max());
break;
case MOD_UNLOAD :
case MOD_SHUTDOWN:
pmc_cleanup();
PMCDBG(MOD,INI,1, "%s", "unloaded");
break;
default :
error = EINVAL; /* XXX should panic(9) */
break;
}
return error;
}
/* memory pool */
MALLOC_DEFINE(M_PMC, "pmc", "Memory space for the PMC module");
Index: head/sys/dev/iscsi/initiator/isc_soc.c
===================================================================
--- head/sys/dev/iscsi/initiator/isc_soc.c (revision 225616)
+++ head/sys/dev/iscsi/initiator/isc_soc.c (revision 225617)
@@ -1,701 +1,701 @@
/*-
* Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
| $Id: isc_soc.c 998 2009-12-20 10:32:45Z danny $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_iscsi_initiator.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/ctype.h>
#include <sys/errno.h>
#include <sys/sysctl.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/socketvar.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/proc.h>
#include <sys/ioccom.h>
#include <sys/queue.h>
#include <sys/kthread.h>
#include <sys/syslog.h>
#include <sys/mbuf.h>
#include <sys/user.h>
#include <cam/cam.h>
#include <cam/cam_ccb.h>
#include <dev/iscsi/initiator/iscsi.h>
#include <dev/iscsi/initiator/iscsivar.h>
#ifndef NO_USE_MBUF
#define USE_MBUF
#endif
#ifdef USE_MBUF
static int ou_refcnt = 0;
/*
| function for freeing external storage for mbuf
*/
static void
ext_free(void *a, void *b)
{
pduq_t *pq = b;
if(pq->buf != NULL) {
debug(3, "ou_refcnt=%d a=%p b=%p", ou_refcnt, a, pq->buf);
free(pq->buf, M_ISCSIBUF);
pq->buf = NULL;
}
}
int
isc_sendPDU(isc_session_t *sp, pduq_t *pq)
{
struct mbuf *mh, **mp;
pdu_t *pp = &pq->pdu;
int len, error;
debug_called(8);
/*
| mbuf for the iSCSI header
*/
MGETHDR(mh, M_TRYWAIT, MT_DATA);
mh->m_pkthdr.rcvif = NULL;
mh->m_next = NULL;
mh->m_len = sizeof(union ipdu_u);
if(ISOK2DIG(sp->hdrDigest, pp)) {
pp->hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0);
mh->m_len += sizeof(pp->hdr_dig);
if(pp->ahs_len) {
debug(2, "ahs_len=%d", pp->ahs_len);
pp->hdr_dig = sp->hdrDigest(&pp->ahs_addr, pp->ahs_len, pp->hdr_dig);
}
debug(3, "pp->hdr_dig=%04x", htonl(pp->hdr_dig));
}
if(pp->ahs_len) {
/*
| Add any AHS to the iSCSI hdr mbuf
*/
if((mh->m_len + pp->ahs_len) < MHLEN) {
MH_ALIGN(mh, mh->m_len + pp->ahs_len);
bcopy(&pp->ipdu, mh->m_data, mh->m_len);
bcopy(pp->ahs_addr, mh->m_data + mh->m_len, pp->ahs_len);
mh->m_len += pp->ahs_len;
}
else
panic("len AHS=%d too big, not impleneted yet", pp->ahs_len);
}
else {
MH_ALIGN(mh, mh->m_len);
bcopy(&pp->ipdu, mh->m_data, mh->m_len);
}
mh->m_pkthdr.len = mh->m_len;
mp = &mh->m_next;
if(pp->ds_len && pq->pdu.ds_addr) {
struct mbuf *md;
int off = 0;
len = pp->ds_len;
while(len > 0) {
int l;
MGET(md, M_TRYWAIT, MT_DATA);
md->m_ext.ref_cnt = &ou_refcnt;
l = min(MCLBYTES, len);
debug(4, "setting ext_free(arg=%p len/l=%d/%d)", pq->buf, len, l);
MEXTADD(md, pp->ds_addr + off, l, ext_free,
#if __FreeBSD_version >= 800000
pp->ds_addr + off,
#endif
pq, 0, EXT_EXTREF);
md->m_len = l;
md->m_next = NULL;
mh->m_pkthdr.len += l;
*mp = md;
mp = &md->m_next;
len -= l;
off += l;
}
if(((pp->ds_len & 03) != 0) || ISOK2DIG(sp->dataDigest, pp)) {
MGET(md, M_TRYWAIT, MT_DATA);
if(pp->ds_len & 03)
len = 4 - (pp->ds_len & 03);
else
len = 0;
md->m_len = len;
if(ISOK2DIG(sp->dataDigest, pp))
md->m_len += sizeof(pp->ds_dig);
M_ALIGN(md, md->m_len);
if(ISOK2DIG(sp->dataDigest, pp)) {
pp->ds_dig = sp->dataDigest(pp->ds_addr, pp->ds_len, 0);
if(len) {
bzero(md->m_data, len); // RFC says SHOULD be 0
pp->ds_dig = sp->dataDigest(md->m_data, len, pp->ds_dig);
}
bcopy(&pp->ds_dig, md->m_data+len, sizeof(pp->ds_dig));
}
md->m_next = NULL;
mh->m_pkthdr.len += md->m_len;
*mp = md;
}
}
if((error = sosend(sp->soc, NULL, NULL, mh, 0, 0, sp->td)) != 0) {
sdebug(2, "error=%d", error);
return error;
}
sp->stats.nsent++;
getbintime(&sp->stats.t_sent);
return 0;
}
#else /* NO_USE_MBUF */
int
isc_sendPDU(isc_session_t *sp, pduq_t *pq)
{
struct uio *uio = &pq->uio;
struct iovec *iv;
pdu_t *pp = &pq->pdu;
int len, error;
debug_called(8);
bzero(uio, sizeof(struct uio));
uio->uio_rw = UIO_WRITE;
uio->uio_segflg = UIO_SYSSPACE;
uio->uio_td = sp->td;
uio->uio_iov = iv = pq->iov;
iv->iov_base = &pp->ipdu;
iv->iov_len = sizeof(union ipdu_u);
uio->uio_resid = iv->iov_len;
iv++;
if(ISOK2DIG(sp->hdrDigest, pp))
pq->pdu.hdr_dig = sp->hdrDigest(&pp->ipdu, sizeof(union ipdu_u), 0);
if(pp->ahs_len) {
iv->iov_base = pp->ahs_addr;
iv->iov_len = pp->ahs_len;
uio->uio_resid += iv->iov_len;
iv++;
if(ISOK2DIG(sp->hdrDigest, pp))
pp->hdr_dig = sp->hdrDigest(&pp->ahs_addr, pp->ahs_len, pp->hdr_dig);
}
if(ISOK2DIG(sp->hdrDigest, pp)) {
debug(3, "hdr_dig=%04x", htonl(pp->hdr_dig));
iv->iov_base = &pp->hdr_dig;
iv->iov_len = sizeof(int);
uio->uio_resid += iv->iov_len ;
iv++;
}
if(pq->pdu.ds_addr && pp->ds_len) {
iv->iov_base = pp->ds_addr;
iv->iov_len = pp->ds_len;
while(iv->iov_len & 03) // the specs say it must be int aligned
iv->iov_len++;
uio->uio_resid += iv->iov_len ;
iv++;
if(ISOK2DIG(sp->dataDigest, pp)) {
pp->ds_dig = sp->dataDigest(pp->ds, pp->ds_len, 0);
iv->iov_base = &pp->ds_dig;
iv->iov_len = sizeof(pp->ds_dig);
uio->uio_resid += iv->iov_len ;
iv++;
}
}
uio->uio_iovcnt = iv - pq->iov;
sdebug(4, "pq->len=%d uio->uio_resid=%d uio->uio_iovcnt=%d", pq->len,
uio->uio_resid,
uio->uio_iovcnt);
sdebug(4, "opcode=%x iovcnt=%d uio_resid=%d itt=%x",
pp->ipdu.bhs.opcode, uio->uio_iovcnt, uio->uio_resid,
ntohl(pp->ipdu.bhs.itt));
sdebug(5, "sp=%p sp->soc=%p uio=%p sp->td=%p",
sp, sp->soc, uio, sp->td);
do {
len = uio->uio_resid;
error = sosend(sp->soc, NULL, uio, 0, 0, 0, sp->td);
if(uio->uio_resid == 0 || error || len == uio->uio_resid) {
if(uio->uio_resid) {
sdebug(2, "uio->uio_resid=%d uio->uio_iovcnt=%d error=%d len=%d",
uio->uio_resid, uio->uio_iovcnt, error, len);
if(error == 0)
error = EAGAIN; // 35
}
break;
}
/*
| XXX: untested code
*/
sdebug(1, "uio->uio_resid=%d uio->uio_iovcnt=%d",
uio->uio_resid, uio->uio_iovcnt);
iv = uio->uio_iov;
len -= uio->uio_resid;
while(uio->uio_iovcnt > 0) {
if(iv->iov_len > len) {
caddr_t bp = (caddr_t)iv->iov_base;
iv->iov_len -= len;
iv->iov_base = (void *)&bp[len];
break;
}
len -= iv->iov_len;
uio->uio_iovcnt--;
uio->uio_iov++;
iv++;
}
} while(uio->uio_resid);
if(error == 0) {
sp->stats.nsent++;
getbintime(&sp->stats.t_sent);
}
return error;
}
#endif /* USE_MBUF */
/*
| wait till a PDU header is received
| from the socket.
*/
/*
 The format of the BHS is:

 Byte/      0      |       1       |       2       |       3       |
    /              |               |               |               |
   |0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|0 1 2 3 4 5 6 7|
   +---------------+---------------+---------------+---------------+
  0|.|I| Opcode    |F| Opcode-specific fields                      |
   +---------------+---------------+---------------+---------------+
  4|TotalAHSLength | DataSegmentLength                             |
   +---------------+---------------+---------------+---------------+
  8| LUN or Opcode-specific fields                                 |
   +                                                               +
 12|                                                               |
   +---------------+---------------+---------------+---------------+
 16| Initiator Task Tag                                            |
   +---------------+---------------+---------------+---------------+
 20/ Opcode-specific fields                                        /
  +/                                                               /
   +---------------+---------------+---------------+---------------+
 48
*/
static __inline int
so_getbhs(isc_session_t *sp)
{
bhs_t *bhs = &sp->bhs;
struct uio *uio = &sp->uio;
struct iovec *iov = &sp->iov;
int error, flags;
debug_called(8);
iov->iov_base = bhs;
iov->iov_len = sizeof(bhs_t);
uio->uio_iov = iov;
uio->uio_iovcnt = 1;
uio->uio_rw = UIO_READ;
uio->uio_segflg = UIO_SYSSPACE;
uio->uio_td = curthread; // why ...
uio->uio_resid = sizeof(bhs_t);
flags = MSG_WAITALL;
error = soreceive(sp->soc, NULL, uio, 0, 0, &flags);
if(error)
debug(2,
#if __FreeBSD_version > 800000
"error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd",
#else
"error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd",
#endif
error,
sp->soc->so_error, uio->uio_resid, iov->iov_len);
if(!error && (uio->uio_resid > 0)) {
error = EPIPE; // was EAGAIN
debug(2,
#if __FreeBSD_version > 800000
"error=%d so_error=%d uio->uio_resid=%zd iov.iov_len=%zd so_state=%x",
#else
"error=%d so_error=%d uio->uio_resid=%d iov.iov_len=%zd so_state=%x",
#endif
error,
sp->soc->so_error, uio->uio_resid, iov->iov_len, sp->soc->so_state);
}
return error;
}
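/*
 * A minimal illustrative sketch (hypothetical helper, not part of the
 * driver): per the BHS layout pictured above, DataSegmentLength is a
 * 24-bit big-endian field occupying header bytes 5-7.  Reading it
 * straight from the raw header bytes yields the same value that
 * so_recv() below reconstructs by byte-swapping bhs->DSLength on
 * little-endian hosts.
 */
static __inline u_int
bhs_ds_length(const uint8_t *raw_bhs)
{

        return (((u_int)raw_bhs[5] << 16) |
            ((u_int)raw_bhs[6] << 8) |
            (u_int)raw_bhs[7]);
}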
/*
| so_recv gets called when
| an iSCSI header has been received.
| Note: the designers had no intention
| of making the programmer's life easy.
*/
static int
so_recv(isc_session_t *sp, pduq_t *pq)
{
sn_t *sn = &sp->sn;
struct uio *uio = &pq->uio;
pdu_t *pp = &pq->pdu;
bhs_t *bhs = &pp->ipdu.bhs;
struct iovec *iov = pq->iov;
int error;
u_int len;
u_int max, exp;
int flags = MSG_WAITALL;
debug_called(8);
/*
| now calculate how much data should be in the buffer
*/
uio->uio_iov = iov;
uio->uio_iovcnt = 0;
len = 0;
if(bhs->AHSLength) {
debug(2, "bhs->AHSLength=%d", bhs->AHSLength);
pp->ahs_len = bhs->AHSLength * 4;
len += pp->ahs_len;
pp->ahs_addr = malloc(pp->ahs_len, M_TEMP, M_WAITOK); // XXX: could get stuck here
iov->iov_base = pp->ahs_addr;
iov->iov_len = pp->ahs_len;
uio->uio_iovcnt++;
iov++;
}
if(ISOK2DIG(sp->hdrDigest, pp)) {
len += sizeof(pp->hdr_dig);
iov->iov_base = &pp->hdr_dig;
iov->iov_len = sizeof(pp->hdr_dig);
uio->uio_iovcnt++;
}
if(len) {
uio->uio_rw = UIO_READ;
uio->uio_segflg = UIO_SYSSPACE;
uio->uio_resid = len;
uio->uio_td = sp->td; // why ...
error = soreceive(sp->soc, NULL, uio, NULL, NULL, &flags);
//if(error == EAGAIN)
// XXX: this needs work! it hangs iscontrol
if(error || uio->uio_resid) {
debug(2,
#if __FreeBSD_version > 800000
"len=%d error=%d uio->uio_resid=%zd",
#else
"len=%d error=%d uio->uio_resid=%d",
#endif
len, error, uio->uio_resid);
goto out;
}
if(ISOK2DIG(sp->hdrDigest, pp)) {
bhs_t *bhs;
u_int digest;
bhs = (bhs_t *)&pp->ipdu;
digest = sp->hdrDigest(bhs, sizeof(bhs_t), 0);
if(pp->ahs_len)
digest = sp->hdrDigest(pp->ahs_addr, pp->ahs_len, digest);
if(pp->hdr_dig != digest) {
debug(2, "bad header digest: received=%x calculated=%x", pp->hdr_dig, digest);
// XXX: now what?
error = EIO;
goto out;
}
}
if(pp->ahs_len) {
debug(2, "ahs len=%x type=%x spec=%x",
pp->ahs_addr->len, pp->ahs_addr->type, pp->ahs_addr->spec);
// XXX: till I figure out what to do with this
free(pp->ahs_addr, M_TEMP);
}
pq->len += len; // XXX: who needs this?
bzero(uio, sizeof(struct uio));
len = 0;
}
if(bhs->DSLength) {
len = bhs->DSLength;
#if BYTE_ORDER == LITTLE_ENDIAN
len = ((len & 0x00ff0000) >> 16)
| (len & 0x0000ff00)
| ((len & 0x000000ff) << 16);
#endif
pp->ds_len = len;
if((sp->opt.maxRecvDataSegmentLength > 0) && (len > sp->opt.maxRecvDataSegmentLength)) {
xdebug("impossible PDU length(%d) opt.maxRecvDataSegmentLength=%d",
len, sp->opt.maxRecvDataSegmentLength);
log(LOG_ERR,
"so_recv: impossible PDU length(%d) from iSCSI %s/%s\n",
len, sp->opt.targetAddress, sp->opt.targetName);
/*
| XXX: this will really screw up the stream.
| should clear up the buffer till a valid header
| is found, or just close connection ...
| should read the RFC.
*/
error = E2BIG;
goto out;
}
while(len & 03)
len++;
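/*
 * len is now the data segment length rounded up to a 4-byte boundary
 * (equivalently, len = (len + 3) & ~3), since iSCSI pads data
 * segments to a 4-byte boundary on the wire.
 */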
if(ISOK2DIG(sp->dataDigest, pp))
len += 4;
uio->uio_resid = len;
uio->uio_td = sp->td; // why ...
pq->len += len; // XXX: do we need this?
error = soreceive(sp->soc, NULL, uio, &pq->mp, NULL, &flags);
//if(error == EAGAIN)
// XXX: this needs work! it hangs iscontrol
if(error || uio->uio_resid)
goto out;
if(ISOK2DIG(sp->dataDigest, pp)) {
struct mbuf *m;
u_int digest, ds_len, cnt;
// get the received digest
m_copydata(pq->mp,
len - sizeof(pp->ds_dig),
sizeof(pp->ds_dig),
(caddr_t)&pp->ds_dig);
// calculate all mbufs
digest = 0;
ds_len = len - sizeof(pp->ds_dig);
for(m = pq->mp; m != NULL; m = m->m_next) {
cnt = MIN(ds_len, m->m_len);
digest = sp->dataDigest(mtod(m, char *), cnt, digest);
ds_len -= cnt;
if(ds_len == 0)
break;
}
if(digest != pp->ds_dig) {
sdebug(1, "bad data digest: received=%x calculated=%x", pp->ds_dig, digest);
error = EIO; // XXX: find a better error
goto out;
}
KASSERT(ds_len == 0, ("ds_len not zero"));
}
}
sdebug(6, "len=%d] opcode=0x%x ahs_len=0x%x ds_len=0x%x",
pq->len, bhs->opcode, pp->ahs_len, pp->ds_len);
max = ntohl(bhs->MaxCmdSN);
exp = ntohl(bhs->ExpStSN);
if(max < exp - 1 &&
max > exp - _MAXINCR) {
sdebug(2, "bad cmd window size");
error = EIO; // XXX: for now;
goto out; // error
}
if(SNA_GT(max, sn->maxCmd))
sn->maxCmd = max;
if(SNA_GT(exp, sn->expCmd))
sn->expCmd = exp;
/*
| remove from the holding queue packets
| that have been acked and don't need
| further processing.
*/
i_acked_hld(sp, NULL);
sp->cws = sn->maxCmd - sn->expCmd + 1;
return 0;
out:
// XXX: need some work here
if(pp->ahs_len) {
// XXX: till I figure out what to do with this
free(pp->ahs_addr, M_TEMP);
}
xdebug("have a problem, error=%d", error);
pdu_free(sp->isc, pq);
if(!error && uio->uio_resid > 0)
error = EPIPE;
return error;
}
/*
| wait for something to arrive.
| and if the pdu is without errors, process it.
*/
static int
so_input(isc_session_t *sp)
{
pduq_t *pq;
int error;
debug_called(8);
/*
| first read in the iSCSI header
*/
error = so_getbhs(sp);
if(error == 0) {
/*
| now read the rest.
*/
pq = pdu_alloc(sp->isc, M_NOWAIT);
if(pq == NULL) { // XXX: might cause a deadlock ...
debug(2, "out of pdus, wait");
pq = pdu_alloc(sp->isc, M_WAITOK); // OK to WAIT
}
pq->pdu.ipdu.bhs = sp->bhs;
pq->len = sizeof(bhs_t); // so far only the header was read
error = so_recv(sp, pq);
if(error != 0) {
error += 0x800; // XXX: just to see the error.
// terminal error
// XXX: close connection and exit
}
else {
sp->stats.nrecv++;
getbintime(&sp->stats.t_recv);
ism_recv(sp, pq);
}
}
return error;
}
/*
| one per active (connected) session.
| this thread is responsible for reading
| in packets from the target.
*/
static void
isc_in(void *vp)
{
isc_session_t *sp = (isc_session_t *)vp;
struct socket *so = sp->soc;
int error;
debug_called(8);
sp->flags |= ISC_CON_RUNNING;
error = 0;
while((sp->flags & (ISC_CON_RUN | ISC_LINK_UP)) == (ISC_CON_RUN | ISC_LINK_UP)) {
// XXX: hunting ...
if(sp->soc == NULL || !(so->so_state & SS_ISCONNECTED)) {
debug(2, "sp->soc=%p", sp->soc);
break;
}
error = so_input(sp);
if(error == 0) {
mtx_lock(&sp->io_mtx);
if(sp->flags & ISC_OWAITING) {
wakeup(&sp->flags);
}
mtx_unlock(&sp->io_mtx);
} else if(error == EPIPE) {
break;
}
else if(error == EAGAIN) {
if(so->so_state & SS_ISCONNECTED)
// there seems to be a problem in 6.0 ...
tsleep(sp, PRIBIO, "isc_soc", 2*hz);
}
}
sdebug(2, "terminated, flags=%x so_count=%d so_state=%x error=%d proc=%p",
sp->flags, so->so_count, so->so_state, error, sp->proc);
if((sp->proc != NULL) && sp->signal) {
PROC_LOCK(sp->proc);
- psignal(sp->proc, sp->signal);
+ kern_psignal(sp->proc, sp->signal);
PROC_UNLOCK(sp->proc);
sp->flags |= ISC_SIGNALED;
sdebug(2, "pid=%d signaled(%d)", sp->proc->p_pid, sp->signal);
}
else {
// we have to do something ourselves
// like closing this session ...
}
/*
| we've been terminated
*/
// do we need this mutex ...?
mtx_lock(&sp->io_mtx);
sp->flags &= ~(ISC_CON_RUNNING | ISC_LINK_UP);
wakeup(&sp->soc);
mtx_unlock(&sp->io_mtx);
sdebug(2, "dropped ISC_CON_RUNNING");
#if __FreeBSD_version >= 800000
kproc_exit(0);
#else
kthread_exit(0);
#endif
}
void
isc_stop_receiver(isc_session_t *sp)
{
int n;
debug_called(8);
sdebug(3, "sp=%p sp->soc=%p", sp, sp? sp->soc: 0);
mtx_lock(&sp->io_mtx);
sp->flags &= ~ISC_LINK_UP;
msleep(&sp->soc, &sp->io_mtx, PRIBIO|PDROP, "isc_stpc", 5*hz);
soshutdown(sp->soc, SHUT_RD);
mtx_lock(&sp->io_mtx);
sdebug(3, "soshutdown");
sp->flags &= ~ISC_CON_RUN;
n = 2;
while(n-- && (sp->flags & ISC_CON_RUNNING)) {
sdebug(3, "waiting n=%d... flags=%x", n, sp->flags);
msleep(&sp->soc, &sp->io_mtx, PRIBIO, "isc_stpc", 5*hz);
}
mtx_unlock(&sp->io_mtx);
if(sp->fp != NULL)
fdrop(sp->fp, sp->td);
fputsock(sp->soc);
sp->soc = NULL;
sp->fp = NULL;
sdebug(3, "done");
}
void
isc_start_receiver(isc_session_t *sp)
{
debug_called(8);
sp->flags |= ISC_CON_RUN | ISC_LINK_UP;
#if __FreeBSD_version >= 800000
kproc_create
#else
kthread_create
#endif
(isc_in, sp, &sp->soc_proc, 0, 0, "isc_in %d", sp->sid);
}
Index: head/sys/dev/mfi/mfi.c
===================================================================
--- head/sys/dev/mfi/mfi.c (revision 225616)
+++ head/sys/dev/mfi/mfi.c (revision 225617)
@@ -1,2549 +1,2549 @@
/*-
* Copyright (c) 2006 IronPort Systems
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 2007 LSI Corp.
* Copyright (c) 2007 Rajesh Prabhakaran.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mfi.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/eventhandler.h>
#include <sys/rman.h>
#include <sys/bus_dma.h>
#include <sys/bio.h>
#include <sys/ioccom.h>
#include <sys/uio.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <machine/bus.h>
#include <machine/resource.h>
#include <dev/mfi/mfireg.h>
#include <dev/mfi/mfi_ioctl.h>
#include <dev/mfi/mfivar.h>
static int mfi_alloc_commands(struct mfi_softc *);
static int mfi_comms_init(struct mfi_softc *);
static int mfi_wait_command(struct mfi_softc *, struct mfi_command *);
static int mfi_get_controller_info(struct mfi_softc *);
static int mfi_get_log_state(struct mfi_softc *,
struct mfi_evt_log_state **);
static int mfi_parse_entries(struct mfi_softc *, int, int);
static int mfi_dcmd_command(struct mfi_softc *, struct mfi_command **,
uint32_t, void **, size_t);
static void mfi_data_cb(void *, bus_dma_segment_t *, int, int);
static void mfi_startup(void *arg);
static void mfi_intr(void *arg);
static void mfi_ldprobe(struct mfi_softc *sc);
static int mfi_aen_register(struct mfi_softc *sc, int seq, int locale);
static void mfi_aen_complete(struct mfi_command *);
static int mfi_aen_setup(struct mfi_softc *, uint32_t);
static int mfi_add_ld(struct mfi_softc *sc, int);
static void mfi_add_ld_complete(struct mfi_command *);
static struct mfi_command * mfi_bio_command(struct mfi_softc *);
static void mfi_bio_complete(struct mfi_command *);
static int mfi_mapcmd(struct mfi_softc *, struct mfi_command *);
static int mfi_send_frame(struct mfi_softc *, struct mfi_command *);
static void mfi_complete(struct mfi_softc *, struct mfi_command *);
static int mfi_abort(struct mfi_softc *, struct mfi_command *);
static int mfi_linux_ioctl_int(struct cdev *, u_long, caddr_t, int, struct thread *);
static void mfi_timeout(void *);
static int mfi_user_command(struct mfi_softc *,
struct mfi_ioc_passthru *);
static void mfi_enable_intr_xscale(struct mfi_softc *sc);
static void mfi_enable_intr_ppc(struct mfi_softc *sc);
static int32_t mfi_read_fw_status_xscale(struct mfi_softc *sc);
static int32_t mfi_read_fw_status_ppc(struct mfi_softc *sc);
static int mfi_check_clear_intr_xscale(struct mfi_softc *sc);
static int mfi_check_clear_intr_ppc(struct mfi_softc *sc);
static void mfi_issue_cmd_xscale(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt);
static void mfi_issue_cmd_ppc(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt);
SYSCTL_NODE(_hw, OID_AUTO, mfi, CTLFLAG_RD, 0, "MFI driver parameters");
static int mfi_event_locale = MFI_EVT_LOCALE_ALL;
TUNABLE_INT("hw.mfi.event_locale", &mfi_event_locale);
SYSCTL_INT(_hw_mfi, OID_AUTO, event_locale, CTLFLAG_RW, &mfi_event_locale,
0, "event message locale");
static int mfi_event_class = MFI_EVT_CLASS_INFO;
TUNABLE_INT("hw.mfi.event_class", &mfi_event_class);
SYSCTL_INT(_hw_mfi, OID_AUTO, event_class, CTLFLAG_RW, &mfi_event_class,
0, "event message class");
static int mfi_max_cmds = 128;
TUNABLE_INT("hw.mfi.max_cmds", &mfi_max_cmds);
SYSCTL_INT(_hw_mfi, OID_AUTO, max_cmds, CTLFLAG_RD, &mfi_max_cmds,
0, "Max commands");
/* Management interface */
static d_open_t mfi_open;
static d_close_t mfi_close;
static d_ioctl_t mfi_ioctl;
static d_poll_t mfi_poll;
static struct cdevsw mfi_cdevsw = {
.d_version = D_VERSION,
.d_flags = 0,
.d_open = mfi_open,
.d_close = mfi_close,
.d_ioctl = mfi_ioctl,
.d_poll = mfi_poll,
.d_name = "mfi",
};
MALLOC_DEFINE(M_MFIBUF, "mfibuf", "Buffers for the MFI driver");
#define MFI_INQ_LENGTH SHORT_INQUIRY_LENGTH
static void
mfi_enable_intr_xscale(struct mfi_softc *sc)
{
MFI_WRITE4(sc, MFI_OMSK, 0x01);
}
static void
mfi_enable_intr_ppc(struct mfi_softc *sc)
{
MFI_WRITE4(sc, MFI_ODCR0, 0xFFFFFFFF);
if (sc->mfi_flags & MFI_FLAGS_1078) {
MFI_WRITE4(sc, MFI_OMSK, ~MFI_1078_EIM);
} else if (sc->mfi_flags & MFI_FLAGS_GEN2) {
MFI_WRITE4(sc, MFI_OMSK, ~MFI_GEN2_EIM);
}
}
static int32_t
mfi_read_fw_status_xscale(struct mfi_softc *sc)
{
return MFI_READ4(sc, MFI_OMSG0);
}
static int32_t
mfi_read_fw_status_ppc(struct mfi_softc *sc)
{
return MFI_READ4(sc, MFI_OSP0);
}
static int
mfi_check_clear_intr_xscale(struct mfi_softc *sc)
{
int32_t status;
status = MFI_READ4(sc, MFI_OSTS);
if ((status & MFI_OSTS_INTR_VALID) == 0)
return 1;
MFI_WRITE4(sc, MFI_OSTS, status);
return 0;
}
static int
mfi_check_clear_intr_ppc(struct mfi_softc *sc)
{
int32_t status;
status = MFI_READ4(sc, MFI_OSTS);
if (sc->mfi_flags & MFI_FLAGS_1078) {
if (!(status & MFI_1078_RM)) {
return 1;
}
} else if (sc->mfi_flags & MFI_FLAGS_GEN2) {
if (!(status & MFI_GEN2_RM)) {
return 1;
}
}
MFI_WRITE4(sc, MFI_ODCR0, status);
return 0;
}
static void
mfi_issue_cmd_xscale(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt)
{
MFI_WRITE4(sc, MFI_IQP,(bus_add >>3)|frame_cnt);
}
static void
mfi_issue_cmd_ppc(struct mfi_softc *sc,uint32_t bus_add,uint32_t frame_cnt)
{
MFI_WRITE4(sc, MFI_IQP, (bus_add |frame_cnt <<1)|1 );
}
static int
mfi_transition_firmware(struct mfi_softc *sc)
{
uint32_t fw_state, cur_state;
int max_wait, i;
fw_state = sc->mfi_read_fw_status(sc)& MFI_FWSTATE_MASK;
while (fw_state != MFI_FWSTATE_READY) {
if (bootverbose)
device_printf(sc->mfi_dev, "Waiting for firmware to "
"become ready\n");
cur_state = fw_state;
switch (fw_state) {
case MFI_FWSTATE_FAULT:
device_printf(sc->mfi_dev, "Firmware fault\n");
return (ENXIO);
case MFI_FWSTATE_WAIT_HANDSHAKE:
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_CLEAR_HANDSHAKE);
max_wait = 2;
break;
case MFI_FWSTATE_OPERATIONAL:
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_READY);
max_wait = 10;
break;
case MFI_FWSTATE_UNDEFINED:
case MFI_FWSTATE_BB_INIT:
max_wait = 2;
break;
case MFI_FWSTATE_FW_INIT:
case MFI_FWSTATE_DEVICE_SCAN:
case MFI_FWSTATE_FLUSH_CACHE:
max_wait = 20;
break;
case MFI_FWSTATE_BOOT_MESSAGE_PENDING:
MFI_WRITE4(sc, MFI_IDB, MFI_FWINIT_HOTPLUG);
max_wait = 10;
break;
default:
device_printf(sc->mfi_dev,"Unknown firmware state %#x\n",
fw_state);
return (ENXIO);
}
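/*
 * Poll the firmware state roughly every 100ms (DELAY(100000) below),
 * so max_wait is effectively expressed in seconds: max_wait * 10
 * iterations of 100ms each.
 */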
for (i = 0; i < (max_wait * 10); i++) {
fw_state = sc->mfi_read_fw_status(sc) & MFI_FWSTATE_MASK;
if (fw_state == cur_state)
DELAY(100000);
else
break;
}
if (fw_state == cur_state) {
device_printf(sc->mfi_dev, "Firmware stuck in state "
"%#x\n", fw_state);
return (ENXIO);
}
}
return (0);
}
static void
mfi_addr32_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
uint32_t *addr;
addr = arg;
*addr = segs[0].ds_addr;
}
int
mfi_attach(struct mfi_softc *sc)
{
uint32_t status;
int error, commsz, framessz, sensesz;
int frames, unit, max_fw_sge;
device_printf(sc->mfi_dev, "Megaraid SAS driver Ver 3.00 \n");
mtx_init(&sc->mfi_io_lock, "MFI I/O lock", NULL, MTX_DEF);
sx_init(&sc->mfi_config_lock, "MFI config");
TAILQ_INIT(&sc->mfi_ld_tqh);
TAILQ_INIT(&sc->mfi_aen_pids);
TAILQ_INIT(&sc->mfi_cam_ccbq);
mfi_initq_free(sc);
mfi_initq_ready(sc);
mfi_initq_busy(sc);
mfi_initq_bio(sc);
if (sc->mfi_flags & MFI_FLAGS_1064R) {
sc->mfi_enable_intr = mfi_enable_intr_xscale;
sc->mfi_read_fw_status = mfi_read_fw_status_xscale;
sc->mfi_check_clear_intr = mfi_check_clear_intr_xscale;
sc->mfi_issue_cmd = mfi_issue_cmd_xscale;
}
else {
sc->mfi_enable_intr = mfi_enable_intr_ppc;
sc->mfi_read_fw_status = mfi_read_fw_status_ppc;
sc->mfi_check_clear_intr = mfi_check_clear_intr_ppc;
sc->mfi_issue_cmd = mfi_issue_cmd_ppc;
}
/* Before we get too far, see if the firmware is working */
if ((error = mfi_transition_firmware(sc)) != 0) {
device_printf(sc->mfi_dev, "Firmware not in READY state, "
"error %d\n", error);
return (ENXIO);
}
/*
* Get information needed for sizing the contiguous memory for the
* frame pool. Size down the sgl parameter since we know that
* we will never need more than what's required for MAXPHYS.
* It would be nice if these constants were available at runtime
* instead of compile time.
*/
status = sc->mfi_read_fw_status(sc);
sc->mfi_max_fw_cmds = status & MFI_FWSTATE_MAXCMD_MASK;
max_fw_sge = (status & MFI_FWSTATE_MAXSGL_MASK) >> 16;
sc->mfi_max_sge = min(max_fw_sge, ((MFI_MAXPHYS / PAGE_SIZE) + 1));
/*
* Create the dma tag for data buffers. Used both for block I/O
* and for various internal data queries.
*/
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
BUS_SPACE_MAXSIZE_32BIT,/* maxsize */
sc->mfi_max_sge, /* nsegments */
BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */
BUS_DMA_ALLOCNOW, /* flags */
busdma_lock_mutex, /* lockfunc */
&sc->mfi_io_lock, /* lockfuncarg */
&sc->mfi_buffer_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate buffer DMA tag\n");
return (ENOMEM);
}
/*
* Allocate DMA memory for the comms queues. Keep it under 4GB for
* efficiency. The mfi_hwcomms struct includes space for 1 reply queue
* entry, so the calculated size here will be 1 more than
* mfi_max_fw_cmds. This is apparently a requirement of the hardware.
*/
commsz = (sizeof(uint32_t) * sc->mfi_max_fw_cmds) +
sizeof(struct mfi_hwcomms);
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
1, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
commsz, /* maxsize */
1, /* nsegments */
commsz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_comms_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate comms DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_comms_dmat, (void **)&sc->mfi_comms,
BUS_DMA_NOWAIT, &sc->mfi_comms_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate comms memory\n");
return (ENOMEM);
}
bzero(sc->mfi_comms, commsz);
bus_dmamap_load(sc->mfi_comms_dmat, sc->mfi_comms_dmamap,
sc->mfi_comms, commsz, mfi_addr32_cb, &sc->mfi_comms_busaddr, 0);
/*
* Allocate DMA memory for the command frames. Keep them in the
* lower 4GB for efficiency. Calculate the size of the commands at
* the same time; each command is one 64 byte frame plus a set of
* additional frames for holding sg lists or other data.
* The assumption here is that the SG list will start at the second
* frame and not use the unused bytes in the first frame. While this
* isn't technically correct, it simplifies the calculation and allows
* for command frames that might be larger than an mfi_io_frame.
*/
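/*
 * Worked example with hypothetical numbers: if mfi_sge_size were 8
 * (32-bit SG entries) and mfi_max_sge were 17, then
 * frames = (8 * 17 - 1) / 64 + 2 = 4, so mfi_cmd_size would be 256
 * bytes per command: one 64-byte frame for the header plus three more
 * for the scatter/gather list.
 */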
if (sizeof(bus_addr_t) == 8) {
sc->mfi_sge_size = sizeof(struct mfi_sg64);
sc->mfi_flags |= MFI_FLAGS_SG64;
} else {
sc->mfi_sge_size = sizeof(struct mfi_sg32);
}
frames = (sc->mfi_sge_size * sc->mfi_max_sge - 1) / MFI_FRAME_SIZE + 2;
sc->mfi_cmd_size = frames * MFI_FRAME_SIZE;
framessz = sc->mfi_cmd_size * sc->mfi_max_fw_cmds;
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
64, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
framessz, /* maxsize */
1, /* nsegments */
framessz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_frames_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate frame DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_frames_dmat, (void **)&sc->mfi_frames,
BUS_DMA_NOWAIT, &sc->mfi_frames_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate frames memory\n");
return (ENOMEM);
}
bzero(sc->mfi_frames, framessz);
bus_dmamap_load(sc->mfi_frames_dmat, sc->mfi_frames_dmamap,
sc->mfi_frames, framessz, mfi_addr32_cb, &sc->mfi_frames_busaddr,0);
/*
* Allocate DMA memory for the frame sense data. Keep them in the
* lower 4GB for efficiency
*/
sensesz = sc->mfi_max_fw_cmds * MFI_SENSE_LEN;
if (bus_dma_tag_create( sc->mfi_parent_dmat, /* parent */
4, 0, /* algnmnt, boundary */
BUS_SPACE_MAXADDR_32BIT,/* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
sensesz, /* maxsize */
1, /* nsegments */
sensesz, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockarg */
&sc->mfi_sense_dmat)) {
device_printf(sc->mfi_dev, "Cannot allocate sense DMA tag\n");
return (ENOMEM);
}
if (bus_dmamem_alloc(sc->mfi_sense_dmat, (void **)&sc->mfi_sense,
BUS_DMA_NOWAIT, &sc->mfi_sense_dmamap)) {
device_printf(sc->mfi_dev, "Cannot allocate sense memory\n");
return (ENOMEM);
}
bus_dmamap_load(sc->mfi_sense_dmat, sc->mfi_sense_dmamap,
sc->mfi_sense, sensesz, mfi_addr32_cb, &sc->mfi_sense_busaddr, 0);
if ((error = mfi_alloc_commands(sc)) != 0)
return (error);
if ((error = mfi_comms_init(sc)) != 0)
return (error);
if ((error = mfi_get_controller_info(sc)) != 0)
return (error);
mtx_lock(&sc->mfi_io_lock);
if ((error = mfi_aen_setup(sc, 0)) != 0) {
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
mtx_unlock(&sc->mfi_io_lock);
/*
* Set up the interrupt handler. XXX This should happen in
* mfi_pci.c
*/
sc->mfi_irq_rid = 0;
if ((sc->mfi_irq = bus_alloc_resource_any(sc->mfi_dev, SYS_RES_IRQ,
&sc->mfi_irq_rid, RF_SHAREABLE | RF_ACTIVE)) == NULL) {
device_printf(sc->mfi_dev, "Cannot allocate interrupt\n");
return (EINVAL);
}
if (bus_setup_intr(sc->mfi_dev, sc->mfi_irq, INTR_MPSAFE|INTR_TYPE_BIO,
NULL, mfi_intr, sc, &sc->mfi_intr)) {
device_printf(sc->mfi_dev, "Cannot set up interrupt\n");
return (EINVAL);
}
/* Register a config hook to probe the bus for arrays */
sc->mfi_ich.ich_func = mfi_startup;
sc->mfi_ich.ich_arg = sc;
if (config_intrhook_establish(&sc->mfi_ich) != 0) {
device_printf(sc->mfi_dev, "Cannot establish configuration "
"hook\n");
return (EINVAL);
}
/*
* Register a shutdown handler.
*/
if ((sc->mfi_eh = EVENTHANDLER_REGISTER(shutdown_final, mfi_shutdown,
sc, SHUTDOWN_PRI_DEFAULT)) == NULL) {
device_printf(sc->mfi_dev, "Warning: shutdown event "
"registration failed\n");
}
/*
* Create the control device for doing management
*/
unit = device_get_unit(sc->mfi_dev);
sc->mfi_cdev = make_dev(&mfi_cdevsw, unit, UID_ROOT, GID_OPERATOR,
0640, "mfi%d", unit);
if (unit == 0)
make_dev_alias(sc->mfi_cdev, "megaraid_sas_ioctl_node");
if (sc->mfi_cdev != NULL)
sc->mfi_cdev->si_drv1 = sc;
SYSCTL_ADD_INT(device_get_sysctl_ctx(sc->mfi_dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->mfi_dev)),
OID_AUTO, "delete_busy_volumes", CTLFLAG_RW,
&sc->mfi_delete_busy_volumes, 0, "Allow removal of busy volumes");
SYSCTL_ADD_INT(device_get_sysctl_ctx(sc->mfi_dev),
SYSCTL_CHILDREN(device_get_sysctl_tree(sc->mfi_dev)),
OID_AUTO, "keep_deleted_volumes", CTLFLAG_RW,
&sc->mfi_keep_deleted_volumes, 0,
"Don't detach the mfid device for a busy volume that is deleted");
device_add_child(sc->mfi_dev, "mfip", -1);
bus_generic_attach(sc->mfi_dev);
/* Start the timeout watchdog */
callout_init(&sc->mfi_watchdog_callout, CALLOUT_MPSAFE);
callout_reset(&sc->mfi_watchdog_callout, MFI_CMD_TIMEOUT * hz,
mfi_timeout, sc);
return (0);
}
static int
mfi_alloc_commands(struct mfi_softc *sc)
{
struct mfi_command *cm;
int i, ncmds;
/*
* XXX Should we allocate all the commands up front, or allocate on
* demand later like 'aac' does?
*/
ncmds = MIN(mfi_max_cmds, sc->mfi_max_fw_cmds);
if (bootverbose)
device_printf(sc->mfi_dev, "Max fw cmds= %d, sizing driver "
"pool to %d\n", sc->mfi_max_fw_cmds, ncmds);
sc->mfi_commands = malloc(sizeof(struct mfi_command) * ncmds, M_MFIBUF,
M_WAITOK | M_ZERO);
for (i = 0; i < ncmds; i++) {
cm = &sc->mfi_commands[i];
cm->cm_frame = (union mfi_frame *)((uintptr_t)sc->mfi_frames +
sc->mfi_cmd_size * i);
cm->cm_frame_busaddr = sc->mfi_frames_busaddr +
sc->mfi_cmd_size * i;
cm->cm_frame->header.context = i;
cm->cm_sense = &sc->mfi_sense[i];
cm->cm_sense_busaddr= sc->mfi_sense_busaddr + MFI_SENSE_LEN * i;
cm->cm_sc = sc;
cm->cm_index = i;
if (bus_dmamap_create(sc->mfi_buffer_dmat, 0,
&cm->cm_dmamap) == 0)
mfi_release_command(cm);
else
break;
sc->mfi_total_cmds++;
}
return (0);
}
void
mfi_release_command(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
uint32_t *hdr_data;
/*
* Zero out the important fields of the frame, but make sure the
* context field is preserved. For efficiency, handle the fields
* as 32 bit words. Clear out the first S/G entry too for safety.
*/
hdr = &cm->cm_frame->header;
if (cm->cm_data != NULL && hdr->sg_count) {
cm->cm_sg->sg32[0].len = 0;
cm->cm_sg->sg32[0].addr = 0;
}
hdr_data = (uint32_t *)cm->cm_frame;
hdr_data[0] = 0; /* cmd, sense_len, cmd_status, scsi_status */
hdr_data[1] = 0; /* target_id, lun_id, cdb_len, sg_count */
hdr_data[4] = 0; /* flags, timeout */
hdr_data[5] = 0; /* data_len */
cm->cm_extra_frames = 0;
cm->cm_flags = 0;
cm->cm_complete = NULL;
cm->cm_private = NULL;
cm->cm_data = NULL;
cm->cm_sg = 0;
cm->cm_total_frame_size = 0;
mfi_enqueue_free(cm);
}
static int
mfi_dcmd_command(struct mfi_softc *sc, struct mfi_command **cmp, uint32_t opcode,
void **bufp, size_t bufsize)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
void *buf = NULL;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
cm = mfi_dequeue_free(sc);
if (cm == NULL)
return (EBUSY);
if ((bufsize > 0) && (bufp != NULL)) {
if (*bufp == NULL) {
buf = malloc(bufsize, M_MFIBUF, M_NOWAIT|M_ZERO);
if (buf == NULL) {
mfi_release_command(cm);
return (ENOMEM);
}
*bufp = buf;
} else {
buf = *bufp;
}
}
dcmd = &cm->cm_frame->dcmd;
bzero(dcmd->mbox, MFI_MBOX_SIZE);
dcmd->header.cmd = MFI_CMD_DCMD;
dcmd->header.timeout = 0;
dcmd->header.flags = 0;
dcmd->header.data_len = bufsize;
dcmd->opcode = opcode;
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_flags = 0;
cm->cm_data = buf;
cm->cm_private = buf;
cm->cm_len = bufsize;
*cmp = cm;
if ((bufp != NULL) && (*bufp == NULL) && (buf != NULL))
*bufp = buf;
return (0);
}
static int
mfi_comms_init(struct mfi_softc *sc)
{
struct mfi_command *cm;
struct mfi_init_frame *init;
struct mfi_init_qinfo *qinfo;
int error;
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (EBUSY);
/*
* Abuse the SG list area of the frame to hold the init_qinfo
* object;
*/
init = &cm->cm_frame->init;
qinfo = (struct mfi_init_qinfo *)((uintptr_t)init + MFI_FRAME_SIZE);
bzero(qinfo, sizeof(struct mfi_init_qinfo));
qinfo->rq_entries = sc->mfi_max_fw_cmds + 1;
qinfo->rq_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_reply_q);
qinfo->pi_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_pi);
qinfo->ci_addr_lo = sc->mfi_comms_busaddr +
offsetof(struct mfi_hwcomms, hw_ci);
init->header.cmd = MFI_CMD_INIT;
init->header.data_len = sizeof(struct mfi_init_qinfo);
init->qinfo_new_addr_lo = cm->cm_frame_busaddr + MFI_FRAME_SIZE;
cm->cm_data = NULL;
cm->cm_flags = MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "failed to send init command\n");
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
static int
mfi_get_controller_info(struct mfi_softc *sc)
{
struct mfi_command *cm = NULL;
struct mfi_ctrl_info *ci = NULL;
uint32_t max_sectors_1, max_sectors_2;
int error;
mtx_lock(&sc->mfi_io_lock);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_GETINFO,
(void **)&ci, sizeof(*ci));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to get controller info\n");
sc->mfi_max_io = (sc->mfi_max_sge - 1) * PAGE_SIZE /
MFI_SECTOR_LEN;
error = 0;
goto out;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
max_sectors_1 = (1 << ci->stripe_sz_ops.min) * ci->max_strips_per_io;
max_sectors_2 = ci->max_request_size;
sc->mfi_max_io = min(max_sectors_1, max_sectors_2);
out:
if (ci)
free(ci, M_MFIBUF);
if (cm)
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static int
mfi_get_log_state(struct mfi_softc *sc, struct mfi_evt_log_state **log_state)
{
struct mfi_command *cm = NULL;
int error;
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_GETINFO,
(void **)log_state, sizeof(**log_state));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to get log state\n");
goto out;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
out:
if (cm)
mfi_release_command(cm);
return (error);
}
static int
mfi_aen_setup(struct mfi_softc *sc, uint32_t seq_start)
{
struct mfi_evt_log_state *log_state = NULL;
union mfi_evt class_locale;
int error = 0;
uint32_t seq;
class_locale.members.reserved = 0;
class_locale.members.locale = mfi_event_locale;
class_locale.members.evt_class = mfi_event_class;
if (seq_start == 0) {
error = mfi_get_log_state(sc, &log_state);
if (error) {
if (log_state)
free(log_state, M_MFIBUF);
return (error);
}
/*
* Walk through any events that fired since the last
* shutdown.
*/
mfi_parse_entries(sc, log_state->shutdown_seq_num,
log_state->newest_seq_num);
seq = log_state->newest_seq_num;
} else
seq = seq_start;
mfi_aen_register(sc, seq, class_locale.word);
free(log_state, M_MFIBUF);
return 0;
}
static int
mfi_wait_command(struct mfi_softc *sc, struct mfi_command *cm)
{
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
cm->cm_complete = NULL;
/*
* MegaCli can issue a DCMD of 0. In this case do nothing
* and return 0 to it as status
*/
if (cm->cm_frame->dcmd.opcode == 0) {
cm->cm_frame->header.cmd_status = MFI_STAT_OK;
cm->cm_error = 0;
return (cm->cm_error);
}
mfi_enqueue_ready(cm);
mfi_startio(sc);
if ((cm->cm_flags & MFI_CMD_COMPLETED) == 0)
msleep(cm, &sc->mfi_io_lock, PRIBIO, "mfiwait", 0);
return (cm->cm_error);
}
void
mfi_free(struct mfi_softc *sc)
{
struct mfi_command *cm;
int i;
callout_drain(&sc->mfi_watchdog_callout);
if (sc->mfi_cdev != NULL)
destroy_dev(sc->mfi_cdev);
if (sc->mfi_total_cmds != 0) {
for (i = 0; i < sc->mfi_total_cmds; i++) {
cm = &sc->mfi_commands[i];
bus_dmamap_destroy(sc->mfi_buffer_dmat, cm->cm_dmamap);
}
free(sc->mfi_commands, M_MFIBUF);
}
if (sc->mfi_intr)
bus_teardown_intr(sc->mfi_dev, sc->mfi_irq, sc->mfi_intr);
if (sc->mfi_irq != NULL)
bus_release_resource(sc->mfi_dev, SYS_RES_IRQ, sc->mfi_irq_rid,
sc->mfi_irq);
if (sc->mfi_sense_busaddr != 0)
bus_dmamap_unload(sc->mfi_sense_dmat, sc->mfi_sense_dmamap);
if (sc->mfi_sense != NULL)
bus_dmamem_free(sc->mfi_sense_dmat, sc->mfi_sense,
sc->mfi_sense_dmamap);
if (sc->mfi_sense_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_sense_dmat);
if (sc->mfi_frames_busaddr != 0)
bus_dmamap_unload(sc->mfi_frames_dmat, sc->mfi_frames_dmamap);
if (sc->mfi_frames != NULL)
bus_dmamem_free(sc->mfi_frames_dmat, sc->mfi_frames,
sc->mfi_frames_dmamap);
if (sc->mfi_frames_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_frames_dmat);
if (sc->mfi_comms_busaddr != 0)
bus_dmamap_unload(sc->mfi_comms_dmat, sc->mfi_comms_dmamap);
if (sc->mfi_comms != NULL)
bus_dmamem_free(sc->mfi_comms_dmat, sc->mfi_comms,
sc->mfi_comms_dmamap);
if (sc->mfi_comms_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_comms_dmat);
if (sc->mfi_buffer_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_buffer_dmat);
if (sc->mfi_parent_dmat != NULL)
bus_dma_tag_destroy(sc->mfi_parent_dmat);
if (mtx_initialized(&sc->mfi_io_lock)) {
mtx_destroy(&sc->mfi_io_lock);
sx_destroy(&sc->mfi_config_lock);
}
return;
}
static void
mfi_startup(void *arg)
{
struct mfi_softc *sc;
sc = (struct mfi_softc *)arg;
config_intrhook_disestablish(&sc->mfi_ich);
sc->mfi_enable_intr(sc);
sx_xlock(&sc->mfi_config_lock);
mtx_lock(&sc->mfi_io_lock);
mfi_ldprobe(sc);
mtx_unlock(&sc->mfi_io_lock);
sx_xunlock(&sc->mfi_config_lock);
}
static void
mfi_intr(void *arg)
{
struct mfi_softc *sc;
struct mfi_command *cm;
uint32_t pi, ci, context;
sc = (struct mfi_softc *)arg;
if (sc->mfi_check_clear_intr(sc))
return;
pi = sc->mfi_comms->hw_pi;
ci = sc->mfi_comms->hw_ci;
mtx_lock(&sc->mfi_io_lock);
while (ci != pi) {
context = sc->mfi_comms->hw_reply_q[ci];
if (context < sc->mfi_max_fw_cmds) {
cm = &sc->mfi_commands[context];
mfi_remove_busy(cm);
cm->cm_error = 0;
mfi_complete(sc, cm);
}
if (++ci == (sc->mfi_max_fw_cmds + 1)) {
ci = 0;
}
}
sc->mfi_comms->hw_ci = ci;
/* Give deferred I/O a chance to run */
if (sc->mfi_flags & MFI_FLAGS_QFRZN)
sc->mfi_flags &= ~MFI_FLAGS_QFRZN;
mfi_startio(sc);
mtx_unlock(&sc->mfi_io_lock);
return;
}
int
mfi_shutdown(struct mfi_softc *sc)
{
struct mfi_dcmd_frame *dcmd;
struct mfi_command *cm;
int error;
mtx_lock(&sc->mfi_io_lock);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_SHUTDOWN, NULL, 0);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
if (sc->mfi_aen_cm != NULL)
mfi_abort(sc, sc->mfi_aen_cm);
dcmd = &cm->cm_frame->dcmd;
dcmd->header.flags = MFI_FRAME_DIR_NONE;
cm->cm_flags = MFI_CMD_POLLED;
cm->cm_data = NULL;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev, "Failed to shutdown controller\n");
}
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static void
mfi_ldprobe(struct mfi_softc *sc)
{
struct mfi_frame_header *hdr;
struct mfi_command *cm = NULL;
struct mfi_ld_list *list = NULL;
struct mfi_disk *ld;
int error, i;
sx_assert(&sc->mfi_config_lock, SA_XLOCKED);
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_LIST,
(void **)&list, sizeof(*list));
if (error)
goto out;
cm->cm_flags = MFI_CMD_DATAIN;
if (mfi_wait_command(sc, cm) != 0) {
device_printf(sc->mfi_dev, "Failed to get device listing\n");
goto out;
}
hdr = &cm->cm_frame->header;
if (hdr->cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev, "MFI_DCMD_LD_GET_LIST failed %x\n",
hdr->cmd_status);
goto out;
}
for (i = 0; i < list->ld_count; i++) {
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == list->ld_list[i].ld.v.target_id)
goto skip_add;
}
mfi_add_ld(sc, list->ld_list[i].ld.v.target_id);
skip_add:;
}
out:
if (list)
free(list, M_MFIBUF);
if (cm)
mfi_release_command(cm);
return;
}
/*
* The timestamp is the number of seconds since 00:00 Jan 1, 2000. If
* bits 24-31 are all set, then it is the number of seconds since
* boot.
*/
static const char *
format_timestamp(uint32_t timestamp)
{
static char buffer[32];
if ((timestamp & 0xff000000) == 0xff000000)
snprintf(buffer, sizeof(buffer), "boot + %us", timestamp &
0x00ffffff);
else
snprintf(buffer, sizeof(buffer), "%us", timestamp);
return (buffer);
}
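/*
 * Illustrative examples: format_timestamp(0xff00003c) yields
 * "boot + 60s", while format_timestamp(3600) yields "3600s".
 */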
static const char *
format_class(int8_t class)
{
static char buffer[6];
switch (class) {
case MFI_EVT_CLASS_DEBUG:
return ("debug");
case MFI_EVT_CLASS_PROGRESS:
return ("progress");
case MFI_EVT_CLASS_INFO:
return ("info");
case MFI_EVT_CLASS_WARNING:
return ("WARN");
case MFI_EVT_CLASS_CRITICAL:
return ("CRIT");
case MFI_EVT_CLASS_FATAL:
return ("FATAL");
case MFI_EVT_CLASS_DEAD:
return ("DEAD");
default:
snprintf(buffer, sizeof(buffer), "%d", class);
return (buffer);
}
}
static void
mfi_decode_evt(struct mfi_softc *sc, struct mfi_evt_detail *detail)
{
device_printf(sc->mfi_dev, "%d (%s/0x%04x/%s) - %s\n", detail->seq,
format_timestamp(detail->time), detail->evt_class.members.locale,
format_class(detail->evt_class.members.evt_class), detail->description);
}
static int
mfi_aen_register(struct mfi_softc *sc, int seq, int locale)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
union mfi_evt current_aen, prior_aen;
struct mfi_evt_detail *ed = NULL;
int error = 0;
current_aen.word = locale;
if (sc->mfi_aen_cm != NULL) {
prior_aen.word =
((uint32_t *)&sc->mfi_aen_cm->cm_frame->dcmd.mbox)[1];
if (prior_aen.members.evt_class <= current_aen.members.evt_class &&
!((prior_aen.members.locale & current_aen.members.locale)
^current_aen.members.locale)) {
return (0);
} else {
prior_aen.members.locale |= current_aen.members.locale;
if (prior_aen.members.evt_class
< current_aen.members.evt_class)
current_aen.members.evt_class =
prior_aen.members.evt_class;
mfi_abort(sc, sc->mfi_aen_cm);
}
}
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_CTRL_EVENT_WAIT,
(void **)&ed, sizeof(*ed));
if (error) {
goto out;
}
dcmd = &cm->cm_frame->dcmd;
((uint32_t *)&dcmd->mbox)[0] = seq;
((uint32_t *)&dcmd->mbox)[1] = locale;
cm->cm_flags = MFI_CMD_DATAIN;
cm->cm_complete = mfi_aen_complete;
sc->mfi_aen_cm = cm;
mfi_enqueue_ready(cm);
mfi_startio(sc);
out:
return (error);
}
static void
mfi_aen_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_softc *sc;
struct mfi_evt_detail *detail;
struct mfi_aen *mfi_aen_entry, *tmp;
int seq = 0, aborted = 0;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
if (sc->mfi_aen_cm == NULL)
return;
if (sc->mfi_aen_cm->cm_aen_abort ||
hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
sc->mfi_aen_cm->cm_aen_abort = 0;
aborted = 1;
} else {
sc->mfi_aen_triggered = 1;
if (sc->mfi_poll_waiting) {
sc->mfi_poll_waiting = 0;
selwakeup(&sc->mfi_select);
}
detail = cm->cm_data;
/*
* XXX If this function is too expensive or is recursive, then
* events should be put onto a queue and processed later.
*/
mfi_decode_evt(sc, detail);
seq = detail->seq + 1;
TAILQ_FOREACH_SAFE(mfi_aen_entry, &sc->mfi_aen_pids, aen_link, tmp) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
PROC_LOCK(mfi_aen_entry->p);
- psignal(mfi_aen_entry->p, SIGIO);
+ kern_psignal(mfi_aen_entry->p, SIGIO);
PROC_UNLOCK(mfi_aen_entry->p);
free(mfi_aen_entry, M_MFIBUF);
}
}
free(cm->cm_data, M_MFIBUF);
sc->mfi_aen_cm = NULL;
wakeup(&sc->mfi_aen_cm);
mfi_release_command(cm);
/* set it up again so the driver can catch more events */
if (!aborted) {
mfi_aen_setup(sc, seq);
}
}
#define MAX_EVENTS 15
static int
mfi_parse_entries(struct mfi_softc *sc, int start_seq, int stop_seq)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
struct mfi_evt_list *el;
union mfi_evt class_locale;
int error, i, seq, size;
class_locale.members.reserved = 0;
class_locale.members.locale = mfi_event_locale;
class_locale.members.evt_class = mfi_event_class;
size = sizeof(struct mfi_evt_list) + sizeof(struct mfi_evt_detail)
* (MAX_EVENTS - 1);
el = malloc(size, M_MFIBUF, M_NOWAIT | M_ZERO);
if (el == NULL)
return (ENOMEM);
for (seq = start_seq;;) {
if ((cm = mfi_dequeue_free(sc)) == NULL) {
free(el, M_MFIBUF);
return (EBUSY);
}
dcmd = &cm->cm_frame->dcmd;
bzero(dcmd->mbox, MFI_MBOX_SIZE);
dcmd->header.cmd = MFI_CMD_DCMD;
dcmd->header.timeout = 0;
dcmd->header.data_len = size;
dcmd->opcode = MFI_DCMD_CTRL_EVENT_GET;
((uint32_t *)&dcmd->mbox)[0] = seq;
((uint32_t *)&dcmd->mbox)[1] = class_locale.word;
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_POLLED;
cm->cm_data = el;
cm->cm_len = size;
if ((error = mfi_mapcmd(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Failed to get controller entries\n");
mfi_release_command(cm);
break;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
if (dcmd->header.cmd_status == MFI_STAT_NOT_FOUND) {
mfi_release_command(cm);
break;
}
if (dcmd->header.cmd_status != MFI_STAT_OK) {
device_printf(sc->mfi_dev,
"Error %d fetching controller entries\n",
dcmd->header.cmd_status);
mfi_release_command(cm);
break;
}
mfi_release_command(cm);
for (i = 0; i < el->count; i++) {
/*
* If this event is newer than 'stop_seq' then
* break out of the loop. Note that the log
* is a circular buffer so we have to handle
* the case that our stop point is earlier in
* the buffer than our start point.
*/
if (el->event[i].seq >= stop_seq) {
if (start_seq <= stop_seq)
break;
else if (el->event[i].seq < start_seq)
break;
}
mfi_decode_evt(sc, &el->event[i]);
}
seq = el->event[el->count - 1].seq + 1;
}
free(el, M_MFIBUF);
return (0);
}
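/*
 * A minimal sketch (hypothetical helper, not driver code) of the stop
 * condition used in the loop above: the event log is circular, so when
 * start_seq > stop_seq the window wraps, and an event at or past
 * stop_seq only ends the scan once its sequence number has also fallen
 * back below start_seq.
 */
static __inline int
mfi_evt_past_stop(uint32_t seq, uint32_t start_seq, uint32_t stop_seq)
{

        if (seq < stop_seq)
                return (0);
        if (start_seq <= stop_seq)      /* window does not wrap */
                return (1);
        return (seq < start_seq);       /* wrapped: >= start_seq is still inside */
}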
static int
mfi_add_ld(struct mfi_softc *sc, int id)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd = NULL;
struct mfi_ld_info *ld_info = NULL;
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = mfi_dcmd_command(sc, &cm, MFI_DCMD_LD_GET_INFO,
(void **)&ld_info, sizeof(*ld_info));
if (error) {
device_printf(sc->mfi_dev,
"Failed to allocate for MFI_DCMD_LD_GET_INFO %d\n", error);
if (ld_info)
free(ld_info, M_MFIBUF);
return (error);
}
cm->cm_flags = MFI_CMD_DATAIN;
dcmd = &cm->cm_frame->dcmd;
dcmd->mbox[0] = id;
if (mfi_wait_command(sc, cm) != 0) {
device_printf(sc->mfi_dev,
"Failed to get logical drive: %d\n", id);
free(ld_info, M_MFIBUF);
return (0);
}
mfi_add_ld_complete(cm);
return (0);
}
static void
mfi_add_ld_complete(struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
struct mfi_ld_info *ld_info;
struct mfi_softc *sc;
device_t child;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
ld_info = cm->cm_private;
if (hdr->cmd_status != MFI_STAT_OK) {
free(ld_info, M_MFIBUF);
mfi_release_command(cm);
return;
}
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
if ((child = device_add_child(sc->mfi_dev, "mfid", -1)) == NULL) {
device_printf(sc->mfi_dev, "Failed to add logical disk\n");
free(ld_info, M_MFIBUF);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
return;
}
device_set_ivars(child, ld_info);
device_set_desc(child, "MFI Logical Disk");
bus_generic_attach(sc->mfi_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
}
static struct mfi_command *
mfi_bio_command(struct mfi_softc *sc)
{
struct mfi_io_frame *io;
struct mfi_command *cm;
struct bio *bio;
int flags, blkcount;
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (NULL);
if ((bio = mfi_dequeue_bio(sc)) == NULL) {
mfi_release_command(cm);
return (NULL);
}
io = &cm->cm_frame->io;
switch (bio->bio_cmd & 0x03) {
case BIO_READ:
io->header.cmd = MFI_CMD_LD_READ;
flags = MFI_CMD_DATAIN;
break;
case BIO_WRITE:
io->header.cmd = MFI_CMD_LD_WRITE;
flags = MFI_CMD_DATAOUT;
break;
default:
panic("Invalid bio command");
}
/* Cheat with the sector length to avoid a non-constant division */
blkcount = (bio->bio_bcount + MFI_SECTOR_LEN - 1) / MFI_SECTOR_LEN;
io->header.target_id = (uintptr_t)bio->bio_driver1;
io->header.timeout = 0;
io->header.flags = 0;
io->header.sense_len = MFI_SENSE_LEN;
io->header.data_len = blkcount;
io->sense_addr_lo = cm->cm_sense_busaddr;
io->sense_addr_hi = 0;
io->lba_hi = (bio->bio_pblkno & 0xffffffff00000000) >> 32;
io->lba_lo = bio->bio_pblkno & 0xffffffff;
cm->cm_complete = mfi_bio_complete;
cm->cm_private = bio;
cm->cm_data = bio->bio_data;
cm->cm_len = bio->bio_bcount;
cm->cm_sg = &io->sgl;
cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
cm->cm_flags = flags;
return (cm);
}
static void
mfi_bio_complete(struct mfi_command *cm)
{
struct bio *bio;
struct mfi_frame_header *hdr;
struct mfi_softc *sc;
bio = cm->cm_private;
hdr = &cm->cm_frame->header;
sc = cm->cm_sc;
if ((hdr->cmd_status != MFI_STAT_OK) || (hdr->scsi_status != 0)) {
bio->bio_flags |= BIO_ERROR;
bio->bio_error = EIO;
device_printf(sc->mfi_dev, "I/O error, status= %d "
"scsi_status= %d\n", hdr->cmd_status, hdr->scsi_status);
mfi_print_sense(cm->cm_sc, cm->cm_sense);
} else if (cm->cm_error != 0) {
bio->bio_flags |= BIO_ERROR;
}
mfi_release_command(cm);
mfi_disk_complete(bio);
}
void
mfi_startio(struct mfi_softc *sc)
{
struct mfi_command *cm;
struct ccb_hdr *ccbh;
for (;;) {
/* Don't bother if we're short on resources */
if (sc->mfi_flags & MFI_FLAGS_QFRZN)
break;
/* Try a command that has already been prepared */
cm = mfi_dequeue_ready(sc);
if (cm == NULL) {
if ((ccbh = TAILQ_FIRST(&sc->mfi_cam_ccbq)) != NULL)
cm = sc->mfi_cam_start(ccbh);
}
/* Nope, so look for work on the bioq */
if (cm == NULL)
cm = mfi_bio_command(sc);
/* No work available, so exit */
if (cm == NULL)
break;
/* Send the command to the controller */
if (mfi_mapcmd(sc, cm) != 0) {
mfi_requeue_ready(cm);
break;
}
}
}
static int
mfi_mapcmd(struct mfi_softc *sc, struct mfi_command *cm)
{
int error, polled;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if (cm->cm_data != NULL) {
polled = (cm->cm_flags & MFI_CMD_POLLED) ? BUS_DMA_NOWAIT : 0;
error = bus_dmamap_load(sc->mfi_buffer_dmat, cm->cm_dmamap,
cm->cm_data, cm->cm_len, mfi_data_cb, cm, polled);
if (error == EINPROGRESS) {
sc->mfi_flags |= MFI_FLAGS_QFRZN;
return (0);
}
} else {
error = mfi_send_frame(sc, cm);
}
return (error);
}
static void
mfi_data_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
struct mfi_frame_header *hdr;
struct mfi_command *cm;
union mfi_sgl *sgl;
struct mfi_softc *sc;
int i, dir;
cm = (struct mfi_command *)arg;
sc = cm->cm_sc;
hdr = &cm->cm_frame->header;
sgl = cm->cm_sg;
if (error) {
printf("error %d in callback\n", error);
cm->cm_error = error;
mfi_complete(sc, cm);
return;
}
if ((sc->mfi_flags & MFI_FLAGS_SG64) == 0) {
for (i = 0; i < nsegs; i++) {
sgl->sg32[i].addr = segs[i].ds_addr;
sgl->sg32[i].len = segs[i].ds_len;
}
} else {
for (i = 0; i < nsegs; i++) {
sgl->sg64[i].addr = segs[i].ds_addr;
sgl->sg64[i].len = segs[i].ds_len;
}
hdr->flags |= MFI_FRAME_SGL64;
}
hdr->sg_count = nsegs;
dir = 0;
if (cm->cm_flags & MFI_CMD_DATAIN) {
dir |= BUS_DMASYNC_PREREAD;
hdr->flags |= MFI_FRAME_DIR_READ;
}
if (cm->cm_flags & MFI_CMD_DATAOUT) {
dir |= BUS_DMASYNC_PREWRITE;
hdr->flags |= MFI_FRAME_DIR_WRITE;
}
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap, dir);
cm->cm_flags |= MFI_CMD_MAPPED;
/*
* Instead of calculating the total number of frames in the
* compound frame, it's already assumed that there will be at
* least 1 frame, so don't compensate for the modulo of the
* following division.
*/
cm->cm_total_frame_size += (sc->mfi_sge_size * nsegs);
cm->cm_extra_frames = (cm->cm_total_frame_size - 1) / MFI_FRAME_SIZE;
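/*
 * Example of the frame math above (illustrative; exact structure sizes
 * may differ): a 40-byte I/O frame plus four 8-byte 32-bit SG entries
 * gives cm_total_frame_size = 72, so cm_extra_frames = (72 - 1) / 64 =
 * 1 extra 64-byte frame beyond the first.
 */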
mfi_send_frame(sc, cm);
return;
}
static int
mfi_send_frame(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_frame_header *hdr;
int tm = MFI_POLL_TIMEOUT_SECS * 1000;
hdr = &cm->cm_frame->header;
if ((cm->cm_flags & MFI_CMD_POLLED) == 0) {
cm->cm_timestamp = time_uptime;
mfi_enqueue_busy(cm);
} else {
hdr->cmd_status = MFI_STAT_INVALID_STATUS;
hdr->flags |= MFI_FRAME_DONT_POST_IN_REPLY_QUEUE;
}
/*
* The bus address of the command is aligned on a 64 byte boundary,
* leaving the least 6 bits as zero. For whatever reason, the
* hardware wants the address shifted right by three, leaving just
* 3 zero bits. These three bits are then used as a prefetching
* hint for the hardware to predict how many frames need to be
* fetched across the bus. If a command has more than 8 frames
* then the 3 bits are set to 0x7 and the firmware uses other
* information in the command to determine the total amount to fetch.
* However, FreeBSD doesn't support I/O larger than 128K, so 8 frames
* is enough for both 32bit and 64bit systems.
*/
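/*
 * Sketch of the encoding described above (illustrative only): a frame
 * at bus address 0x1000 would be handed to the hardware as
 * 0x1000 >> 3 = 0x200, with the low 3 bits carrying
 * min(cm_extra_frames, 7) as the prefetch hint.
 */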
if (cm->cm_extra_frames > 7)
cm->cm_extra_frames = 7;
sc->mfi_issue_cmd(sc,cm->cm_frame_busaddr,cm->cm_extra_frames);
if ((cm->cm_flags & MFI_CMD_POLLED) == 0)
return (0);
/* This is a polled command, so busy-wait for it to complete. */
while (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
DELAY(1000);
tm -= 1;
if (tm <= 0)
break;
}
if (hdr->cmd_status == MFI_STAT_INVALID_STATUS) {
device_printf(sc->mfi_dev, "Frame %p timed out "
"command 0x%X\n", hdr, cm->cm_frame->dcmd.opcode);
return (ETIMEDOUT);
}
return (0);
}
static void
mfi_complete(struct mfi_softc *sc, struct mfi_command *cm)
{
int dir;
if ((cm->cm_flags & MFI_CMD_MAPPED) != 0) {
dir = 0;
if (cm->cm_flags & MFI_CMD_DATAIN)
dir |= BUS_DMASYNC_POSTREAD;
if (cm->cm_flags & MFI_CMD_DATAOUT)
dir |= BUS_DMASYNC_POSTWRITE;
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap, dir);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
cm->cm_flags &= ~MFI_CMD_MAPPED;
}
cm->cm_flags |= MFI_CMD_COMPLETED;
if (cm->cm_complete != NULL)
cm->cm_complete(cm);
else
wakeup(cm);
}
static int
mfi_abort(struct mfi_softc *sc, struct mfi_command *cm_abort)
{
struct mfi_command *cm;
struct mfi_abort_frame *abort;
int i = 0;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
return (EBUSY);
}
abort = &cm->cm_frame->abort;
abort->header.cmd = MFI_CMD_ABORT;
abort->header.flags = 0;
abort->abort_context = cm_abort->cm_frame->header.context;
abort->abort_mfi_addr_lo = cm_abort->cm_frame_busaddr;
abort->abort_mfi_addr_hi = 0;
cm->cm_data = NULL;
cm->cm_flags = MFI_CMD_POLLED;
sc->mfi_aen_cm->cm_aen_abort = 1;
mfi_mapcmd(sc, cm);
mfi_release_command(cm);
while (i < 5 && sc->mfi_aen_cm != NULL) {
msleep(&sc->mfi_aen_cm, &sc->mfi_io_lock, 0, "mfiabort", 5 * hz);
i++;
}
return (0);
}
int
mfi_dump_blocks(struct mfi_softc *sc, int id, uint64_t lba, void *virt, int len)
{
struct mfi_command *cm;
struct mfi_io_frame *io;
int error;
if ((cm = mfi_dequeue_free(sc)) == NULL)
return (EBUSY);
io = &cm->cm_frame->io;
io->header.cmd = MFI_CMD_LD_WRITE;
io->header.target_id = id;
io->header.timeout = 0;
io->header.flags = 0;
io->header.sense_len = MFI_SENSE_LEN;
io->header.data_len = (len + MFI_SECTOR_LEN - 1) / MFI_SECTOR_LEN;
io->sense_addr_lo = cm->cm_sense_busaddr;
io->sense_addr_hi = 0;
io->lba_hi = (lba & 0xffffffff00000000) >> 32;
io->lba_lo = lba & 0xffffffff;
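/*
 * Illustrative split of the 64-bit LBA: for lba = 0x123456789,
 * lba_hi becomes 0x1 and lba_lo becomes 0x23456789.
 */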
cm->cm_data = virt;
cm->cm_len = len;
cm->cm_sg = &io->sgl;
cm->cm_total_frame_size = MFI_IO_FRAME_SIZE;
cm->cm_flags = MFI_CMD_POLLED | MFI_CMD_DATAOUT;
error = mfi_mapcmd(sc, cm);
bus_dmamap_sync(sc->mfi_buffer_dmat, cm->cm_dmamap,
BUS_DMASYNC_POSTWRITE);
bus_dmamap_unload(sc->mfi_buffer_dmat, cm->cm_dmamap);
mfi_release_command(cm);
return (error);
}
static int
mfi_open(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct mfi_softc *sc;
int error;
sc = dev->si_drv1;
mtx_lock(&sc->mfi_io_lock);
if (sc->mfi_detaching)
error = ENXIO;
else {
sc->mfi_flags |= MFI_FLAGS_OPEN;
error = 0;
}
mtx_unlock(&sc->mfi_io_lock);
return (error);
}
static int
mfi_close(struct cdev *dev, int flags, int fmt, struct thread *td)
{
struct mfi_softc *sc;
struct mfi_aen *mfi_aen_entry, *tmp;
sc = dev->si_drv1;
mtx_lock(&sc->mfi_io_lock);
sc->mfi_flags &= ~MFI_FLAGS_OPEN;
TAILQ_FOREACH_SAFE(mfi_aen_entry, &sc->mfi_aen_pids, aen_link, tmp) {
if (mfi_aen_entry->p == curproc) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
free(mfi_aen_entry, M_MFIBUF);
}
}
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
static int
mfi_config_lock(struct mfi_softc *sc, uint32_t opcode)
{
switch (opcode) {
case MFI_DCMD_LD_DELETE:
case MFI_DCMD_CFG_ADD:
case MFI_DCMD_CFG_CLEAR:
sx_xlock(&sc->mfi_config_lock);
return (1);
default:
return (0);
}
}
static void
mfi_config_unlock(struct mfi_softc *sc, int locked)
{
if (locked)
sx_xunlock(&sc->mfi_config_lock);
}
/* Perform pre-issue checks on commands from userland and possibly veto them. */
static int
mfi_check_command_pre(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_disk *ld, *ld2;
int error;
mtx_assert(&sc->mfi_io_lock, MA_OWNED);
error = 0;
switch (cm->cm_frame->dcmd.opcode) {
case MFI_DCMD_LD_DELETE:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == cm->cm_frame->dcmd.mbox[0])
break;
}
if (ld == NULL)
error = ENOENT;
else
error = mfi_disk_disable(ld);
break;
case MFI_DCMD_CFG_CLEAR:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
error = mfi_disk_disable(ld);
if (error)
break;
}
if (error) {
TAILQ_FOREACH(ld2, &sc->mfi_ld_tqh, ld_link) {
if (ld2 == ld)
break;
mfi_disk_enable(ld2);
}
}
break;
default:
break;
}
return (error);
}
/* Perform post-issue checks on commands from userland. */
static void
mfi_check_command_post(struct mfi_softc *sc, struct mfi_command *cm)
{
struct mfi_disk *ld, *ldn;
switch (cm->cm_frame->dcmd.opcode) {
case MFI_DCMD_LD_DELETE:
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == cm->cm_frame->dcmd.mbox[0])
break;
}
KASSERT(ld != NULL, ("volume disappeared"));
if (cm->cm_frame->header.cmd_status == MFI_STAT_OK) {
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
device_delete_child(sc->mfi_dev, ld->ld_dev);
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
} else
mfi_disk_enable(ld);
break;
case MFI_DCMD_CFG_CLEAR:
if (cm->cm_frame->header.cmd_status == MFI_STAT_OK) {
mtx_unlock(&sc->mfi_io_lock);
mtx_lock(&Giant);
TAILQ_FOREACH_SAFE(ld, &sc->mfi_ld_tqh, ld_link, ldn) {
device_delete_child(sc->mfi_dev, ld->ld_dev);
}
mtx_unlock(&Giant);
mtx_lock(&sc->mfi_io_lock);
} else {
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link)
mfi_disk_enable(ld);
}
break;
case MFI_DCMD_CFG_ADD:
mfi_ldprobe(sc);
break;
case MFI_DCMD_CFG_FOREIGN_IMPORT:
mfi_ldprobe(sc);
break;
}
}
static int
mfi_user_command(struct mfi_softc *sc, struct mfi_ioc_passthru *ioc)
{
struct mfi_command *cm;
struct mfi_dcmd_frame *dcmd;
void *ioc_buf = NULL;
uint32_t context;
int error = 0, locked;
if (ioc->buf_size > 0) {
ioc_buf = malloc(ioc->buf_size, M_MFIBUF, M_WAITOK);
if (ioc_buf == NULL) {
return (ENOMEM);
}
error = copyin(ioc->buf, ioc_buf, ioc->buf_size);
if (error) {
device_printf(sc->mfi_dev, "failed to copyin\n");
free(ioc_buf, M_MFIBUF);
return (error);
}
}
locked = mfi_config_lock(sc, ioc->ioc_frame.opcode);
mtx_lock(&sc->mfi_io_lock);
while ((cm = mfi_dequeue_free(sc)) == NULL)
msleep(mfi_user_command, &sc->mfi_io_lock, 0, "mfiioc", hz);
/* Save context for later */
context = cm->cm_frame->header.context;
dcmd = &cm->cm_frame->dcmd;
bcopy(&ioc->ioc_frame, dcmd, sizeof(struct mfi_dcmd_frame));
cm->cm_sg = &dcmd->sgl;
cm->cm_total_frame_size = MFI_DCMD_FRAME_SIZE;
cm->cm_data = ioc_buf;
cm->cm_len = ioc->buf_size;
/* restore context */
cm->cm_frame->header.context = context;
/* Cheat since we don't know if we're writing or reading */
cm->cm_flags = MFI_CMD_DATAIN | MFI_CMD_DATAOUT;
error = mfi_check_command_pre(sc, cm);
if (error)
goto out;
error = mfi_wait_command(sc, cm);
if (error) {
device_printf(sc->mfi_dev, "ioctl failed %d\n", error);
goto out;
}
bcopy(dcmd, &ioc->ioc_frame, sizeof(struct mfi_dcmd_frame));
mfi_check_command_post(sc, cm);
out:
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
mfi_config_unlock(sc, locked);
if (ioc->buf_size > 0)
error = copyout(ioc_buf, ioc->buf, ioc->buf_size);
if (ioc_buf)
free(ioc_buf, M_MFIBUF);
return (error);
}
#ifdef __amd64__
#define PTRIN(p) ((void *)(uintptr_t)(p))
#else
#define PTRIN(p) (p)
#endif
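/*
 * Illustrative use of PTRIN (the address is hypothetical): on amd64 a
 * 32-bit iov_base such as 0x0804f000 from a 32-bit caller is widened
 * to a kernel void * via (void *)(uintptr_t)0x0804f000; on other
 * platforms the pointer is used as-is.
 */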
static int
mfi_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
struct mfi_softc *sc;
union mfi_statrequest *ms;
struct mfi_ioc_packet *ioc;
#ifdef __amd64__
struct mfi_ioc_packet32 *ioc32;
#endif
struct mfi_ioc_aen *aen;
struct mfi_command *cm = NULL;
uint32_t context;
union mfi_sense_ptr sense_ptr;
uint8_t *data = NULL, *temp;
int i;
struct mfi_ioc_passthru *iop = (struct mfi_ioc_passthru *)arg;
#ifdef __amd64__
struct mfi_ioc_passthru32 *iop32 = (struct mfi_ioc_passthru32 *)arg;
struct mfi_ioc_passthru iop_swab;
#endif
int error, locked;
sc = dev->si_drv1;
error = 0;
switch (cmd) {
case MFIIO_STATS:
ms = (union mfi_statrequest *)arg;
switch (ms->ms_item) {
case MFIQ_FREE:
case MFIQ_BIO:
case MFIQ_READY:
case MFIQ_BUSY:
bcopy(&sc->mfi_qstat[ms->ms_item], &ms->ms_qstat,
sizeof(struct mfi_qstat));
break;
default:
error = ENOIOCTL;
break;
}
break;
case MFIIO_QUERY_DISK:
{
struct mfi_query_disk *qd;
struct mfi_disk *ld;
qd = (struct mfi_query_disk *)arg;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(ld, &sc->mfi_ld_tqh, ld_link) {
if (ld->ld_id == qd->array_id)
break;
}
if (ld == NULL) {
qd->present = 0;
mtx_unlock(&sc->mfi_io_lock);
return (0);
}
qd->present = 1;
if (ld->ld_flags & MFI_DISK_FLAGS_OPEN)
qd->open = 1;
bzero(qd->devname, SPECNAMELEN + 1);
snprintf(qd->devname, SPECNAMELEN, "mfid%d", ld->ld_unit);
mtx_unlock(&sc->mfi_io_lock);
break;
}
case MFI_CMD:
#ifdef __amd64__
case MFI_CMD32:
#endif
{
devclass_t devclass;
ioc = (struct mfi_ioc_packet *)arg;
int adapter;
adapter = ioc->mfi_adapter_no;
if (device_get_unit(sc->mfi_dev) == 0 && adapter != 0) {
devclass = devclass_find("mfi");
sc = devclass_get_softc(devclass, adapter);
}
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
mtx_unlock(&sc->mfi_io_lock);
locked = 0;
/*
* save off original context since copying from user
* will clobber some data
*/
context = cm->cm_frame->header.context;
bcopy(ioc->mfi_frame.raw, cm->cm_frame,
2 * MFI_DCMD_FRAME_SIZE); /* this isn't quite right */
cm->cm_total_frame_size = (sizeof(union mfi_sgl)
* ioc->mfi_sge_count) + ioc->mfi_sgl_off;
if (ioc->mfi_sge_count) {
cm->cm_sg =
(union mfi_sgl *)&cm->cm_frame->bytes[ioc->mfi_sgl_off];
}
cm->cm_flags = 0;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAIN)
cm->cm_flags |= MFI_CMD_DATAIN;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAOUT)
cm->cm_flags |= MFI_CMD_DATAOUT;
/* Legacy app shim */
if (cm->cm_flags == 0)
cm->cm_flags |= MFI_CMD_DATAIN | MFI_CMD_DATAOUT;
cm->cm_len = cm->cm_frame->header.data_len;
if (cm->cm_len &&
(cm->cm_flags & (MFI_CMD_DATAIN | MFI_CMD_DATAOUT))) {
cm->cm_data = data = malloc(cm->cm_len, M_MFIBUF,
M_WAITOK | M_ZERO);
if (cm->cm_data == NULL) {
device_printf(sc->mfi_dev, "Malloc failed\n");
goto out;
}
} else {
cm->cm_data = 0;
}
/* restore header context */
cm->cm_frame->header.context = context;
temp = data;
if (cm->cm_flags & MFI_CMD_DATAOUT) {
for (i = 0; i < ioc->mfi_sge_count; i++) {
#ifdef __amd64__
if (cmd == MFI_CMD) {
/* Native */
error = copyin(ioc->mfi_sgl[i].iov_base,
temp,
ioc->mfi_sgl[i].iov_len);
} else {
void *temp_convert;
/* 32bit */
ioc32 = (struct mfi_ioc_packet32 *)ioc;
temp_convert =
PTRIN(ioc32->mfi_sgl[i].iov_base);
error = copyin(temp_convert,
temp,
ioc32->mfi_sgl[i].iov_len);
}
#else
error = copyin(ioc->mfi_sgl[i].iov_base,
temp,
ioc->mfi_sgl[i].iov_len);
#endif
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy in failed\n");
goto out;
}
temp = &temp[ioc->mfi_sgl[i].iov_len];
}
}
if (cm->cm_frame->header.cmd == MFI_CMD_DCMD)
locked = mfi_config_lock(sc, cm->cm_frame->dcmd.opcode);
if (cm->cm_frame->header.cmd == MFI_CMD_PD_SCSI_IO) {
cm->cm_frame->pass.sense_addr_lo = cm->cm_sense_busaddr;
cm->cm_frame->pass.sense_addr_hi = 0;
}
mtx_lock(&sc->mfi_io_lock);
error = mfi_check_command_pre(sc, cm);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
if ((error = mfi_wait_command(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Controller polled failed\n");
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
mfi_check_command_post(sc, cm);
mtx_unlock(&sc->mfi_io_lock);
temp = data;
if (cm->cm_flags & MFI_CMD_DATAIN) {
for (i = 0; i < ioc->mfi_sge_count; i++) {
#ifdef __amd64__
if (cmd == MFI_CMD) {
/* Native */
error = copyout(temp,
ioc->mfi_sgl[i].iov_base,
ioc->mfi_sgl[i].iov_len);
} else {
void *temp_convert;
/* 32bit */
ioc32 = (struct mfi_ioc_packet32 *)ioc;
temp_convert =
PTRIN(ioc32->mfi_sgl[i].iov_base);
error = copyout(temp,
temp_convert,
ioc32->mfi_sgl[i].iov_len);
}
#else
error = copyout(temp,
ioc->mfi_sgl[i].iov_base,
ioc->mfi_sgl[i].iov_len);
#endif
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
temp = &temp[ioc->mfi_sgl[i].iov_len];
}
}
if (ioc->mfi_sense_len) {
/* get user-space sense ptr then copy out sense */
bcopy(&ioc->mfi_frame.raw[ioc->mfi_sense_off],
&sense_ptr.sense_ptr_data[0],
sizeof(sense_ptr.sense_ptr_data));
#ifdef __amd64__
if (cmd != MFI_CMD) {
/*
* not 64bit native so zero out any address
* over 32bit */
sense_ptr.addr.high = 0;
}
#endif
error = copyout(cm->cm_sense, sense_ptr.user_space,
ioc->mfi_sense_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
}
ioc->mfi_frame.hdr.cmd_status = cm->cm_frame->header.cmd_status;
out:
mfi_config_unlock(sc, locked);
if (data)
free(data, M_MFIBUF);
if (cm) {
mtx_lock(&sc->mfi_io_lock);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
}
break;
}
case MFI_SET_AEN:
aen = (struct mfi_ioc_aen *)arg;
error = mfi_aen_register(sc, aen->aen_seq_num,
aen->aen_class_locale);
break;
case MFI_LINUX_CMD_2: /* Firmware Linux ioctl shim */
{
devclass_t devclass;
struct mfi_linux_ioc_packet l_ioc;
int adapter;
devclass = devclass_find("mfi");
if (devclass == NULL)
return (ENOENT);
error = copyin(arg, &l_ioc, sizeof(l_ioc));
if (error)
return (error);
adapter = l_ioc.lioc_adapter_no;
sc = devclass_get_softc(devclass, adapter);
if (sc == NULL)
return (ENOENT);
return (mfi_linux_ioctl_int(sc->mfi_cdev,
cmd, arg, flag, td));
break;
}
case MFI_LINUX_SET_AEN_2: /* AEN Linux ioctl shim */
{
devclass_t devclass;
struct mfi_linux_ioc_aen l_aen;
int adapter;
devclass = devclass_find("mfi");
if (devclass == NULL)
return (ENOENT);
error = copyin(arg, &l_aen, sizeof(l_aen));
if (error)
return (error);
adapter = l_aen.laen_adapter_no;
sc = devclass_get_softc(devclass, adapter);
if (sc == NULL)
return (ENOENT);
return (mfi_linux_ioctl_int(sc->mfi_cdev,
cmd, arg, flag, td));
break;
}
#ifdef __amd64__
case MFIIO_PASSTHRU32:
iop_swab.ioc_frame = iop32->ioc_frame;
iop_swab.buf_size = iop32->buf_size;
iop_swab.buf = PTRIN(iop32->buf);
iop = &iop_swab;
/* FALLTHROUGH */
#endif
case MFIIO_PASSTHRU:
error = mfi_user_command(sc, iop);
#ifdef __amd64__
if (cmd == MFIIO_PASSTHRU32)
iop32->ioc_frame = iop_swab.ioc_frame;
#endif
break;
default:
device_printf(sc->mfi_dev, "IOCTL 0x%lx not handled\n", cmd);
error = ENOENT;
break;
}
return (error);
}
static int
mfi_linux_ioctl_int(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td)
{
struct mfi_softc *sc;
struct mfi_linux_ioc_packet l_ioc;
struct mfi_linux_ioc_aen l_aen;
struct mfi_command *cm = NULL;
struct mfi_aen *mfi_aen_entry;
union mfi_sense_ptr sense_ptr;
uint32_t context;
uint8_t *data = NULL, *temp;
int i;
int error, locked;
sc = dev->si_drv1;
error = 0;
switch (cmd) {
case MFI_LINUX_CMD_2: /* Firmware Linux ioctl shim */
error = copyin(arg, &l_ioc, sizeof(l_ioc));
if (error != 0)
return (error);
if (l_ioc.lioc_sge_count > MAX_LINUX_IOCTL_SGE) {
return (EINVAL);
}
mtx_lock(&sc->mfi_io_lock);
if ((cm = mfi_dequeue_free(sc)) == NULL) {
mtx_unlock(&sc->mfi_io_lock);
return (EBUSY);
}
mtx_unlock(&sc->mfi_io_lock);
locked = 0;
/*
* save off original context since copying from user
* will clobber some data
*/
context = cm->cm_frame->header.context;
bcopy(l_ioc.lioc_frame.raw, cm->cm_frame,
2 * MFI_DCMD_FRAME_SIZE); /* this isn't quite right */
cm->cm_total_frame_size = (sizeof(union mfi_sgl)
* l_ioc.lioc_sge_count) + l_ioc.lioc_sgl_off;
if (l_ioc.lioc_sge_count)
cm->cm_sg =
(union mfi_sgl *)&cm->cm_frame->bytes[l_ioc.lioc_sgl_off];
cm->cm_flags = 0;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAIN)
cm->cm_flags |= MFI_CMD_DATAIN;
if (cm->cm_frame->header.flags & MFI_FRAME_DATAOUT)
cm->cm_flags |= MFI_CMD_DATAOUT;
cm->cm_len = cm->cm_frame->header.data_len;
if (cm->cm_len &&
(cm->cm_flags & (MFI_CMD_DATAIN | MFI_CMD_DATAOUT))) {
cm->cm_data = data = malloc(cm->cm_len, M_MFIBUF,
M_WAITOK | M_ZERO);
if (cm->cm_data == NULL) {
device_printf(sc->mfi_dev, "Malloc failed\n");
goto out;
}
} else {
cm->cm_data = 0;
}
/* restore header context */
cm->cm_frame->header.context = context;
temp = data;
if (cm->cm_flags & MFI_CMD_DATAOUT) {
for (i = 0; i < l_ioc.lioc_sge_count; i++) {
error = copyin(PTRIN(l_ioc.lioc_sgl[i].iov_base),
temp,
l_ioc.lioc_sgl[i].iov_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy in failed\n");
goto out;
}
temp = &temp[l_ioc.lioc_sgl[i].iov_len];
}
}
if (cm->cm_frame->header.cmd == MFI_CMD_DCMD)
locked = mfi_config_lock(sc, cm->cm_frame->dcmd.opcode);
if (cm->cm_frame->header.cmd == MFI_CMD_PD_SCSI_IO) {
cm->cm_frame->pass.sense_addr_lo = cm->cm_sense_busaddr;
cm->cm_frame->pass.sense_addr_hi = 0;
}
mtx_lock(&sc->mfi_io_lock);
error = mfi_check_command_pre(sc, cm);
if (error) {
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
if ((error = mfi_wait_command(sc, cm)) != 0) {
device_printf(sc->mfi_dev,
"Controller polled failed\n");
mtx_unlock(&sc->mfi_io_lock);
goto out;
}
mfi_check_command_post(sc, cm);
mtx_unlock(&sc->mfi_io_lock);
temp = data;
if (cm->cm_flags & MFI_CMD_DATAIN) {
for (i = 0; i < l_ioc.lioc_sge_count; i++) {
error = copyout(temp,
PTRIN(l_ioc.lioc_sgl[i].iov_base),
l_ioc.lioc_sgl[i].iov_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
temp = &temp[l_ioc.lioc_sgl[i].iov_len];
}
}
if (l_ioc.lioc_sense_len) {
/* get user-space sense ptr then copy out sense */
bcopy(&((struct mfi_linux_ioc_packet*)arg)
->lioc_frame.raw[l_ioc.lioc_sense_off],
&sense_ptr.sense_ptr_data[0],
sizeof(sense_ptr.sense_ptr_data));
#ifdef __amd64__
/*
* only 32bit Linux support so zero out any
* address over 32bit
*/
sense_ptr.addr.high = 0;
#endif
error = copyout(cm->cm_sense, sense_ptr.user_space,
l_ioc.lioc_sense_len);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
}
error = copyout(&cm->cm_frame->header.cmd_status,
&((struct mfi_linux_ioc_packet*)arg)
->lioc_frame.hdr.cmd_status,
1);
if (error != 0) {
device_printf(sc->mfi_dev,
"Copy out failed\n");
goto out;
}
out:
mfi_config_unlock(sc, locked);
if (data)
free(data, M_MFIBUF);
if (cm) {
mtx_lock(&sc->mfi_io_lock);
mfi_release_command(cm);
mtx_unlock(&sc->mfi_io_lock);
}
return (error);
case MFI_LINUX_SET_AEN_2: /* AEN Linux ioctl shim */
error = copyin(arg, &l_aen, sizeof(l_aen));
if (error != 0)
return (error);
printf("AEN IMPLEMENTED for pid %d\n", curproc->p_pid);
mfi_aen_entry = malloc(sizeof(struct mfi_aen), M_MFIBUF,
M_WAITOK);
mtx_lock(&sc->mfi_io_lock);
if (mfi_aen_entry != NULL) {
mfi_aen_entry->p = curproc;
TAILQ_INSERT_TAIL(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
}
error = mfi_aen_register(sc, l_aen.laen_seq_num,
l_aen.laen_class_locale);
if (error != 0) {
TAILQ_REMOVE(&sc->mfi_aen_pids, mfi_aen_entry,
aen_link);
free(mfi_aen_entry, M_MFIBUF);
}
mtx_unlock(&sc->mfi_io_lock);
return (error);
default:
device_printf(sc->mfi_dev, "IOCTL 0x%lx not handled\n", cmd);
error = ENOENT;
break;
}
return (error);
}
static int
mfi_poll(struct cdev *dev, int poll_events, struct thread *td)
{
struct mfi_softc *sc;
int revents = 0;
sc = dev->si_drv1;
if (poll_events & (POLLIN | POLLRDNORM)) {
if (sc->mfi_aen_triggered != 0) {
revents |= poll_events & (POLLIN | POLLRDNORM);
sc->mfi_aen_triggered = 0;
}
if (sc->mfi_aen_triggered == 0 && sc->mfi_aen_cm == NULL) {
revents |= POLLERR;
}
}
if (revents == 0) {
if (poll_events & (POLLIN | POLLRDNORM)) {
sc->mfi_poll_waiting = 1;
selrecord(td, &sc->mfi_select);
}
}
return revents;
}
static void
mfi_dump_all(void)
{
struct mfi_softc *sc;
struct mfi_command *cm;
devclass_t dc;
time_t deadline;
int timedout;
int i;
dc = devclass_find("mfi");
if (dc == NULL) {
printf("No mfi dev class\n");
return;
}
for (i = 0; ; i++) {
sc = devclass_get_softc(dc, i);
if (sc == NULL)
break;
device_printf(sc->mfi_dev, "Dumping\n\n");
timedout = 0;
deadline = time_uptime - MFI_CMD_TIMEOUT;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
if (cm->cm_timestamp < deadline) {
device_printf(sc->mfi_dev,
"COMMAND %p TIMEOUT AFTER %d SECONDS\n", cm,
(int)(time_uptime - cm->cm_timestamp));
MFI_PRINT_CMD(cm);
timedout++;
}
}
#if 0
if (timedout)
MFI_DUMP_CMDS(SC);
#endif
mtx_unlock(&sc->mfi_io_lock);
}
return;
}
static void
mfi_timeout(void *data)
{
struct mfi_softc *sc = (struct mfi_softc *)data;
struct mfi_command *cm;
time_t deadline;
int timedout = 0;
deadline = time_uptime - MFI_CMD_TIMEOUT;
mtx_lock(&sc->mfi_io_lock);
TAILQ_FOREACH(cm, &sc->mfi_busy, cm_link) {
if (sc->mfi_aen_cm == cm)
continue;
if ((sc->mfi_aen_cm != cm) && (cm->cm_timestamp < deadline)) {
device_printf(sc->mfi_dev,
"COMMAND %p TIMEOUT AFTER %d SECONDS\n", cm,
(int)(time_uptime - cm->cm_timestamp));
MFI_PRINT_CMD(cm);
MFI_VALIDATE_CMD(sc, cm);
timedout++;
}
}
#if 0
if (timedout)
MFI_DUMP_CMDS(SC);
#endif
mtx_unlock(&sc->mfi_io_lock);
callout_reset(&sc->mfi_watchdog_callout, MFI_CMD_TIMEOUT * hz,
mfi_timeout, sc);
if (0)
mfi_dump_all();
return;
}
Index: head/sys/dev/sound/midi/midi.c
===================================================================
--- head/sys/dev/sound/midi/midi.c (revision 225616)
+++ head/sys/dev/sound/midi/midi.c (revision 225617)
@@ -1,1531 +1,1531 @@
/*-
* Copyright (c) 2003 Mathew Kanner
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Lennart Augustsson (augustss@netbsd.org).
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Parts of this file started out as NetBSD: midi.c 1.31
* They are mostly gone. Still, the most obvious remnant is the state
* machine midi_in.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/conf.h>
#include <sys/selinfo.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/poll.h>
#include <sys/sbuf.h>
#include <sys/kobj.h>
#include <sys/module.h>
#ifdef HAVE_KERNEL_OPTION_HEADERS
#include "opt_snd.h"
#endif
#include <dev/sound/midi/midi.h>
#include "mpu_if.h"
#include <dev/sound/midi/midiq.h>
#include "synth_if.h"
MALLOC_DEFINE(M_MIDI, "midi buffers", "Midi data allocation area");
#ifndef KOBJMETHOD_END
#define KOBJMETHOD_END { NULL, NULL }
#endif
#define PCMMKMINOR(u, d, c) ((((c) & 0xff) << 16) | (((u) & 0x0f) << 4) | ((d) & 0x0f))
#define MIDIMKMINOR(u, d, c) PCMMKMINOR(u, d, c)
#define MIDI_DEV_RAW 2
#define MIDI_DEV_MIDICTL 12
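/*
 * Worked example of the minor-number packing above (illustrative):
 * MIDIMKMINOR(1, MIDI_DEV_RAW, 0) packs channel 0 into bits 16-23,
 * unit 1 into bits 4-7 and device type 2 into bits 0-3, giving 0x12.
 */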
enum midi_states {
MIDI_IN_START, MIDI_IN_SYSEX, MIDI_IN_DATA
};
/*
* The MPU interface currently has init(), uninit(), inqsize(), outqsize()
* and callback(): fiddle with the tx|rx status.
*/
#include "mpu_if.h"
/*
* /dev/rmidi Structure definitions
*/
#define MIDI_NAMELEN 16
struct snd_midi {
KOBJ_FIELDS;
struct mtx lock; /* Protects all but queues */
void *cookie;
int unit; /* Should only be used in midistat */
int channel; /* Should only be used in midistat */
int busy;
int flags; /* File flags */
char name[MIDI_NAMELEN];
struct mtx qlock; /* Protects inq, outq and flags */
MIDIQ_HEAD(, char) inq, outq;
int rchan, wchan;
struct selinfo rsel, wsel;
int hiwat; /* QLEN(outq)>High-water -> disable
* writes from userland */
enum midi_states inq_state;
int inq_status, inq_left; /* Variables for the state machine in
* midi_in; this ensures that signals
* only get issued on complete
* command packets. */
struct proc *async;
struct cdev *dev;
struct synth_midi *synth;
int synth_flags;
TAILQ_ENTRY(snd_midi) link;
};
struct synth_midi {
KOBJ_FIELDS;
struct snd_midi *m;
};
static synth_open_t midisynth_open;
static synth_close_t midisynth_close;
static synth_writeraw_t midisynth_writeraw;
static synth_killnote_t midisynth_killnote;
static synth_startnote_t midisynth_startnote;
static synth_setinstr_t midisynth_setinstr;
static synth_alloc_t midisynth_alloc;
static synth_controller_t midisynth_controller;
static synth_bender_t midisynth_bender;
static kobj_method_t midisynth_methods[] = {
KOBJMETHOD(synth_open, midisynth_open),
KOBJMETHOD(synth_close, midisynth_close),
KOBJMETHOD(synth_writeraw, midisynth_writeraw),
KOBJMETHOD(synth_setinstr, midisynth_setinstr),
KOBJMETHOD(synth_startnote, midisynth_startnote),
KOBJMETHOD(synth_killnote, midisynth_killnote),
KOBJMETHOD(synth_alloc, midisynth_alloc),
KOBJMETHOD(synth_controller, midisynth_controller),
KOBJMETHOD(synth_bender, midisynth_bender),
KOBJMETHOD_END
};
DEFINE_CLASS(midisynth, midisynth_methods, 0);
/*
* Module Exports & Interface
*
* struct midi_chan *midi_init(MPU_CLASS cls, int unit, int chan) int
* midi_uninit(struct snd_midi *) 0 == no error EBUSY or other error int
* Midi_in(struct midi_chan *, char *buf, int count) int Midi_out(struct
* midi_chan *, char *buf, int count)
*
* midi_{in,out} return the actual size transferred
*
*/
/*
* midi_devs tailq, holder of all rmidi instances protected by midistat_lock
*/
TAILQ_HEAD(, snd_midi) midi_devs;
/*
* /dev/midistat variables and declarations, protected by midistat_lock
*/
static struct mtx midistat_lock;
static int midistat_isopen = 0;
static struct sbuf midistat_sbuf;
static int midistat_bufptr;
static struct cdev *midistat_dev;
/*
* /dev/midistat dev_t declarations
*/
static d_open_t midistat_open;
static d_close_t midistat_close;
static d_read_t midistat_read;
static struct cdevsw midistat_cdevsw = {
.d_version = D_VERSION,
.d_open = midistat_open,
.d_close = midistat_close,
.d_read = midistat_read,
.d_name = "midistat",
};
/*
* /dev/rmidi dev_t declarations, struct variable access is protected by
* locks contained within the structure.
*/
static d_open_t midi_open;
static d_close_t midi_close;
static d_ioctl_t midi_ioctl;
static d_read_t midi_read;
static d_write_t midi_write;
static d_poll_t midi_poll;
static struct cdevsw midi_cdevsw = {
.d_version = D_VERSION,
.d_open = midi_open,
.d_close = midi_close,
.d_read = midi_read,
.d_write = midi_write,
.d_ioctl = midi_ioctl,
.d_poll = midi_poll,
.d_name = "rmidi",
};
/*
* Prototypes of library functions
*/
static int midi_destroy(struct snd_midi *, int);
static int midistat_prepare(struct sbuf * s);
static int midi_load(void);
static int midi_unload(void);
/*
* Misc declr.
*/
SYSCTL_NODE(_hw, OID_AUTO, midi, CTLFLAG_RD, 0, "Midi driver");
SYSCTL_NODE(_hw_midi, OID_AUTO, stat, CTLFLAG_RD, 0, "Status device");
int midi_debug;
/* XXX: should this be moved into debug.midi? */
SYSCTL_INT(_hw_midi, OID_AUTO, debug, CTLFLAG_RW, &midi_debug, 0, "");
int midi_dumpraw;
SYSCTL_INT(_hw_midi, OID_AUTO, dumpraw, CTLFLAG_RW, &midi_dumpraw, 0, "");
int midi_instroff;
SYSCTL_INT(_hw_midi, OID_AUTO, instroff, CTLFLAG_RW, &midi_instroff, 0, "");
int midistat_verbose;
SYSCTL_INT(_hw_midi_stat, OID_AUTO, verbose, CTLFLAG_RW,
&midistat_verbose, 0, "");
#define MIDI_DEBUG(l,a) if(midi_debug>=l) a
/*
* CODE START
*/
/*
* Register a new rmidi device. cls is the midi_if interface. unit == 0
* means auto-assign a new unit number; unit != 0 means a unit number has
* already been assigned, e.g. this is not the first channel provided by
* this device. channel is the sub-unit. cookie is passed back on MPU
* calls. Typical device drivers will call with unit=0,
* channel=1..(number of channels) and cookie=soft_c and won't care
* what unit number is used.
*
* It is an error to call midi_init with an already used unit/channel combo.
*
* Returns NULL on error
*
*/
struct snd_midi *
midi_init(kobj_class_t cls, int unit, int channel, void *cookie)
{
struct snd_midi *m;
int i;
int inqsize, outqsize;
MIDI_TYPE *buf;
MIDI_DEBUG(1, printf("midiinit: unit %d/%d.\n", unit, channel));
mtx_lock(&midistat_lock);
/*
* Protect against call with existing unit/channel or auto-allocate a
* new unit number.
*/
i = -1;
TAILQ_FOREACH(m, &midi_devs, link) {
mtx_lock(&m->lock);
if (unit != 0) {
if (m->unit == unit && m->channel == channel) {
mtx_unlock(&m->lock);
goto err0;
}
} else {
/*
* Find a better unit number
*/
if (m->unit > i)
i = m->unit;
}
mtx_unlock(&m->lock);
}
if (unit == 0)
unit = i + 1;
MIDI_DEBUG(1, printf("midiinit #2: unit %d/%d.\n", unit, channel));
m = malloc(sizeof(*m), M_MIDI, M_NOWAIT | M_ZERO);
if (m == NULL)
goto err0;
m->synth = malloc(sizeof(*m->synth), M_MIDI, M_NOWAIT | M_ZERO);
kobj_init((kobj_t)m->synth, &midisynth_class);
m->synth->m = m;
kobj_init((kobj_t)m, cls);
inqsize = MPU_INQSIZE(m, cookie);
outqsize = MPU_OUTQSIZE(m, cookie);
MIDI_DEBUG(1, printf("midiinit queues %d/%d.\n", inqsize, outqsize));
if (!inqsize && !outqsize)
goto err1;
mtx_init(&m->lock, "raw midi", NULL, 0);
mtx_init(&m->qlock, "q raw midi", NULL, 0);
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (inqsize)
buf = malloc(sizeof(MIDI_TYPE) * inqsize, M_MIDI, M_NOWAIT);
else
buf = NULL;
MIDIQ_INIT(m->inq, buf, inqsize);
if (outqsize)
buf = malloc(sizeof(MIDI_TYPE) * outqsize, M_MIDI, M_NOWAIT);
else
buf = NULL;
m->hiwat = outqsize / 2;
MIDIQ_INIT(m->outq, buf, outqsize);
if ((inqsize && !MIDIQ_BUF(m->inq)) ||
(outqsize && !MIDIQ_BUF(m->outq)))
goto err2;
m->busy = 0;
m->flags = 0;
m->unit = unit;
m->channel = channel;
m->cookie = cookie;
if (MPU_INIT(m, cookie))
goto err2;
mtx_unlock(&m->lock);
mtx_unlock(&m->qlock);
TAILQ_INSERT_TAIL(&midi_devs, m, link);
mtx_unlock(&midistat_lock);
m->dev = make_dev(&midi_cdevsw,
MIDIMKMINOR(unit, MIDI_DEV_RAW, channel),
UID_ROOT, GID_WHEEL, 0666, "midi%d.%d", unit, channel);
m->dev->si_drv1 = m;
return m;
err2: mtx_destroy(&m->qlock);
mtx_destroy(&m->lock);
if (MIDIQ_BUF(m->inq))
free(MIDIQ_BUF(m->inq), M_MIDI);
if (MIDIQ_BUF(m->outq))
free(MIDIQ_BUF(m->outq), M_MIDI);
err1: free(m, M_MIDI);
err0: mtx_unlock(&midistat_lock);
MIDI_DEBUG(1, printf("midi_init ended in error\n"));
return NULL;
}
/*
* midi_uninit does not call MIDI_UNINIT, since this is the implementor's
* entry point. midi_uninit, in fact, does not send any methods. A call to
* midi_uninit is a de facto promise that you won't manipulate ch anymore.
*
*/
int
midi_uninit(struct snd_midi *m)
{
int err;
err = ENXIO;
mtx_lock(&midistat_lock);
mtx_lock(&m->lock);
if (m->busy) {
if (!(m->rchan || m->wchan))
goto err;
if (m->rchan) {
wakeup(&m->rchan);
m->rchan = 0;
}
if (m->wchan) {
wakeup(&m->wchan);
m->wchan = 0;
}
}
err = midi_destroy(m, 0);
if (!err)
goto exit;
err: mtx_unlock(&m->lock);
exit: mtx_unlock(&midistat_lock);
return err;
}
/*
* midi_in: process all data until the queue is full, then discard the rest.
* Since midi_in is a state machine, data discards can cause it to get out of
* whack. Process as much as possible. It calls wakeup, selnotify and
* psignal at most once.
*/
#ifdef notdef
static int midi_lengths[] = {2, 2, 2, 2, 1, 1, 2, 0};
#endif /* notdef */
/* Number of bytes in a MIDI command */
#define MIDI_LENGTH(d) (midi_lengths[((d) >> 4) & 7])
#define MIDI_ACK 0xfe
#define MIDI_IS_STATUS(d) ((d) >= 0x80)
#define MIDI_IS_COMMON(d) ((d) >= 0xf0)
#define MIDI_SYSEX_START 0xF0
#define MIDI_SYSEX_END 0xF7
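/*
 * Illustrative use of MIDI_LENGTH with the (disabled) midi_lengths
 * table {2, 2, 2, 2, 1, 1, 2, 0}: a note-on status byte 0x9n indexes
 * entry 1 and so carries 2 data bytes, while a program change 0xCn
 * indexes entry 4 and carries 1 data byte.
 */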
int
midi_in(struct snd_midi *m, MIDI_TYPE *buf, int size)
{
/* int i, sig, enq; */
int used;
/* MIDI_TYPE data; */
MIDI_DEBUG(5, printf("midi_in: m=%p size=%d\n", m, size));
/*
* XXX: locking flub
*/
if (!(m->flags & M_RX))
return size;
used = 0;
mtx_lock(&m->qlock);
#if 0
/*
* Don't bother queuing if not in read mode. Discard everything and
* return size so the caller doesn't freak out.
*/
if (!(m->flags & M_RX))
return size;
for (i = sig = 0; i < size; i++) {
data = buf[i];
enq = 0;
if (data == MIDI_ACK)
continue;
switch (m->inq_state) {
case MIDI_IN_START:
if (MIDI_IS_STATUS(data)) {
switch (data) {
case 0xf0: /* Sysex */
m->inq_state = MIDI_IN_SYSEX;
break;
case 0xf1: /* MTC quarter frame */
case 0xf3: /* Song select */
m->inq_state = MIDI_IN_DATA;
enq = 1;
m->inq_left = 1;
break;
case 0xf2: /* Song position pointer */
m->inq_state = MIDI_IN_DATA;
enq = 1;
m->inq_left = 2;
break;
default:
if (MIDI_IS_COMMON(data)) {
enq = 1;
sig = 1;
} else {
m->inq_state = MIDI_IN_DATA;
enq = 1;
m->inq_status = data;
m->inq_left = MIDI_LENGTH(data);
}
break;
}
} else if (MIDI_IS_STATUS(m->inq_status)) {
m->inq_state = MIDI_IN_DATA;
if (!MIDIQ_FULL(m->inq)) {
used++;
MIDIQ_ENQ(m->inq, &m->inq_status, 1);
}
enq = 1;
m->inq_left = MIDI_LENGTH(m->inq_status) - 1;
}
break;
/*
* End of case MIDI_IN_START:
*/
case MIDI_IN_DATA:
enq = 1;
if (--m->inq_left <= 0)
sig = 1;/* deliver data */
break;
case MIDI_IN_SYSEX:
if (data == MIDI_SYSEX_END)
m->inq_state = MIDI_IN_START;
break;
}
if (enq)
if (!MIDIQ_FULL(m->inq)) {
MIDIQ_ENQ(m->inq, &data, 1);
used++;
}
/*
* End of the state machine's main "for loop"
*/
}
if (sig) {
#endif
MIDI_DEBUG(6, printf("midi_in: len %jd avail %jd\n",
(intmax_t)MIDIQ_LEN(m->inq),
(intmax_t)MIDIQ_AVAIL(m->inq)));
if (MIDIQ_AVAIL(m->inq) > size) {
used = size;
MIDIQ_ENQ(m->inq, buf, size);
} else {
MIDI_DEBUG(4, printf("midi_in: Discarding data qu\n"));
mtx_unlock(&m->qlock);
return 0;
}
if (m->rchan) {
wakeup(&m->rchan);
m->rchan = 0;
}
selwakeup(&m->rsel);
if (m->async) {
PROC_LOCK(m->async);
- psignal(m->async, SIGIO);
+ kern_psignal(m->async, SIGIO);
PROC_UNLOCK(m->async);
}
#if 0
}
#endif
mtx_unlock(&m->qlock);
return used;
}
/*
* midi_out: The only clearer of the M_TXEN flag.
*/
int
midi_out(struct snd_midi *m, MIDI_TYPE *buf, int size)
{
int used;
/*
* XXX: locking flub
*/
if (!(m->flags & M_TXEN))
return 0;
MIDI_DEBUG(2, printf("midi_out: %p\n", m));
mtx_lock(&m->qlock);
used = MIN(size, MIDIQ_LEN(m->outq));
MIDI_DEBUG(3, printf("midi_out: used %d\n", used));
if (used)
MIDIQ_DEQ(m->outq, buf, used);
if (MIDIQ_EMPTY(m->outq)) {
m->flags &= ~M_TXEN;
MPU_CALLBACKP(m, m->cookie, m->flags);
}
if (used && MIDIQ_AVAIL(m->outq) > m->hiwat) {
if (m->wchan) {
wakeup(&m->wchan);
m->wchan = 0;
}
selwakeup(&m->wsel);
if (m->async) {
PROC_LOCK(m->async);
- psignal(m->async, SIGIO);
+ kern_psignal(m->async, SIGIO);
PROC_UNLOCK(m->async);
}
}
mtx_unlock(&m->qlock);
return used;
}
/*
* /dev/rmidi#.# device access functions
*/
int
midi_open(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
struct snd_midi *m = i_dev->si_drv1;
int retval;
MIDI_DEBUG(1, printf("midiopen %p %s %s\n", td,
flags & FREAD ? "M_RX" : "", flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
retval = 0;
if (flags & FREAD) {
if (MIDIQ_SIZE(m->inq) == 0)
retval = ENXIO;
else if (m->flags & M_RX)
retval = EBUSY;
if (retval)
goto err;
}
if (flags & FWRITE) {
if (MIDIQ_SIZE(m->outq) == 0)
retval = ENXIO;
else if (m->flags & M_TX)
retval = EBUSY;
if (retval)
goto err;
}
m->busy++;
m->rchan = 0;
m->wchan = 0;
m->async = 0;
if (flags & FREAD) {
m->flags |= M_RX | M_RXEN;
/*
* Only clear the inq, the outq might still have data to drain
* from a previous session
*/
MIDIQ_CLEAR(m->inq);
};
if (flags & FWRITE)
m->flags |= M_TX;
MPU_CALLBACK(m, m->cookie, m->flags);
MIDI_DEBUG(2, printf("midi_open: opened.\n"));
err: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
return retval;
}
int
midi_close(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
struct snd_midi *m = i_dev->si_drv1;
int retval;
int oldflags;
MIDI_DEBUG(1, printf("midi_close %p %s %s\n", td,
flags & FREAD ? "M_RX" : "", flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if ((flags & FREAD && !(m->flags & M_RX)) ||
(flags & FWRITE && !(m->flags & M_TX))) {
retval = ENXIO;
goto err;
}
m->busy--;
oldflags = m->flags;
if (flags & FREAD)
m->flags &= ~(M_RX | M_RXEN);
if (flags & FWRITE)
m->flags &= ~M_TX;
if ((m->flags & (M_TXEN | M_RXEN)) != (oldflags & (M_RXEN | M_TXEN)))
MPU_CALLBACK(m, m->cookie, m->flags);
MIDI_DEBUG(1, printf("midi_close: closed, busy = %d.\n", m->busy));
mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
retval = 0;
err: return retval;
}
/*
* TODO: midi_read, per oss programmer's guide pg. 42 should return as soon
* as data is available.
*/
int
midi_read(struct cdev *i_dev, struct uio *uio, int ioflag)
{
#define MIDI_RSIZE 32
struct snd_midi *m = i_dev->si_drv1;
int retval;
int used;
char buf[MIDI_RSIZE];
MIDI_DEBUG(5, printf("midiread: count=%lu\n",
(unsigned long)uio->uio_resid));
retval = EIO;
if (m == NULL)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (!(m->flags & M_RX))
goto err1;
while (uio->uio_resid > 0) {
while (MIDIQ_EMPTY(m->inq)) {
retval = EWOULDBLOCK;
if (ioflag & O_NONBLOCK)
goto err1;
mtx_unlock(&m->lock);
m->rchan = 1;
retval = msleep(&m->rchan, &m->qlock,
PCATCH | PDROP, "midi RX", 0);
/*
* We slept, maybe things have changed since last
* dying check
*/
if (retval == EINTR)
goto err0;
if (m != i_dev->si_drv1)
retval = ENXIO;
/* if (retval && retval != ERESTART) */
if (retval)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
m->rchan = 0;
if (!m->busy)
goto err1;
}
MIDI_DEBUG(6, printf("midi_read start\n"));
/*
* At this point, it is certain that m->inq has data
*/
used = MIN(MIDIQ_LEN(m->inq), uio->uio_resid);
used = MIN(used, MIDI_RSIZE);
MIDI_DEBUG(6, printf("midiread: uiomove cc=%d\n", used));
MIDIQ_DEQ(m->inq, buf, used);
retval = uiomove(buf, used, uio);
if (retval)
goto err1;
}
/*
* If we made it here, the transfer was good
*/
retval = 0;
err1: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
err0: MIDI_DEBUG(4, printf("midi_read: ret %d\n", retval));
return retval;
}
/*
* midi_write: The only setter of M_TXEN
*/
int
midi_write(struct cdev *i_dev, struct uio *uio, int ioflag)
{
#define MIDI_WSIZE 32
struct snd_midi *m = i_dev->si_drv1;
int retval;
int used;
char buf[MIDI_WSIZE];
MIDI_DEBUG(4, printf("midi_write\n"));
retval = 0;
if (m == NULL)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (!(m->flags & M_TX))
goto err1;
while (uio->uio_resid > 0) {
while (MIDIQ_AVAIL(m->outq) == 0) {
retval = EWOULDBLOCK;
if (ioflag & O_NONBLOCK)
goto err1;
mtx_unlock(&m->lock);
m->wchan = 1;
MIDI_DEBUG(3, printf("midi_write msleep\n"));
retval = msleep(&m->wchan, &m->qlock,
PCATCH | PDROP, "midi TX", 0);
/*
* We slept, maybe things have changed since last
* dying check
*/
if (retval == EINTR)
goto err0;
if (m != i_dev->si_drv1)
retval = ENXIO;
if (retval)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
m->wchan = 0;
if (!m->busy)
goto err1;
}
/*
* We are certain that data can be placed on the queue
*/
used = MIN(MIDIQ_AVAIL(m->outq), uio->uio_resid);
used = MIN(used, MIDI_WSIZE);
MIDI_DEBUG(5, printf("midiout: resid %zd len %jd avail %jd\n",
uio->uio_resid, (intmax_t)MIDIQ_LEN(m->outq),
(intmax_t)MIDIQ_AVAIL(m->outq)));
MIDI_DEBUG(5, printf("midi_write: uiomove cc=%d\n", used));
retval = uiomove(buf, used, uio);
if (retval)
goto err1;
MIDIQ_ENQ(m->outq, buf, used);
/*
* Inform the bottom half that data can be written
*/
if (!(m->flags & M_TXEN)) {
m->flags |= M_TXEN;
MPU_CALLBACK(m, m->cookie, m->flags);
}
}
/*
* If we made it here, the transfer was good
*/
retval = 0;
err1: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
err0: return retval;
}
int
midi_ioctl(struct cdev *i_dev, u_long cmd, caddr_t arg, int mode,
struct thread *td)
{
return ENXIO;
}
int
midi_poll(struct cdev *i_dev, int events, struct thread *td)
{
struct snd_midi *m = i_dev->si_drv1;
int revents;
if (m == NULL)
return 0;
revents = 0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (events & (POLLIN | POLLRDNORM))
if (!MIDIQ_EMPTY(m->inq))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (MIDIQ_AVAIL(m->outq) < m->hiwat)
revents |= events & (POLLOUT | POLLWRNORM);
if (revents == 0) {
if (events & (POLLIN | POLLRDNORM))
selrecord(td, &m->rsel);
if (events & (POLLOUT | POLLWRNORM))
selrecord(td, &m->wsel);
}
mtx_unlock(&m->lock);
mtx_unlock(&m->qlock);
return (revents);
}
/*
* /dev/midistat device functions
*
*/
static int
midistat_open(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
int error;
MIDI_DEBUG(1, printf("midistat_open\n"));
mtx_lock(&midistat_lock);
if (midistat_isopen) {
mtx_unlock(&midistat_lock);
return EBUSY;
}
midistat_isopen = 1;
mtx_unlock(&midistat_lock);
if (sbuf_new(&midistat_sbuf, NULL, 4096, SBUF_AUTOEXTEND) == NULL) {
error = ENXIO;
mtx_lock(&midistat_lock);
goto out;
}
mtx_lock(&midistat_lock);
midistat_bufptr = 0;
error = (midistat_prepare(&midistat_sbuf) > 0) ? 0 : ENOMEM;
out: if (error)
midistat_isopen = 0;
mtx_unlock(&midistat_lock);
return error;
}
static int
midistat_close(struct cdev *i_dev, int flags, int mode, struct thread *td)
{
MIDI_DEBUG(1, printf("midistat_close\n"));
mtx_lock(&midistat_lock);
if (!midistat_isopen) {
mtx_unlock(&midistat_lock);
return EBADF;
}
sbuf_delete(&midistat_sbuf);
midistat_isopen = 0;
mtx_unlock(&midistat_lock);
return 0;
}
static int
midistat_read(struct cdev *i_dev, struct uio *buf, int flag)
{
int l, err;
MIDI_DEBUG(4, printf("midistat_read\n"));
mtx_lock(&midistat_lock);
if (!midistat_isopen) {
mtx_unlock(&midistat_lock);
return EBADF;
}
l = min(buf->uio_resid, sbuf_len(&midistat_sbuf) - midistat_bufptr);
err = 0;
if (l > 0) {
mtx_unlock(&midistat_lock);
err = uiomove(sbuf_data(&midistat_sbuf) + midistat_bufptr, l,
buf);
mtx_lock(&midistat_lock);
} else
l = 0;
midistat_bufptr += l;
mtx_unlock(&midistat_lock);
return err;
}
/*
* Module library functions
*/
static int
midistat_prepare(struct sbuf *s)
{
struct snd_midi *m;
mtx_assert(&midistat_lock, MA_OWNED);
sbuf_printf(s, "FreeBSD Midi Driver (midi2)\n");
if (TAILQ_EMPTY(&midi_devs)) {
sbuf_printf(s, "No devices installed.\n");
sbuf_finish(s);
return sbuf_len(s);
}
sbuf_printf(s, "Installed devices:\n");
TAILQ_FOREACH(m, &midi_devs, link) {
mtx_lock(&m->lock);
sbuf_printf(s, "%s [%d/%d:%s]", m->name, m->unit, m->channel,
MPU_PROVIDER(m, m->cookie));
sbuf_printf(s, "%s", MPU_DESCR(m, m->cookie, midistat_verbose));
sbuf_printf(s, "\n");
mtx_unlock(&m->lock);
}
sbuf_finish(s);
return sbuf_len(s);
}
#ifdef notdef
/*
* Convert IOCTL command to string for debugging
*/
static char *
midi_cmdname(int cmd)
{
static struct {
int cmd;
char *name;
} *tab, cmdtab_midiioctl[] = {
#define A(x) {x, ## x}
/*
* Once we have some real IOCTLs defined, the following will
* be relevant.
*
* A(SNDCTL_MIDI_PRETIME), A(SNDCTL_MIDI_MPUMODE),
* A(SNDCTL_MIDI_MPUCMD), A(SNDCTL_SYNTH_INFO),
* A(SNDCTL_MIDI_INFO), A(SNDCTL_SYNTH_MEMAVL),
* A(SNDCTL_FM_LOAD_INSTR), A(SNDCTL_FM_4OP_ENABLE),
* A(MIOSPASSTHRU), A(MIOGPASSTHRU), A(AIONWRITE),
* A(AIOGSIZE), A(AIOSSIZE), A(AIOGFMT), A(AIOSFMT),
* A(AIOGMIX), A(AIOSMIX), A(AIOSTOP), A(AIOSYNC),
* A(AIOGCAP),
*/
#undef A
{
-1, "unknown"
},
};
for (tab = cmdtab_midiioctl; tab->cmd != cmd && tab->cmd != -1; tab++);
return tab->name;
}
#endif /* notdef */
/*
* midisynth
*/
int
midisynth_open(void *n, void *arg, int flags)
{
struct snd_midi *m = ((struct synth_midi *)n)->m;
int retval;
MIDI_DEBUG(1, printf("midisynth_open %s %s\n",
flags & FREAD ? "M_RX" : "", flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
retval = 0;
if (flags & FREAD) {
if (MIDIQ_SIZE(m->inq) == 0)
retval = ENXIO;
else if (m->flags & M_RX)
retval = EBUSY;
if (retval)
goto err;
}
if (flags & FWRITE) {
if (MIDIQ_SIZE(m->outq) == 0)
retval = ENXIO;
else if (m->flags & M_TX)
retval = EBUSY;
if (retval)
goto err;
}
m->busy++;
/*
* TODO: Consider m->async = 0;
*/
if (flags & FREAD) {
m->flags |= M_RX | M_RXEN;
/*
* Only clear the inq, the outq might still have data to drain
* from a previous session
*/
MIDIQ_CLEAR(m->inq);
m->rchan = 0;
};
if (flags & FWRITE) {
m->flags |= M_TX;
m->wchan = 0;
}
m->synth_flags = flags & (FREAD | FWRITE);
MPU_CALLBACK(m, m->cookie, m->flags);
err: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
MIDI_DEBUG(2, printf("midisynth_open: return %d.\n", retval));
return retval;
}
int
midisynth_close(void *n)
{
struct snd_midi *m = ((struct synth_midi *)n)->m;
int retval;
int oldflags;
MIDI_DEBUG(1, printf("midisynth_close %s %s\n",
m->synth_flags & FREAD ? "M_RX" : "",
m->synth_flags & FWRITE ? "M_TX" : ""));
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if ((m->synth_flags & FREAD && !(m->flags & M_RX)) ||
(m->synth_flags & FWRITE && !(m->flags & M_TX))) {
retval = ENXIO;
goto err;
}
m->busy--;
oldflags = m->flags;
if (m->synth_flags & FREAD)
m->flags &= ~(M_RX | M_RXEN);
if (m->synth_flags & FWRITE)
m->flags &= ~M_TX;
if ((m->flags & (M_TXEN | M_RXEN)) != (oldflags & (M_RXEN | M_TXEN)))
MPU_CALLBACK(m, m->cookie, m->flags);
MIDI_DEBUG(1, printf("midi_close: closed, busy = %d.\n", m->busy));
mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
retval = 0;
err: return retval;
}
/*
* Always blocking.
*/
int
midisynth_writeraw(void *n, uint8_t *buf, size_t len)
{
struct snd_midi *m = ((struct synth_midi *)n)->m;
int retval;
int used;
int i;
MIDI_DEBUG(4, printf("midisynth_writeraw\n"));
retval = 0;
if (m == NULL)
return ENXIO;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
if (!(m->flags & M_TX))
goto err1;
if (midi_dumpraw)
printf("midi dump: ");
while (len > 0) {
while (MIDIQ_AVAIL(m->outq) == 0) {
if (!(m->flags & M_TXEN)) {
m->flags |= M_TXEN;
MPU_CALLBACK(m, m->cookie, m->flags);
}
mtx_unlock(&m->lock);
m->wchan = 1;
MIDI_DEBUG(3, printf("midisynth_writeraw msleep\n"));
retval = msleep(&m->wchan, &m->qlock,
PCATCH | PDROP, "midi TX", 0);
/*
* We slept, maybe things have changed since last
* dying check
*/
if (retval == EINTR)
goto err0;
if (retval)
goto err0;
mtx_lock(&m->lock);
mtx_lock(&m->qlock);
m->wchan = 0;
if (!m->busy)
goto err1;
}
/*
* We are certain that data can be placed on the queue
*/
used = MIN(MIDIQ_AVAIL(m->outq), len);
used = MIN(used, MIDI_WSIZE);
MIDI_DEBUG(5,
printf("midi_synth: resid %zu len %jd avail %jd\n",
len, (intmax_t)MIDIQ_LEN(m->outq),
(intmax_t)MIDIQ_AVAIL(m->outq)));
if (midi_dumpraw)
for (i = 0; i < used; i++)
printf("%x ", buf[i]);
MIDIQ_ENQ(m->outq, buf, used);
len -= used;
/*
* Inform the bottom half that data can be written
*/
if (!(m->flags & M_TXEN)) {
m->flags |= M_TXEN;
MPU_CALLBACK(m, m->cookie, m->flags);
}
}
/*
* If we made it here, the transfer was good
*/
if (midi_dumpraw)
printf("\n");
retval = 0;
err1: mtx_unlock(&m->qlock);
mtx_unlock(&m->lock);
err0: return retval;
}
static int
midisynth_killnote(void *n, uint8_t chn, uint8_t note, uint8_t vel)
{
u_char c[3];
if (note > 127 || chn > 15)
return (EINVAL);
if (vel > 127)
vel = 127;
if (vel == 64) {
c[0] = 0x90 | (chn & 0x0f); /* Note on. */
c[1] = (u_char)note;
c[2] = 0;
} else {
c[0] = 0x80 | (chn & 0x0f); /* Note off. */
c[1] = (u_char)note;
c[2] = (u_char)vel;
}
return midisynth_writeraw(n, c, 3);
}
static int
midisynth_setinstr(void *n, uint8_t chn, uint16_t instr)
{
u_char c[2];
if (instr > 127 || chn > 15)
return EINVAL;
c[0] = 0xc0 | (chn & 0x0f); /* Program change. */
c[1] = instr + midi_instroff;
return midisynth_writeraw(n, c, 2);
}
static int
midisynth_startnote(void *n, uint8_t chn, uint8_t note, uint8_t vel)
{
u_char c[3];
if (note > 127 || chn > 15)
return EINVAL;
if (vel > 127)
vel = 127;
c[0] = 0x90 | (chn & 0x0f); /* Note on. */
c[1] = (u_char)note;
c[2] = (u_char)vel;
return midisynth_writeraw(n, c, 3);
}
static int
midisynth_alloc(void *n, uint8_t chan, uint8_t note)
{
return chan;
}
static int
midisynth_controller(void *n, uint8_t chn, uint8_t ctrlnum, uint16_t val)
{
u_char c[3];
if (ctrlnum > 127 || chn > 15)
return EINVAL;
c[0] = 0xb0 | (chn & 0x0f); /* Control Message. */
c[1] = ctrlnum;
c[2] = val;
return midisynth_writeraw(n, c, 3);
}
static int
midisynth_bender(void *n, uint8_t chn, uint16_t val)
{
u_char c[3];
if (val > 16383 || chn > 15)
return EINVAL;
c[0] = 0xe0 | (chn & 0x0f); /* Pitch bend. */
c[1] = (u_char)val & 0x7f;
c[2] = (u_char)(val >> 7) & 0x7f;
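/*
 * Example of the 14-bit split above (illustrative): the center bend
 * value 8192 (0x2000) yields c[1] = 0x00 (low 7 bits) and
 * c[2] = 0x40 (high 7 bits).
 */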
return midisynth_writeraw(n, c, 3);
}
/*
* Single point of midi destruction.
*/
static int
midi_destroy(struct snd_midi *m, int midiuninit)
{
mtx_assert(&midistat_lock, MA_OWNED);
mtx_assert(&m->lock, MA_OWNED);
MIDI_DEBUG(3, printf("midi_destroy\n"));
m->dev->si_drv1 = NULL;
mtx_unlock(&m->lock); /* XXX */
destroy_dev(m->dev);
TAILQ_REMOVE(&midi_devs, m, link);
if (midiuninit)
MPU_UNINIT(m, m->cookie);
free(MIDIQ_BUF(m->inq), M_MIDI);
free(MIDIQ_BUF(m->outq), M_MIDI);
mtx_destroy(&m->qlock);
mtx_destroy(&m->lock);
free(m, M_MIDI);
return 0;
}
/*
* Load and unload functions, creates the /dev/midistat device
*/
static int
midi_load()
{
mtx_init(&midistat_lock, "midistat lock", NULL, 0);
TAILQ_INIT(&midi_devs); /* Initialize the queue. */
midistat_dev = make_dev(&midistat_cdevsw,
MIDIMKMINOR(0, MIDI_DEV_MIDICTL, 0),
UID_ROOT, GID_WHEEL, 0666, "midistat");
return 0;
}
static int
midi_unload()
{
struct snd_midi *m;
int retval;
MIDI_DEBUG(1, printf("midi_unload()\n"));
retval = EBUSY;
mtx_lock(&midistat_lock);
if (midistat_isopen)
goto exit0;
TAILQ_FOREACH(m, &midi_devs, link) {
mtx_lock(&m->lock);
if (m->busy)
retval = EBUSY;
else
retval = midi_destroy(m, 1);
if (retval)
goto exit1;
}
mtx_unlock(&midistat_lock); /* XXX */
destroy_dev(midistat_dev);
/*
* If we made it here, the unload is complete
*/
mtx_destroy(&midistat_lock);
return 0;
exit1:
mtx_unlock(&m->lock);
exit0:
mtx_unlock(&midistat_lock);
if (retval)
MIDI_DEBUG(2, printf("midi_unload: failed\n"));
return retval;
}
extern int seq_modevent(module_t mod, int type, void *data);
static int
midi_modevent(module_t mod, int type, void *data)
{
int retval;
retval = 0;
switch (type) {
case MOD_LOAD:
retval = midi_load();
#if 0
if (retval == 0)
retval = seq_modevent(mod, type, data);
#endif
break;
case MOD_UNLOAD:
retval = midi_unload();
#if 0
if (retval == 0)
retval = seq_modevent(mod, type, data);
#endif
break;
default:
break;
}
return retval;
}
kobj_t
midimapper_addseq(void *arg1, int *unit, void **cookie)
{
unit = 0;
return (kobj_t)arg1;
}
int
midimapper_open(void *arg1, void **cookie)
{
int retval = 0;
struct snd_midi *m;
mtx_lock(&midistat_lock);
TAILQ_FOREACH(m, &midi_devs, link) {
retval++;
}
mtx_unlock(&midistat_lock);
return retval;
}
int
midimapper_close(void *arg1, void *cookie)
{
return 0;
}
kobj_t
midimapper_fetch_synth(void *arg, void *cookie, int unit)
{
struct snd_midi *m;
int retval = 0;
mtx_lock(&midistat_lock);
TAILQ_FOREACH(m, &midi_devs, link) {
if (unit == retval) {
mtx_unlock(&midistat_lock);
return (kobj_t)m->synth;
}
retval++;
}
mtx_unlock(&midistat_lock);
return NULL;
}
DEV_MODULE(midi, midi_modevent, NULL);
MODULE_VERSION(midi, 1);
Index: head/sys/dev/syscons/scmouse.c
===================================================================
--- head/sys/dev/syscons/scmouse.c (revision 225616)
+++ head/sys/dev/syscons/scmouse.c (revision 225617)
@@ -1,958 +1,958 @@
/*-
* Copyright (c) 1999 Kazutaka YOKOTA <yokota@zodiac.mech.utsunomiya-u.ac.jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer as
* the first lines of this file unmodified.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_syscons.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/consio.h>
#include <sys/fbio.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mouse.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/signalvar.h>
#include <sys/tty.h>
#include <dev/syscons/syscons.h>
#ifdef SC_TWOBUTTON_MOUSE
#define SC_MOUSE_PASTEBUTTON MOUSE_BUTTON3DOWN /* right button */
#define SC_MOUSE_EXTENDBUTTON MOUSE_BUTTON2DOWN /* not really used */
#else
#define SC_MOUSE_PASTEBUTTON MOUSE_BUTTON2DOWN /* middle button */
#define SC_MOUSE_EXTENDBUTTON MOUSE_BUTTON3DOWN /* right button */
#endif /* SC_TWOBUTTON_MOUSE */
#define SC_WAKEUP_DELTA 20
/* for backward compatibility */
#define OLD_CONS_MOUSECTL _IOWR('c', 10, old_mouse_info_t)
typedef struct old_mouse_data {
int x;
int y;
int buttons;
} old_mouse_data_t;
typedef struct old_mouse_info {
int operation;
union {
struct old_mouse_data data;
struct mouse_mode mode;
} u;
} old_mouse_info_t;
#ifndef SC_NO_SYSMOUSE
/* local variables */
#ifndef SC_NO_CUTPASTE
static int cut_buffer_size;
static u_char *cut_buffer;
#endif
/* local functions */
static void set_mouse_pos(scr_stat *scp);
#ifndef SC_NO_CUTPASTE
static int skip_spc_right(scr_stat *scp, int p);
static int skip_spc_left(scr_stat *scp, int p);
static void mouse_cut(scr_stat *scp);
static void mouse_cut_start(scr_stat *scp);
static void mouse_cut_end(scr_stat *scp);
static void mouse_cut_word(scr_stat *scp);
static void mouse_cut_line(scr_stat *scp);
static void mouse_cut_extend(scr_stat *scp);
#endif /* SC_NO_CUTPASTE */
#ifndef SC_NO_CUTPASTE
/* allocate a cut buffer */
void
sc_alloc_cut_buffer(scr_stat *scp, int wait)
{
u_char *p;
if ((cut_buffer == NULL)
|| (cut_buffer_size < scp->xsize * scp->ysize + 1)) {
p = cut_buffer;
cut_buffer = NULL;
if (p != NULL)
free(p, M_DEVBUF);
cut_buffer_size = scp->xsize * scp->ysize + 1;
p = (u_char *)malloc(cut_buffer_size,
M_DEVBUF, (wait) ? M_WAITOK : M_NOWAIT);
if (p != NULL)
p[0] = '\0';
cut_buffer = p;
}
}
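/*
 * Sizing example for the buffer above, assuming a standard 80x25 text
 * screen: cut_buffer_size = 80 * 25 + 1 = 2001 bytes, one byte per
 * character cell plus a terminating '\0'.
 */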
#endif /* SC_NO_CUTPASTE */
static void
sc_mouse_input_button(scr_stat *scp, int button)
{
char mouseb[6] = "\x1B[M";
mouseb[3] = ' ' + button;
mouseb[4] = '!' + scp->mouse_pos % scp->xsize;
mouseb[5] = '!' + scp->mouse_pos / scp->xsize;
sc_respond(scp, mouseb, sizeof mouseb, 1);
}
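/*
 * Byte-level example of the report built above (the encoding resembles
 * the xterm "ESC [ M Cb Cx Cy" mouse sequence; positions here are
 * 0-based): pressing button 0 with the pointer at column 5, row 2 of
 * an 80-column screen (mouse_pos = 2 * 80 + 5 = 165) sends the bytes
 * 0x1b '[' 'M' ' ' '&' '#', since ' ' + 0 = ' ', '!' + 5 = '&' and
 * '!' + 2 = '#'.
 */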
static void
sc_mouse_input(scr_stat *scp, mouse_info_t *mouse)
{
switch (mouse->operation) {
case MOUSE_BUTTON_EVENT:
if (mouse->u.event.value > 0) {
/* Mouse button pressed. */
if (mouse->u.event.id & MOUSE_BUTTON1DOWN)
sc_mouse_input_button(scp, 0);
if (mouse->u.event.id & MOUSE_BUTTON2DOWN)
sc_mouse_input_button(scp, 1);
if (mouse->u.event.id & MOUSE_BUTTON3DOWN)
sc_mouse_input_button(scp, 2);
} else {
/* Mouse button released. */
sc_mouse_input_button(scp, 3);
}
break;
case MOUSE_MOTION_EVENT:
if (mouse->u.data.z < 0) {
/* Scroll up. */
sc_mouse_input_button(scp, 64);
} else if (mouse->u.data.z > 0) {
/* Scroll down. */
sc_mouse_input_button(scp, 65);
}
break;
}
}
/* move mouse */
void
sc_mouse_move(scr_stat *scp, int x, int y)
{
int s;
s = spltty();
scp->mouse_xpos = scp->mouse_oldxpos = x;
scp->mouse_ypos = scp->mouse_oldypos = y;
if (scp->font_size <= 0 || scp->font_width <= 0)
scp->mouse_pos = scp->mouse_oldpos = 0;
else
scp->mouse_pos = scp->mouse_oldpos =
(y/scp->font_size - scp->yoff)*scp->xsize + x/scp->font_width -
scp->xoff;
scp->status |= MOUSE_MOVED;
splx(s);
}
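/*
 * Worked example of the cell arithmetic above, assuming an 80x25 text
 * screen with an 8x16 font and xoff = yoff = 0: a pointer at pixel
 * (x, y) = (400, 160) lands in character column 400 / 8 = 50 and row
 * 160 / 16 = 10, giving mouse_pos = 10 * 80 + 50 = 850.
 */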
/* adjust mouse position */
static void
set_mouse_pos(scr_stat *scp)
{
if (scp->mouse_xpos < scp->xoff*scp->font_width)
scp->mouse_xpos = scp->xoff*scp->font_width;
if (scp->mouse_ypos < scp->yoff*scp->font_size)
scp->mouse_ypos = scp->yoff*scp->font_size;
if (ISGRAPHSC(scp)) {
if (scp->mouse_xpos > scp->xpixel-1)
scp->mouse_xpos = scp->xpixel-1;
if (scp->mouse_ypos > scp->ypixel-1)
scp->mouse_ypos = scp->ypixel-1;
return;
} else {
if (scp->mouse_xpos > (scp->xsize + scp->xoff)*scp->font_width - 1)
scp->mouse_xpos = (scp->xsize + scp->xoff)*scp->font_width - 1;
if (scp->mouse_ypos > (scp->ysize + scp->yoff)*scp->font_size - 1)
scp->mouse_ypos = (scp->ysize + scp->yoff)*scp->font_size - 1;
}
if ((scp->mouse_xpos != scp->mouse_oldxpos || scp->mouse_ypos != scp->mouse_oldypos)
&& (scp->font_size != 0 && scp->font_width != 0)) {
scp->status |= MOUSE_MOVED;
scp->mouse_pos =
(scp->mouse_ypos/scp->font_size - scp->yoff)*scp->xsize
+ scp->mouse_xpos/scp->font_width - scp->xoff;
#ifndef SC_NO_CUTPASTE
if ((scp->status & MOUSE_VISIBLE) && (scp->status & MOUSE_CUTTING))
mouse_cut(scp);
#endif
}
}
#ifndef SC_NO_CUTPASTE
void
sc_draw_mouse_image(scr_stat *scp)
{
if (ISGRAPHSC(scp))
return;
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_mouse)(scp, scp->mouse_xpos, scp->mouse_ypos, TRUE);
scp->mouse_oldpos = scp->mouse_pos;
scp->mouse_oldxpos = scp->mouse_xpos;
scp->mouse_oldypos = scp->mouse_ypos;
scp->status |= MOUSE_VISIBLE;
SC_VIDEO_UNLOCK(scp->sc);
}
void
sc_remove_mouse_image(scr_stat *scp)
{
int size;
int i;
if (ISGRAPHSC(scp))
return;
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_mouse)(scp,
(scp->mouse_oldpos%scp->xsize + scp->xoff)
* scp->font_width,
(scp->mouse_oldpos/scp->xsize + scp->yoff)
* scp->font_size,
FALSE);
size = scp->xsize*scp->ysize;
i = scp->mouse_oldpos;
mark_for_update(scp, i);
mark_for_update(scp, i);
#ifndef PC98
if (i + scp->xsize + 1 < size) {
mark_for_update(scp, i + scp->xsize + 1);
} else if (i + scp->xsize < size) {
mark_for_update(scp, i + scp->xsize);
} else if (i + 1 < size) {
mark_for_update(scp, i + 1);
}
#endif /* PC98 */
scp->status &= ~MOUSE_VISIBLE;
SC_VIDEO_UNLOCK(scp->sc);
}
int
sc_inside_cutmark(scr_stat *scp, int pos)
{
int start;
int end;
if (scp->mouse_cut_end < 0)
return FALSE;
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
} else {
start = scp->mouse_cut_end;
end = scp->mouse_cut_start - 1;
}
return ((start <= pos) && (pos <= end));
}
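/*
 * Example of the normalization above: a selection dragged backwards so
 * that mouse_cut_start = 100 and mouse_cut_end = 80 is treated as the
 * range [80, 99]; position 90 is therefore reported as marked and
 * position 100 is not.
 */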
void
sc_remove_cutmarking(scr_stat *scp)
{
int s;
s = spltty();
if (scp->mouse_cut_end >= 0) {
mark_for_update(scp, scp->mouse_cut_start);
mark_for_update(scp, scp->mouse_cut_end);
}
scp->mouse_cut_start = scp->xsize*scp->ysize;
scp->mouse_cut_end = -1;
splx(s);
scp->status &= ~MOUSE_CUTTING;
}
void
sc_remove_all_cutmarkings(sc_softc_t *sc)
{
scr_stat *scp;
int i;
/* delete cut markings in all vtys */
for (i = 0; i < sc->vtys; ++i) {
scp = SC_STAT(sc->dev[i]);
if (scp == NULL)
continue;
sc_remove_cutmarking(scp);
}
}
void
sc_remove_all_mouse(sc_softc_t *sc)
{
scr_stat *scp;
int i;
for (i = 0; i < sc->vtys; ++i) {
scp = SC_STAT(sc->dev[i]);
if (scp == NULL)
continue;
if (scp->status & MOUSE_VISIBLE) {
scp->status &= ~MOUSE_VISIBLE;
mark_all(scp);
}
}
}
#define IS_SPACE_CHAR(c) (((c) & 0xff) == ' ')
#ifdef SC_CUT_SPACES2TABS
#define IS_BLANK_CHAR(c) (((c) & 0xff) == ' ' || ((c) & 0xff) == '\t')
#else
#define IS_BLANK_CHAR(c) IS_SPACE_CHAR(c)
#endif /* SC_CUT_SPACES2TABS */
#ifdef SC_CUT_SEPCHARS
#define IS_SEP_CHAR(c) (index(SC_CUT_SEPCHARS, (c) & 0xff) != NULL)
#else
#define IS_SEP_CHAR(c) IS_SPACE_CHAR(c)
#endif /* SC_CUT_SEPCHARS */
/* skip spaces to right */
static int
skip_spc_right(scr_stat *scp, int p)
{
int c;
int i;
for (i = p % scp->xsize; i < scp->xsize; ++i) {
c = sc_vtb_getc(&scp->vtb, p);
if (!IS_SPACE_CHAR(c))
break;
++p;
}
return i;
}
/* skip spaces to left */
static int
skip_spc_left(scr_stat *scp, int p)
{
int c;
int i;
for (i = p-- % scp->xsize - 1; i >= 0; --i) {
c = sc_vtb_getc(&scp->vtb, p);
if (!IS_SPACE_CHAR(c))
break;
--p;
}
return i;
}
static void
mouse_do_cut(scr_stat *scp, int from, int to)
{
int blank;
int i;
int leadspaces;
int p;
int s;
for (p = from, i = blank = leadspaces = 0; p <= to; ++p) {
cut_buffer[i] = sc_vtb_getc(&scp->vtb, p);
/* Be prepared that sc_vtb_getc() can return '\0' */
if (cut_buffer[i] == '\0')
cut_buffer[i] = ' ';
#ifdef SC_CUT_SPACES2TABS
if (leadspaces != -1) {
if (IS_SPACE_CHAR(cut_buffer[i])) {
leadspaces++;
/* Check that we are at tabstop position */
if ((p % scp->xsize) % 8 == 7) {
i -= leadspaces - 1;
cut_buffer[i] = '\t';
leadspaces = 0;
}
} else {
leadspaces = -1;
}
}
#endif /* SC_CUT_SPACES2TABS */
/* remember the position of the last non-space char */
if (!IS_BLANK_CHAR(cut_buffer[i]))
blank = i + 1; /* the first space after the last non-space */
++i;
/* trim trailing blank when crossing lines */
if ((p % scp->xsize) == (scp->xsize - 1)) {
cut_buffer[blank++] = '\r';
i = blank;
leadspaces = 0;
}
}
cut_buffer[i] = '\0';
/* remove the current marking */
s = spltty();
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
mark_for_update(scp, scp->mouse_cut_start);
mark_for_update(scp, scp->mouse_cut_end);
} else if (scp->mouse_cut_end >= 0) {
mark_for_update(scp, scp->mouse_cut_end);
mark_for_update(scp, scp->mouse_cut_start);
}
/* mark the new region */
scp->mouse_cut_start = from;
scp->mouse_cut_end = to;
mark_for_update(scp, from);
mark_for_update(scp, to);
splx(s);
}
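/*
 * Example of the space-to-tab folding above (only compiled in with
 * "options SC_CUT_SPACES2TABS"): on an 80-column screen a cut line
 * beginning with eight blanks reaches a tab stop at column 7, where
 * (p % 80) % 8 == 7 and leadspaces == 8, so the run of eight ' '
 * characters already copied collapses to a single '\t' in cut_buffer.
 */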
/* copy marked region to the cut buffer */
static void
mouse_cut(scr_stat *scp)
{
int start;
int end;
int from;
int to;
int c;
int p;
int s;
int i;
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
if (scp->mouse_pos >= start) {
from = start;
to = end = scp->mouse_pos;
} else {
from = end = scp->mouse_pos;
to = start - 1;
}
p = to;
for (i = p % scp->xsize; i < scp->xsize; ++i) {
c = sc_vtb_getc(&scp->vtb, p);
if (!IS_SPACE_CHAR(c))
break;
++p;
}
/* if there is nothing but blank chars, trim them, but mark towards eol */
if (i == scp->xsize) {
if (end >= start)
to = end = p - 1;
else
to = start = p;
}
mouse_do_cut(scp, from, to);
s = spltty();
scp->mouse_cut_start = start;
scp->mouse_cut_end = end;
splx(s);
}
/* a mouse button is pressed, start cut operation */
static void
mouse_cut_start(scr_stat *scp)
{
int i;
int s;
if (scp->status & MOUSE_VISIBLE) {
sc_remove_all_cutmarkings(scp->sc);
if ((scp->mouse_pos == scp->mouse_cut_start) &&
(scp->mouse_pos == scp->mouse_cut_end)) {
cut_buffer[0] = '\0';
return;
} else if (skip_spc_right(scp, scp->mouse_pos) >= scp->xsize) {
/* if the pointer is on trailing blank chars, mark towards eol */
i = skip_spc_left(scp, scp->mouse_pos) + 1;
s = spltty();
scp->mouse_cut_start =
(scp->mouse_pos / scp->xsize) * scp->xsize + i;
scp->mouse_cut_end =
(scp->mouse_pos / scp->xsize + 1) * scp->xsize - 1;
splx(s);
cut_buffer[0] = '\r';
} else {
s = spltty();
scp->mouse_cut_start = scp->mouse_pos;
scp->mouse_cut_end = scp->mouse_cut_start;
splx(s);
cut_buffer[0] = sc_vtb_getc(&scp->vtb, scp->mouse_cut_start);
}
cut_buffer[1] = '\0';
scp->status |= MOUSE_CUTTING;
mark_all(scp); /* this is probably overkill XXX */
}
}
/* end of cut operation */
static void
mouse_cut_end(scr_stat *scp)
{
if (scp->status & MOUSE_VISIBLE)
scp->status &= ~MOUSE_CUTTING;
}
/* copy a word under the mouse pointer */
static void
mouse_cut_word(scr_stat *scp)
{
int start;
int end;
int sol;
int eol;
int c;
int j;
int len;
/*
* Because we don't have locale information in the kernel,
 * we only distinguish between space and non-space characters.
 * Punctuation, symbols and other regular characters are all treated
 * alike unless the user specified SC_CUT_SEPCHARS in the kernel
 * config file.
*/
if (scp->status & MOUSE_VISIBLE) {
sol = (scp->mouse_pos / scp->xsize) * scp->xsize;
eol = sol + scp->xsize;
c = sc_vtb_getc(&scp->vtb, scp->mouse_pos);
if (IS_SEP_CHAR(c)) {
/* blank space */
for (j = scp->mouse_pos; j >= sol; --j) {
c = sc_vtb_getc(&scp->vtb, j);
if (!IS_SEP_CHAR(c))
break;
}
start = ++j;
for (j = scp->mouse_pos; j < eol; ++j) {
c = sc_vtb_getc(&scp->vtb, j);
if (!IS_SEP_CHAR(c))
break;
}
end = j - 1;
} else {
/* non-space word */
for (j = scp->mouse_pos; j >= sol; --j) {
c = sc_vtb_getc(&scp->vtb, j);
if (IS_SEP_CHAR(c))
break;
}
start = ++j;
for (j = scp->mouse_pos; j < eol; ++j) {
c = sc_vtb_getc(&scp->vtb, j);
if (IS_SEP_CHAR(c))
break;
}
end = j - 1;
}
/* copy the found word */
mouse_do_cut(scp, start, end);
len = strlen(cut_buffer);
if (cut_buffer[len - 1] == '\r')
cut_buffer[len - 1] = '\0';
}
}
/* copy a line under the mouse pointer */
static void
mouse_cut_line(scr_stat *scp)
{
int len;
int from;
if (scp->status & MOUSE_VISIBLE) {
from = (scp->mouse_pos / scp->xsize) * scp->xsize;
mouse_do_cut(scp, from, from + scp->xsize - 1);
len = strlen(cut_buffer);
if (cut_buffer[len - 1] == '\r')
cut_buffer[len - 1] = '\0';
scp->status |= MOUSE_CUTTING;
}
}
/* extend the marked region to the mouse pointer position */
static void
mouse_cut_extend(scr_stat *scp)
{
int start;
int end;
int s;
if ((scp->status & MOUSE_VISIBLE) && !(scp->status & MOUSE_CUTTING)
&& (scp->mouse_cut_end >= 0)) {
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
} else {
start = scp->mouse_cut_end;
end = scp->mouse_cut_start - 1;
}
s = spltty();
if (scp->mouse_pos > end) {
scp->mouse_cut_start = start;
scp->mouse_cut_end = end;
} else if (scp->mouse_pos < start) {
scp->mouse_cut_start = end + 1;
scp->mouse_cut_end = start;
} else {
if (scp->mouse_pos - start > end + 1 - scp->mouse_pos) {
scp->mouse_cut_start = start;
scp->mouse_cut_end = end;
} else {
scp->mouse_cut_start = end + 1;
scp->mouse_cut_end = start;
}
}
splx(s);
mouse_cut(scp);
scp->status |= MOUSE_CUTTING;
}
}
/* paste cut buffer contents into the current vty */
void
sc_mouse_paste(scr_stat *scp)
{
sc_paste(scp, cut_buffer, strlen(cut_buffer));
}
#endif /* SC_NO_CUTPASTE */
int
sc_mouse_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
{
mouse_info_t *mouse;
mouse_info_t buf;
scr_stat *cur_scp;
scr_stat *scp;
struct proc *p1;
int s;
int f;
scp = SC_STAT(tp);
switch (cmd) {
case CONS_MOUSECTL: /* control mouse arrow */
case OLD_CONS_MOUSECTL:
mouse = (mouse_info_t*)data;
random_harvest(mouse, sizeof(mouse_info_t), 2, 0, RANDOM_MOUSE);
if (cmd == OLD_CONS_MOUSECTL) {
static u_char swapb[] = { 0, 4, 2, 6, 1, 5, 3, 7 };
old_mouse_info_t *old_mouse = (old_mouse_info_t *)data;
mouse = &buf;
mouse->operation = old_mouse->operation;
switch (mouse->operation) {
case MOUSE_MODE:
mouse->u.mode = old_mouse->u.mode;
break;
case MOUSE_SHOW:
case MOUSE_HIDE:
break;
case MOUSE_MOVEABS:
case MOUSE_MOVEREL:
case MOUSE_ACTION:
mouse->u.data.x = old_mouse->u.data.x;
mouse->u.data.y = old_mouse->u.data.y;
mouse->u.data.z = 0;
mouse->u.data.buttons = swapb[old_mouse->u.data.buttons & 0x7];
break;
case MOUSE_GETINFO:
old_mouse->u.data.x = scp->mouse_xpos;
old_mouse->u.data.y = scp->mouse_ypos;
old_mouse->u.data.buttons = swapb[scp->mouse_buttons & 0x7];
return 0;
default:
return EINVAL;
}
}
cur_scp = scp->sc->cur_scp;
switch (mouse->operation) {
case MOUSE_MODE:
if (ISSIGVALID(mouse->u.mode.signal)) {
scp->mouse_signal = mouse->u.mode.signal;
scp->mouse_proc = td->td_proc;
scp->mouse_pid = td->td_proc->p_pid;
}
else {
scp->mouse_signal = 0;
scp->mouse_proc = NULL;
scp->mouse_pid = 0;
}
return 0;
case MOUSE_SHOW:
s = spltty();
if (!(scp->sc->flags & SC_MOUSE_ENABLED)) {
scp->sc->flags |= SC_MOUSE_ENABLED;
cur_scp->status &= ~MOUSE_HIDDEN;
if (!ISGRAPHSC(cur_scp))
mark_all(cur_scp);
}
splx(s);
return 0;
/* NOTREACHED */
case MOUSE_HIDE:
s = spltty();
if (scp->sc->flags & SC_MOUSE_ENABLED) {
scp->sc->flags &= ~SC_MOUSE_ENABLED;
sc_remove_all_mouse(scp->sc);
}
splx(s);
return 0;
/* NOTREACHED */
case MOUSE_MOVEABS:
s = spltty();
scp->mouse_xpos = mouse->u.data.x;
scp->mouse_ypos = mouse->u.data.y;
set_mouse_pos(scp);
splx(s);
break;
case MOUSE_MOVEREL:
s = spltty();
scp->mouse_xpos += mouse->u.data.x;
scp->mouse_ypos += mouse->u.data.y;
set_mouse_pos(scp);
splx(s);
break;
case MOUSE_GETINFO:
mouse->u.data.x = scp->mouse_xpos;
mouse->u.data.y = scp->mouse_ypos;
mouse->u.data.z = 0;
mouse->u.data.buttons = scp->mouse_buttons;
return 0;
case MOUSE_ACTION:
case MOUSE_MOTION_EVENT:
/* send out mouse event on /dev/sysmouse */
#if 0
/* this should maybe only be settable from /dev/consolectl SOS */
if (SC_VTY(tp->t_dev) != SC_CONSOLECTL)
return ENOTTY;
#endif
s = spltty();
if (mouse->u.data.x != 0 || mouse->u.data.y != 0) {
cur_scp->mouse_xpos += mouse->u.data.x;
cur_scp->mouse_ypos += mouse->u.data.y;
set_mouse_pos(cur_scp);
}
f = 0;
if (mouse->operation == MOUSE_ACTION) {
f = cur_scp->mouse_buttons ^ mouse->u.data.buttons;
cur_scp->mouse_buttons = mouse->u.data.buttons;
}
splx(s);
if (sysmouse_event(mouse) == 0)
return 0;
/*
* If any buttons are down or the mouse has moved a lot,
* stop the screen saver.
*/
if (((mouse->operation == MOUSE_ACTION) && mouse->u.data.buttons)
|| (mouse->u.data.x*mouse->u.data.x
+ mouse->u.data.y*mouse->u.data.y
>= SC_WAKEUP_DELTA*SC_WAKEUP_DELTA)) {
sc_touch_scrn_saver();
}
cur_scp->status &= ~MOUSE_HIDDEN;
if (cur_scp->mouse_level > 0) {
sc_mouse_input(scp, mouse);
break;
}
if (cur_scp->mouse_signal && cur_scp->mouse_proc) {
/* has controlling process died? */
if (cur_scp->mouse_proc != (p1 = pfind(cur_scp->mouse_pid))) {
cur_scp->mouse_signal = 0;
cur_scp->mouse_proc = NULL;
cur_scp->mouse_pid = 0;
if (p1)
PROC_UNLOCK(p1);
} else {
- psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
+ kern_psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
PROC_UNLOCK(cur_scp->mouse_proc);
break;
}
}
#ifndef SC_NO_CUTPASTE
if (ISGRAPHSC(cur_scp) || (cut_buffer == NULL))
break;
if ((mouse->operation == MOUSE_ACTION) && f) {
/* process button presses */
if (cur_scp->mouse_buttons & MOUSE_BUTTON1DOWN)
mouse_cut_start(cur_scp);
else
mouse_cut_end(cur_scp);
if (cur_scp->mouse_buttons & MOUSE_BUTTON2DOWN ||
cur_scp->mouse_buttons & MOUSE_BUTTON3DOWN)
sc_mouse_paste(cur_scp);
}
#endif /* SC_NO_CUTPASTE */
break;
case MOUSE_BUTTON_EVENT:
if ((mouse->u.event.id & MOUSE_BUTTONS) == 0)
return EINVAL;
if (mouse->u.event.value < 0)
return EINVAL;
#if 0
/* this should maybe only be settable from /dev/consolectl SOS */
if (SC_VTY(tp->t_dev) != SC_CONSOLECTL)
return ENOTTY;
#endif
if (mouse->u.event.value > 0)
cur_scp->mouse_buttons |= mouse->u.event.id;
else
cur_scp->mouse_buttons &= ~mouse->u.event.id;
if (sysmouse_event(mouse) == 0)
return 0;
/* if a button is held down, stop the screen saver */
if (mouse->u.event.value > 0)
sc_touch_scrn_saver();
cur_scp->status &= ~MOUSE_HIDDEN;
if (cur_scp->mouse_level > 0) {
sc_mouse_input(scp, mouse);
break;
}
if (cur_scp->mouse_signal && cur_scp->mouse_proc) {
if (cur_scp->mouse_proc != (p1 = pfind(cur_scp->mouse_pid))){
cur_scp->mouse_signal = 0;
cur_scp->mouse_proc = NULL;
cur_scp->mouse_pid = 0;
if (p1)
PROC_UNLOCK(p1);
} else {
- psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
+ kern_psignal(cur_scp->mouse_proc, cur_scp->mouse_signal);
PROC_UNLOCK(cur_scp->mouse_proc);
break;
}
}
#ifndef SC_NO_CUTPASTE
if (ISGRAPHSC(cur_scp) || (cut_buffer == NULL))
break;
switch (mouse->u.event.id) {
case MOUSE_BUTTON1DOWN:
switch (mouse->u.event.value % 4) {
case 0: /* up */
mouse_cut_end(cur_scp);
break;
case 1: /* single click: start cut operation */
mouse_cut_start(cur_scp);
break;
case 2: /* double click: cut a word */
mouse_cut_word(cur_scp);
mouse_cut_end(cur_scp);
break;
case 3: /* triple click: cut a line */
mouse_cut_line(cur_scp);
mouse_cut_end(cur_scp);
break;
}
break;
case SC_MOUSE_PASTEBUTTON:
switch (mouse->u.event.value) {
case 0: /* up */
break;
default:
sc_mouse_paste(cur_scp);
break;
}
break;
case SC_MOUSE_EXTENDBUTTON:
switch (mouse->u.event.value) {
case 0: /* up */
if (!(cur_scp->mouse_buttons & MOUSE_BUTTON1DOWN))
mouse_cut_end(cur_scp);
break;
default:
mouse_cut_extend(cur_scp);
break;
}
break;
}
#endif /* SC_NO_CUTPASTE */
break;
case MOUSE_MOUSECHAR:
if (mouse->u.mouse_char < 0) {
mouse->u.mouse_char = scp->sc->mouse_char;
} else {
if (mouse->u.mouse_char > UCHAR_MAX - 3)
return EINVAL;
s = spltty();
sc_remove_all_mouse(scp->sc);
#ifndef SC_NO_FONT_LOADING
if (ISTEXTSC(cur_scp) && (cur_scp->font != NULL))
sc_load_font(cur_scp, 0, cur_scp->font_size,
cur_scp->font_width,
cur_scp->font + cur_scp->font_size
* cur_scp->sc->mouse_char,
cur_scp->sc->mouse_char, 4);
#endif
scp->sc->mouse_char = mouse->u.mouse_char;
splx(s);
}
break;
default:
return EINVAL;
}
return 0;
}
return ENOIOCTL;
}
#endif /* SC_NO_SYSMOUSE */
Index: head/sys/dev/syscons/syscons.c
===================================================================
--- head/sys/dev/syscons/syscons.c (revision 225616)
+++ head/sys/dev/syscons/syscons.c (revision 225617)
@@ -1,3862 +1,3862 @@
/*-
* Copyright (c) 1992-1998 Søren Schmidt
* All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Sascha Wildner <saw@online.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer,
* without modification, immediately at the beginning of the file.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_syscons.h"
#include "opt_splash.h"
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/consio.h>
#include <sys/kdb.h>
#include <sys/eventhandler.h>
#include <sys/fbio.h>
#include <sys/kbio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/reboot.h>
#include <sys/serial.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/tty.h>
#include <sys/power.h>
#include <machine/clock.h>
#if defined(__sparc64__) || defined(__powerpc__)
#include <machine/sc_machdep.h>
#else
#include <machine/pc/display.h>
#endif
#if defined( __i386__) || defined(__amd64__)
#include <machine/psl.h>
#include <machine/frame.h>
#endif
#include <machine/stdarg.h>
#include <dev/kbd/kbdreg.h>
#include <dev/fb/fbreg.h>
#include <dev/fb/splashreg.h>
#include <dev/syscons/syscons.h>
#define COLD 0
#define WARM 1
#define DEFAULT_BLANKTIME (5*60) /* 5 minutes */
#define MAX_BLANKTIME (7*24*60*60) /* 7 days!? */
#define KEYCODE_BS 0x0e /* "<-- Backspace" key, XXX */
typedef struct default_attr {
int std_color; /* normal hardware color */
int rev_color; /* reverse hardware color */
} default_attr;
static default_attr user_default = {
SC_NORM_ATTR,
SC_NORM_REV_ATTR,
};
static int sc_console_unit = -1;
static int sc_saver_keyb_only = 1;
static scr_stat *sc_console;
static struct consdev *sc_consptr;
static scr_stat main_console;
static struct tty *main_devs[MAXCONS];
static char init_done = COLD;
static int shutdown_in_progress = FALSE;
static int suspend_in_progress = FALSE;
static char sc_malloc = FALSE;
static int saver_mode = CONS_NO_SAVER; /* LKM/user saver */
static int run_scrn_saver = FALSE; /* should run the saver? */
static int enable_bell = TRUE; /* enable beeper */
#ifndef SC_DISABLE_REBOOT
static int enable_reboot = TRUE; /* enable keyboard reboot */
#endif
#ifndef SC_DISABLE_KDBKEY
static int enable_kdbkey = TRUE; /* enable keyboard debug */
#endif
static long scrn_blank_time = 0; /* screen saver timeout value */
#ifdef DEV_SPLASH
static int scrn_blanked; /* # of blanked screen */
static int sticky_splash = FALSE;
static void none_saver(sc_softc_t *sc, int blank) { }
static void (*current_saver)(sc_softc_t *, int) = none_saver;
#endif
#ifdef SC_NO_SUSPEND_VTYSWITCH
static int sc_no_suspend_vtswitch = 1;
#else
static int sc_no_suspend_vtswitch = 0;
#endif
static int sc_susp_scr;
SYSCTL_NODE(_hw, OID_AUTO, syscons, CTLFLAG_RD, 0, "syscons");
SYSCTL_NODE(_hw_syscons, OID_AUTO, saver, CTLFLAG_RD, 0, "saver");
SYSCTL_INT(_hw_syscons_saver, OID_AUTO, keybonly, CTLFLAG_RW,
&sc_saver_keyb_only, 0, "screen saver interrupted by input only");
SYSCTL_INT(_hw_syscons, OID_AUTO, bell, CTLFLAG_RW, &enable_bell,
0, "enable bell");
#ifndef SC_DISABLE_REBOOT
SYSCTL_INT(_hw_syscons, OID_AUTO, kbd_reboot, CTLFLAG_RW|CTLFLAG_SECURE, &enable_reboot,
0, "enable keyboard reboot");
#endif
#ifndef SC_DISABLE_KDBKEY
SYSCTL_INT(_hw_syscons, OID_AUTO, kbd_debug, CTLFLAG_RW|CTLFLAG_SECURE, &enable_kdbkey,
0, "enable keyboard debug");
#endif
TUNABLE_INT("hw.syscons.sc_no_suspend_vtswitch", &sc_no_suspend_vtswitch);
SYSCTL_INT(_hw_syscons, OID_AUTO, sc_no_suspend_vtswitch, CTLFLAG_RW,
&sc_no_suspend_vtswitch, 0, "Disable VT switch before suspend.");
#if !defined(SC_NO_FONT_LOADING) && defined(SC_DFLT_FONT)
#include "font.h"
#endif
tsw_ioctl_t *sc_user_ioctl;
static bios_values_t bios_value;
static int enable_panic_key;
SYSCTL_INT(_machdep, OID_AUTO, enable_panic_key, CTLFLAG_RW, &enable_panic_key,
0, "Enable panic via keypress specified in kbdmap(5)");
#define SC_CONSOLECTL 255
#define VTY_WCHAN(sc, vty) (&SC_DEV(sc, vty))
static int debugger;
/* prototypes */
static int sc_allocate_keyboard(sc_softc_t *sc, int unit);
static int scvidprobe(int unit, int flags, int cons);
static int sckbdprobe(int unit, int flags, int cons);
static void scmeminit(void *arg);
static int scdevtounit(struct tty *tp);
static kbd_callback_func_t sckbdevent;
static void scinit(int unit, int flags);
static scr_stat *sc_get_stat(struct tty *tp);
static void scterm(int unit, int flags);
static void scshutdown(void *, int);
static void scsuspend(void *);
static void scresume(void *);
static u_int scgetc(sc_softc_t *sc, u_int flags);
#define SCGETC_CN 1
#define SCGETC_NONBLOCK 2
static void sccnupdate(scr_stat *scp);
static scr_stat *alloc_scp(sc_softc_t *sc, int vty);
static void init_scp(sc_softc_t *sc, int vty, scr_stat *scp);
static timeout_t scrn_timer;
static int and_region(int *s1, int *e1, int s2, int e2);
static void scrn_update(scr_stat *scp, int show_cursor);
#ifdef DEV_SPLASH
static int scsplash_callback(int event, void *arg);
static void scsplash_saver(sc_softc_t *sc, int show);
static int add_scrn_saver(void (*this_saver)(sc_softc_t *, int));
static int remove_scrn_saver(void (*this_saver)(sc_softc_t *, int));
static int set_scrn_saver_mode(scr_stat *scp, int mode, u_char *pal, int border);
static int restore_scrn_saver_mode(scr_stat *scp, int changemode);
static void stop_scrn_saver(sc_softc_t *sc, void (*saver)(sc_softc_t *, int));
static int wait_scrn_saver_stop(sc_softc_t *sc);
#define scsplash_stick(stick) (sticky_splash = (stick))
#else /* !DEV_SPLASH */
#define scsplash_stick(stick)
#endif /* DEV_SPLASH */
static int do_switch_scr(sc_softc_t *sc, int s);
static int vt_proc_alive(scr_stat *scp);
static int signal_vt_rel(scr_stat *scp);
static int signal_vt_acq(scr_stat *scp);
static int finish_vt_rel(scr_stat *scp, int release, int *s);
static int finish_vt_acq(scr_stat *scp);
static void exchange_scr(sc_softc_t *sc);
static void update_cursor_image(scr_stat *scp);
static void change_cursor_shape(scr_stat *scp, int flags, int base, int height);
static int save_kbd_state(scr_stat *scp);
static int update_kbd_state(scr_stat *scp, int state, int mask);
static int update_kbd_leds(scr_stat *scp, int which);
static timeout_t blink_screen;
static struct tty *sc_alloc_tty(int, int);
static cn_probe_t sc_cnprobe;
static cn_init_t sc_cninit;
static cn_term_t sc_cnterm;
static cn_getc_t sc_cngetc;
static cn_putc_t sc_cnputc;
CONSOLE_DRIVER(sc);
static tsw_open_t sctty_open;
static tsw_close_t sctty_close;
static tsw_outwakeup_t sctty_outwakeup;
static tsw_ioctl_t sctty_ioctl;
static tsw_mmap_t sctty_mmap;
static struct ttydevsw sc_ttydevsw = {
.tsw_open = sctty_open,
.tsw_close = sctty_close,
.tsw_outwakeup = sctty_outwakeup,
.tsw_ioctl = sctty_ioctl,
.tsw_mmap = sctty_mmap,
};
static d_ioctl_t consolectl_ioctl;
static struct cdevsw consolectl_devsw = {
.d_version = D_VERSION,
.d_flags = D_NEEDGIANT,
.d_ioctl = consolectl_ioctl,
.d_name = "consolectl",
};
int
sc_probe_unit(int unit, int flags)
{
if (!scvidprobe(unit, flags, FALSE)) {
if (bootverbose)
printf("%s%d: no video adapter found.\n", SC_DRIVER_NAME, unit);
return ENXIO;
}
/* syscons will be attached even when there is no keyboard */
sckbdprobe(unit, flags, FALSE);
return 0;
}
/* probe video adapters, return TRUE if found */
static int
scvidprobe(int unit, int flags, int cons)
{
/*
* Access the video adapter driver through the back door!
* Video adapter drivers need to be configured before syscons.
* However, when syscons is being probed as the low-level console,
* they have not been initialized yet. We force them to initialize
* themselves here. XXX
*/
vid_configure(cons ? VIO_PROBE_ONLY : 0);
return (vid_find_adapter("*", unit) >= 0);
}
/* probe the keyboard, return TRUE if found */
static int
sckbdprobe(int unit, int flags, int cons)
{
/* access the keyboard driver through the backdoor! */
kbd_configure(cons ? KB_CONF_PROBE_ONLY : 0);
return (kbd_find_keyboard("*", unit) >= 0);
}
static char
*adapter_name(video_adapter_t *adp)
{
static struct {
int type;
char *name[2];
} names[] = {
{ KD_MONO, { "MDA", "MDA" } },
{ KD_HERCULES, { "Hercules", "Hercules" } },
{ KD_CGA, { "CGA", "CGA" } },
{ KD_EGA, { "EGA", "EGA (mono)" } },
{ KD_VGA, { "VGA", "VGA (mono)" } },
{ KD_PC98, { "PC-98x1", "PC-98x1" } },
{ KD_TGA, { "TGA", "TGA" } },
{ -1, { "Unknown", "Unknown" } },
};
int i;
for (i = 0; names[i].type != -1; ++i)
if (names[i].type == adp->va_type)
break;
return names[i].name[(adp->va_flags & V_ADP_COLOR) ? 0 : 1];
}
static void
sctty_outwakeup(struct tty *tp)
{
size_t len;
u_char buf[PCBURST];
scr_stat *scp = sc_get_stat(tp);
if (scp->status & SLKED ||
(scp == scp->sc->cur_scp && scp->sc->blink_in_progress))
return;
for (;;) {
len = ttydisc_getc(tp, buf, sizeof buf);
if (len == 0)
break;
sc_puts(scp, buf, len, 0);
}
}
static struct tty *
sc_alloc_tty(int index, int devnum)
{
struct sc_ttysoftc *stc;
struct tty *tp;
/* Allocate TTY object and softc to store unit number. */
stc = malloc(sizeof(struct sc_ttysoftc), M_DEVBUF, M_WAITOK);
stc->st_index = index;
stc->st_stat = NULL;
tp = tty_alloc_mutex(&sc_ttydevsw, stc, &Giant);
/* Create device node. */
tty_makedev(tp, NULL, "v%r", devnum);
return (tp);
}
#ifdef SC_PIXEL_MODE
static void
sc_set_vesa_mode(scr_stat *scp, sc_softc_t *sc, int unit)
{
video_info_t info;
u_char *font;
int depth;
int fontsize;
int i;
int vmode;
vmode = 0;
(void)resource_int_value("sc", unit, "vesa_mode", &vmode);
if (vmode < M_VESA_BASE || vmode > M_VESA_MODE_MAX ||
vidd_get_info(sc->adp, vmode, &info) != 0 ||
!sc_support_pixel_mode(&info))
vmode = 0;
/*
* If the mode is unset or unsupported, search for an available
* 800x600 graphics mode with the highest color depth.
*/
if (vmode == 0) {
for (depth = 0, i = M_VESA_BASE; i <= M_VESA_MODE_MAX; i++)
if (vidd_get_info(sc->adp, i, &info) == 0 &&
info.vi_width == 800 && info.vi_height == 600 &&
sc_support_pixel_mode(&info) &&
info.vi_depth > depth) {
vmode = i;
depth = info.vi_depth;
}
if (vmode == 0)
return;
vidd_get_info(sc->adp, vmode, &info);
}
#if !defined(SC_NO_FONT_LOADING) && defined(SC_DFLT_FONT)
fontsize = info.vi_cheight;
#else
fontsize = scp->font_size;
#endif
if (fontsize < 14)
fontsize = 8;
else if (fontsize >= 16)
fontsize = 16;
else
fontsize = 14;
#ifndef SC_NO_FONT_LOADING
switch (fontsize) {
case 8:
if ((sc->fonts_loaded & FONT_8) == 0)
return;
font = sc->font_8;
break;
case 14:
if ((sc->fonts_loaded & FONT_14) == 0)
return;
font = sc->font_14;
break;
case 16:
if ((sc->fonts_loaded & FONT_16) == 0)
return;
font = sc->font_16;
break;
}
#else
font = NULL;
#endif
#ifdef DEV_SPLASH
if ((sc->flags & SC_SPLASH_SCRN) != 0)
splash_term(sc->adp);
#endif
#ifndef SC_NO_HISTORY
if (scp->history != NULL) {
sc_vtb_append(&scp->vtb, 0, scp->history,
scp->ypos * scp->xsize + scp->xpos);
scp->history_pos = sc_vtb_tail(scp->history);
}
#endif
vidd_set_mode(sc->adp, vmode);
scp->status |= (UNKNOWN_MODE | PIXEL_MODE | MOUSE_HIDDEN);
scp->status &= ~(GRAPHICS_MODE | MOUSE_VISIBLE);
scp->xpixel = info.vi_width;
scp->ypixel = info.vi_height;
scp->xsize = scp->xpixel / 8;
scp->ysize = scp->ypixel / fontsize;
scp->xpos = 0;
scp->ypos = scp->ysize - 1;
scp->xoff = scp->yoff = 0;
scp->font = font;
scp->font_size = fontsize;
scp->font_width = 8;
scp->start = scp->xsize * scp->ysize - 1;
scp->end = 0;
scp->cursor_pos = scp->cursor_oldpos = scp->xsize * scp->xsize;
scp->mode = sc->initial_mode = vmode;
#ifndef __sparc64__
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)sc->adp->va_window, FALSE);
#endif
sc_alloc_scr_buffer(scp, FALSE, FALSE);
sc_init_emulator(scp, NULL);
#ifndef SC_NO_CUTPASTE
sc_alloc_cut_buffer(scp, FALSE);
#endif
#ifndef SC_NO_HISTORY
sc_alloc_history_buffer(scp, 0, 0, FALSE);
#endif
sc_set_border(scp, scp->border);
sc_set_cursor_image(scp);
scp->status &= ~UNKNOWN_MODE;
#ifdef DEV_SPLASH
if ((sc->flags & SC_SPLASH_SCRN) != 0)
splash_init(sc->adp, scsplash_callback, sc);
#endif
}
#endif
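/*
 * Configuration sketch for the VESA mode selection above (the hint
 * name follows from the resource_int_value() call; the value is only
 * an example): a specific mode can be requested with a device hint
 * such as
 *     hint.sc.0.vesa_mode="0x118"
 * in device.hints(5) or loader.conf(5); if the hint is absent or the
 * mode is unusable, the search above falls back to the 800x600 mode
 * with the greatest color depth that the adapter reports.
 */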
int
sc_attach_unit(int unit, int flags)
{
sc_softc_t *sc;
scr_stat *scp;
struct cdev *dev;
int vc;
flags &= ~SC_KERNEL_CONSOLE;
if (sc_console_unit == unit) {
/*
* If this unit is being used as the system console, we need to
* adjust some variables and buffers before and after scinit().
*/
/* assert(sc_console != NULL) */
flags |= SC_KERNEL_CONSOLE;
scmeminit(NULL);
}
scinit(unit, flags);
sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE);
sc->config = flags;
scp = sc_get_stat(sc->dev[0]);
if (sc_console == NULL) /* sc_console_unit < 0 */
sc_console = scp;
#ifdef SC_PIXEL_MODE
if ((sc->config & SC_VESAMODE) != 0)
sc_set_vesa_mode(scp, sc, unit);
#endif /* SC_PIXEL_MODE */
/* initialize cursor */
if (!ISGRAPHSC(scp))
update_cursor_image(scp);
/* get screen update going */
scrn_timer(sc);
/* set up the keyboard */
(void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
update_kbd_state(scp, scp->status, LOCK_MASK);
printf("%s%d: %s <%d virtual consoles, flags=0x%x>\n",
SC_DRIVER_NAME, unit, adapter_name(sc->adp), sc->vtys, sc->config);
if (bootverbose) {
printf("%s%d:", SC_DRIVER_NAME, unit);
if (sc->adapter >= 0)
printf(" fb%d", sc->adapter);
if (sc->keyboard >= 0)
printf(", kbd%d", sc->keyboard);
if (scp->tsw)
printf(", terminal emulator: %s (%s)",
scp->tsw->te_name, scp->tsw->te_desc);
printf("\n");
}
/* Register suspend/resume/shutdown callbacks for the kernel console. */
if (sc_console_unit == unit) {
EVENTHANDLER_REGISTER(power_suspend, scsuspend, NULL,
EVENTHANDLER_PRI_ANY);
EVENTHANDLER_REGISTER(power_resume, scresume, NULL,
EVENTHANDLER_PRI_ANY);
EVENTHANDLER_REGISTER(shutdown_pre_sync, scshutdown, NULL,
SHUTDOWN_PRI_DEFAULT);
}
for (vc = 0; vc < sc->vtys; vc++) {
if (sc->dev[vc] == NULL) {
sc->dev[vc] = sc_alloc_tty(vc, vc + unit * MAXCONS);
if (vc == 0 && sc->dev == main_devs)
SC_STAT(sc->dev[0]) = &main_console;
}
/*
* The first vty already has struct tty and scr_stat initialized
* in scinit(). The other vtys will have these structs when
* first opened.
*/
}
dev = make_dev(&consolectl_devsw, 0, UID_ROOT, GID_WHEEL, 0600,
"consolectl");
dev->si_drv1 = sc->dev[0];
return 0;
}
static void
scmeminit(void *arg)
{
if (sc_malloc)
return;
sc_malloc = TRUE;
/*
* As soon as malloc() becomes functional, we had better allocate
* various buffers for the kernel console.
*/
if (sc_console_unit < 0) /* sc_console == NULL */
return;
/* copy the temporary buffer to the final buffer */
sc_alloc_scr_buffer(sc_console, FALSE, FALSE);
#ifndef SC_NO_CUTPASTE
sc_alloc_cut_buffer(sc_console, FALSE);
#endif
#ifndef SC_NO_HISTORY
/* initialize history buffer & pointers */
sc_alloc_history_buffer(sc_console, 0, 0, FALSE);
#endif
}
/* XXX */
SYSINIT(sc_mem, SI_SUB_KMEM, SI_ORDER_ANY, scmeminit, NULL);
static int
scdevtounit(struct tty *tp)
{
int vty = SC_VTY(tp);
if (vty == SC_CONSOLECTL)
return ((sc_console != NULL) ? sc_console->sc->unit : -1);
else if ((vty < 0) || (vty >= MAXCONS*sc_max_unit()))
return -1;
else
return vty/MAXCONS;
}
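/*
 * Mapping example for the function above, assuming the default
 * MAXCONS of 16: vtys 0-15 belong to unit 0, vtys 16-31 to unit 1,
 * and so on, so vty 18 yields 18 / 16 = unit 1; the special
 * SC_CONSOLECTL vty is instead mapped to the kernel console's unit.
 */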
static int
sctty_open(struct tty *tp)
{
int unit = scdevtounit(tp);
sc_softc_t *sc;
scr_stat *scp;
#ifndef __sparc64__
keyarg_t key;
#endif
DPRINTF(5, ("scopen: dev:%s, unit:%d, vty:%d\n",
devtoname(tp->t_dev), unit, SC_VTY(tp)));
sc = sc_get_softc(unit, (sc_console_unit == unit) ? SC_KERNEL_CONSOLE : 0);
if (sc == NULL)
return ENXIO;
if (!tty_opened(tp)) {
/* Use the current setting of the <-- key as default VERASE. */
/* If the Delete key is preferable, an stty is necessary */
#ifndef __sparc64__
if (sc->kbd != NULL) {
key.keynum = KEYCODE_BS;
(void)kbdd_ioctl(sc->kbd, GIO_KEYMAPENT, (caddr_t)&key);
tp->t_termios.c_cc[VERASE] = key.key.map[0];
}
#endif
}
scp = sc_get_stat(tp);
if (scp == NULL) {
scp = SC_STAT(tp) = alloc_scp(sc, SC_VTY(tp));
if (ISGRAPHSC(scp))
sc_set_pixel_mode(scp, NULL, 0, 0, 16, 8);
}
if (!tp->t_winsize.ws_col && !tp->t_winsize.ws_row) {
tp->t_winsize.ws_col = scp->xsize;
tp->t_winsize.ws_row = scp->ysize;
}
return (0);
}
static void
sctty_close(struct tty *tp)
{
scr_stat *scp;
int s;
if (SC_VTY(tp) != SC_CONSOLECTL) {
scp = sc_get_stat(tp);
/* were we in the middle of the VT switching process? */
DPRINTF(5, ("sc%d: scclose(), ", scp->sc->unit));
s = spltty();
if ((scp == scp->sc->cur_scp) && (scp->sc->unit == sc_console_unit))
cnavailable(sc_consptr, TRUE);
if (finish_vt_rel(scp, TRUE, &s) == 0) /* force release */
DPRINTF(5, ("reset WAIT_REL, "));
if (finish_vt_acq(scp) == 0) /* force acknowledge */
DPRINTF(5, ("reset WAIT_ACQ, "));
#ifdef not_yet_done
if (scp == &main_console) {
scp->pid = 0;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
}
else {
sc_vtb_destroy(&scp->vtb);
#ifndef __sparc64__
sc_vtb_destroy(&scp->scr);
#endif
sc_free_history_buffer(scp, scp->ysize);
SC_STAT(tp) = NULL;
free(scp, M_DEVBUF);
}
#else
scp->pid = 0;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
#endif
scp->kbd_mode = K_XLATE;
if (scp == scp->sc->cur_scp)
(void)kbdd_ioctl(scp->sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
DPRINTF(5, ("done.\n"));
}
}
#if 0 /* XXX mpsafetty: fix screensaver. What about outwakeup? */
static int
scread(struct cdev *dev, struct uio *uio, int flag)
{
if (!sc_saver_keyb_only)
sc_touch_scrn_saver();
return ttyread(dev, uio, flag);
}
#endif
static int
sckbdevent(keyboard_t *thiskbd, int event, void *arg)
{
sc_softc_t *sc;
struct tty *cur_tty;
int c, error = 0;
size_t len;
const u_char *cp;
sc = (sc_softc_t *)arg;
/* assert(thiskbd == sc->kbd) */
mtx_lock(&Giant);
switch (event) {
case KBDIO_KEYINPUT:
break;
case KBDIO_UNLOADING:
sc->kbd = NULL;
sc->keyboard = -1;
kbd_release(thiskbd, (void *)&sc->keyboard);
goto done;
default:
error = EINVAL;
goto done;
}
/*
* Loop while there is still input to get from the keyboard.
 * I don't think this is necessary, and it doesn't fix
* the Xaccel-2.1 keyboard hang, but it can't hurt. XXX
*/
while ((c = scgetc(sc, SCGETC_NONBLOCK)) != NOKEY) {
cur_tty = SC_DEV(sc, sc->cur_scp->index);
if (!tty_opened(cur_tty))
continue;
if ((*sc->cur_scp->tsw->te_input)(sc->cur_scp, c, cur_tty))
continue;
switch (KEYFLAGS(c)) {
case 0x0000: /* normal key */
ttydisc_rint(cur_tty, KEYCHAR(c), 0);
break;
case FKEY: /* function key, return string */
cp = (*sc->cur_scp->tsw->te_fkeystr)(sc->cur_scp, c);
if (cp != NULL) {
ttydisc_rint_simple(cur_tty, cp, strlen(cp));
break;
}
cp = kbdd_get_fkeystr(thiskbd, KEYCHAR(c), &len);
if (cp != NULL)
ttydisc_rint_simple(cur_tty, cp, len);
break;
case MKEY: /* meta is active, prepend ESC */
ttydisc_rint(cur_tty, 0x1b, 0);
ttydisc_rint(cur_tty, KEYCHAR(c), 0);
break;
case BKEY: /* backtab fixed sequence (esc [ Z) */
ttydisc_rint_simple(cur_tty, "\x1B[Z", 3);
break;
}
ttydisc_rint_done(cur_tty);
}
sc->cur_scp->status |= MOUSE_HIDDEN;
done:
mtx_unlock(&Giant);
return (error);
}
static int
sctty_ioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
{
int error;
int i;
sc_softc_t *sc;
scr_stat *scp;
int s;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
int ival;
#endif
/* If there is a user_ioctl function call that first */
if (sc_user_ioctl) {
error = (*sc_user_ioctl)(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
}
error = sc_vid_ioctl(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
#ifndef SC_NO_HISTORY
error = sc_hist_ioctl(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
#endif
#ifndef SC_NO_SYSMOUSE
error = sc_mouse_ioctl(tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
#endif
scp = sc_get_stat(tp);
/* assert(scp != NULL) */
/* scp is sc_console, if SC_VTY(dev) == SC_CONSOLECTL. */
sc = scp->sc;
if (scp->tsw) {
error = (*scp->tsw->te_ioctl)(scp, tp, cmd, data, td);
if (error != ENOIOCTL)
return error;
}
switch (cmd) { /* process console hardware related ioctl's */
case GIO_ATTR: /* get current attributes */
/* this ioctl is not processed here, but in the terminal emulator */
return ENOTTY;
case GIO_COLOR: /* is this a color console ? */
*(int *)data = (sc->adp->va_flags & V_ADP_COLOR) ? 1 : 0;
return 0;
case CONS_BLANKTIME: /* set screen saver timeout (0 = no saver) */
if (*(int *)data < 0 || *(int *)data > MAX_BLANKTIME)
return EINVAL;
s = spltty();
scrn_blank_time = *(int *)data;
run_scrn_saver = (scrn_blank_time != 0);
splx(s);
return 0;
case CONS_CURSORTYPE: /* set cursor type (obsolete) */
s = spltty();
*(int *)data &= CONS_CURSOR_ATTRS;
sc_change_cursor_shape(scp, *(int *)data, -1, -1);
splx(s);
return 0;
case CONS_GETCURSORSHAPE: /* get cursor shape (new interface) */
if (((int *)data)[0] & CONS_LOCAL_CURSOR) {
((int *)data)[0] = scp->curr_curs_attr.flags;
((int *)data)[1] = scp->curr_curs_attr.base;
((int *)data)[2] = scp->curr_curs_attr.height;
} else {
((int *)data)[0] = sc->curs_attr.flags;
((int *)data)[1] = sc->curs_attr.base;
((int *)data)[2] = sc->curs_attr.height;
}
return 0;
case CONS_SETCURSORSHAPE: /* set cursor shape (new interface) */
s = spltty();
sc_change_cursor_shape(scp, ((int *)data)[0],
((int *)data)[1], ((int *)data)[2]);
splx(s);
return 0;
case CONS_BELLTYPE: /* set bell type sound/visual */
if ((*(int *)data) & CONS_VISUAL_BELL)
sc->flags |= SC_VISUAL_BELL;
else
sc->flags &= ~SC_VISUAL_BELL;
if ((*(int *)data) & CONS_QUIET_BELL)
sc->flags |= SC_QUIET_BELL;
else
sc->flags &= ~SC_QUIET_BELL;
return 0;
case CONS_GETINFO: /* get current (virtual) console info */
{
vid_info_t *ptr = (vid_info_t*)data;
if (ptr->size == sizeof(struct vid_info)) {
ptr->m_num = sc->cur_scp->index;
ptr->font_size = scp->font_size;
ptr->mv_col = scp->xpos;
ptr->mv_row = scp->ypos;
ptr->mv_csz = scp->xsize;
ptr->mv_rsz = scp->ysize;
ptr->mv_hsz = (scp->history != NULL) ? scp->history->vtb_rows : 0;
/*
* The following fields are filled by the terminal emulator. XXX
*
* ptr->mv_norm.fore
* ptr->mv_norm.back
* ptr->mv_rev.fore
* ptr->mv_rev.back
*/
ptr->mv_grfc.fore = 0; /* not supported */
ptr->mv_grfc.back = 0; /* not supported */
ptr->mv_ovscan = scp->border;
if (scp == sc->cur_scp)
save_kbd_state(scp);
ptr->mk_keylock = scp->status & LOCK_MASK;
return 0;
}
return EINVAL;
}
case CONS_GETVERS: /* get version number */
*(int*)data = 0x200; /* version 2.0 */
return 0;
case CONS_IDLE: /* see if the screen has been idle */
/*
* When the screen is in the GRAPHICS_MODE or UNKNOWN_MODE,
* the user process may have been writing something on the
 * screen and syscons is not aware of it. Declare that the screen
 * is NOT idle if it is in one of these modes, with one exception:
 * if a screen saver is running in graphics mode on the current
 * screen, report the screen as idle.
*/
*(int *)data = (sc->flags & SC_SCRN_IDLE)
&& (!ISGRAPHSC(sc->cur_scp)
|| (sc->cur_scp->status & SAVER_RUNNING));
return 0;
case CONS_SAVERMODE: /* set saver mode */
switch(*(int *)data) {
case CONS_NO_SAVER:
case CONS_USR_SAVER:
/* if a LKM screen saver is running, stop it first. */
scsplash_stick(FALSE);
saver_mode = *(int *)data;
s = spltty();
#ifdef DEV_SPLASH
if ((error = wait_scrn_saver_stop(NULL))) {
splx(s);
return error;
}
#endif
run_scrn_saver = TRUE;
if (saver_mode == CONS_USR_SAVER)
scp->status |= SAVER_RUNNING;
else
scp->status &= ~SAVER_RUNNING;
scsplash_stick(TRUE);
splx(s);
break;
case CONS_LKM_SAVER:
s = spltty();
if ((saver_mode == CONS_USR_SAVER) && (scp->status & SAVER_RUNNING))
scp->status &= ~SAVER_RUNNING;
saver_mode = *(int *)data;
splx(s);
break;
default:
return EINVAL;
}
return 0;
case CONS_SAVERSTART: /* immediately start/stop the screen saver */
/*
* Note that this ioctl does not guarantee the screen saver
* actually starts or stops. It merely attempts to do so...
*/
s = spltty();
run_scrn_saver = (*(int *)data != 0);
if (run_scrn_saver)
sc->scrn_time_stamp -= scrn_blank_time;
splx(s);
return 0;
case CONS_SCRSHOT: /* get a screen shot */
{
int retval, hist_rsz;
size_t lsize, csize;
vm_offset_t frbp, hstp;
unsigned lnum;
scrshot_t *ptr = (scrshot_t *)data;
void *outp = ptr->buf;
if (ptr->x < 0 || ptr->y < 0 || ptr->xsize < 0 || ptr->ysize < 0)
return EINVAL;
s = spltty();
if (ISGRAPHSC(scp)) {
splx(s);
return EOPNOTSUPP;
}
hist_rsz = (scp->history != NULL) ? scp->history->vtb_rows : 0;
if (((u_int)ptr->x + ptr->xsize) > scp->xsize ||
((u_int)ptr->y + ptr->ysize) > (scp->ysize + hist_rsz)) {
splx(s);
return EINVAL;
}
lsize = scp->xsize * sizeof(u_int16_t);
csize = ptr->xsize * sizeof(u_int16_t);
/* Pointer to the last line of framebuffer */
frbp = scp->vtb.vtb_buffer + scp->ysize * lsize + ptr->x *
sizeof(u_int16_t);
/* Pointer to the last line of target buffer */
outp = (char *)outp + ptr->ysize * csize;
/* Pointer to the last line of history buffer */
if (scp->history != NULL)
hstp = scp->history->vtb_buffer + sc_vtb_tail(scp->history) *
sizeof(u_int16_t) + ptr->x * sizeof(u_int16_t);
else
hstp = 0;
retval = 0;
for (lnum = 0; lnum < (ptr->y + ptr->ysize); lnum++) {
if (lnum < scp->ysize) {
frbp -= lsize;
} else {
hstp -= lsize;
if (hstp < scp->history->vtb_buffer)
hstp += scp->history->vtb_rows * lsize;
frbp = hstp;
}
if (lnum < ptr->y)
continue;
outp = (char *)outp - csize;
retval = copyout((void *)frbp, outp, csize);
if (retval != 0)
break;
}
splx(s);
return retval;
}
case VT_SETMODE: /* set screen switcher mode */
{
struct vt_mode *mode;
struct proc *p1;
mode = (struct vt_mode *)data;
DPRINTF(5, ("%s%d: VT_SETMODE ", SC_DRIVER_NAME, sc->unit));
if (scp->smode.mode == VT_PROCESS) {
p1 = pfind(scp->pid);
if (scp->proc == p1 && scp->proc != td->td_proc) {
if (p1)
PROC_UNLOCK(p1);
DPRINTF(5, ("error EPERM\n"));
return EPERM;
}
if (p1)
PROC_UNLOCK(p1);
}
s = spltty();
if (mode->mode == VT_AUTO) {
scp->smode.mode = VT_AUTO;
scp->proc = NULL;
scp->pid = 0;
DPRINTF(5, ("VT_AUTO, "));
if ((scp == sc->cur_scp) && (sc->unit == sc_console_unit))
cnavailable(sc_consptr, TRUE);
/* were we in the middle of the vty switching process? */
if (finish_vt_rel(scp, TRUE, &s) == 0)
DPRINTF(5, ("reset WAIT_REL, "));
if (finish_vt_acq(scp) == 0)
DPRINTF(5, ("reset WAIT_ACQ, "));
} else {
if (!ISSIGVALID(mode->relsig) || !ISSIGVALID(mode->acqsig)
|| !ISSIGVALID(mode->frsig)) {
splx(s);
DPRINTF(5, ("error EINVAL\n"));
return EINVAL;
}
DPRINTF(5, ("VT_PROCESS %d, ", td->td_proc->p_pid));
bcopy(data, &scp->smode, sizeof(struct vt_mode));
scp->proc = td->td_proc;
scp->pid = scp->proc->p_pid;
if ((scp == sc->cur_scp) && (sc->unit == sc_console_unit))
cnavailable(sc_consptr, FALSE);
}
splx(s);
DPRINTF(5, ("\n"));
return 0;
}
case VT_GETMODE: /* get screen switcher mode */
bcopy(&scp->smode, data, sizeof(struct vt_mode));
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('v', 4):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case VT_RELDISP: /* screen switcher ioctl */
s = spltty();
/*
* This must be the current vty which is in the VT_PROCESS
* switching mode...
*/
if ((scp != sc->cur_scp) || (scp->smode.mode != VT_PROCESS)) {
splx(s);
return EINVAL;
}
/* ...and this process is controlling it. */
if (scp->proc != td->td_proc) {
splx(s);
return EPERM;
}
error = EINVAL;
switch(*(int *)data) {
case VT_FALSE: /* user refuses to release screen, abort */
if ((error = finish_vt_rel(scp, FALSE, &s)) == 0)
DPRINTF(5, ("%s%d: VT_FALSE\n", SC_DRIVER_NAME, sc->unit));
break;
case VT_TRUE: /* user has released screen, go on */
if ((error = finish_vt_rel(scp, TRUE, &s)) == 0)
DPRINTF(5, ("%s%d: VT_TRUE\n", SC_DRIVER_NAME, sc->unit));
break;
case VT_ACKACQ: /* acquire acknowledged, switch completed */
if ((error = finish_vt_acq(scp)) == 0)
DPRINTF(5, ("%s%d: VT_ACKACQ\n", SC_DRIVER_NAME, sc->unit));
break;
default:
break;
}
splx(s);
return error;
case VT_OPENQRY: /* return free virtual console */
for (i = sc->first_vty; i < sc->first_vty + sc->vtys; i++) {
tp = SC_DEV(sc, i);
if (!tty_opened(tp)) {
*(int *)data = i + 1;
return 0;
}
}
return EINVAL;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('v', 5):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case VT_ACTIVATE: /* switch to screen *data */
i = (*(int *)data == 0) ? scp->index : (*(int *)data - 1);
s = spltty();
error = sc_clean_up(sc->cur_scp);
splx(s);
if (error)
return error;
error = sc_switch_scr(sc, i);
return (error);
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('v', 6):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case VT_WAITACTIVE: /* wait for switch to occur */
i = (*(int *)data == 0) ? scp->index : (*(int *)data - 1);
if ((i < sc->first_vty) || (i >= sc->first_vty + sc->vtys))
return EINVAL;
if (i == sc->cur_scp->index)
return 0;
error = tsleep(VTY_WCHAN(sc, i), (PZERO + 1) | PCATCH, "waitvt", 0);
return error;
case VT_GETACTIVE: /* get active vty # */
*(int *)data = sc->cur_scp->index + 1;
return 0;
case VT_GETINDEX: /* get this vty # */
*(int *)data = scp->index + 1;
return 0;
case VT_LOCKSWITCH: /* prevent vty switching */
if ((*(int *)data) & 0x01)
sc->flags |= SC_SCRN_VTYLOCK;
else
sc->flags &= ~SC_SCRN_VTYLOCK;
return 0;
case KDENABIO: /* allow io operations */
error = priv_check(td, PRIV_IO);
if (error != 0)
return error;
error = securelevel_gt(td->td_ucred, 0);
if (error != 0)
return error;
#ifdef __i386__
td->td_frame->tf_eflags |= PSL_IOPL;
#elif defined(__amd64__)
td->td_frame->tf_rflags |= PSL_IOPL;
#endif
return 0;
case KDDISABIO: /* disallow io operations (default) */
#ifdef __i386__
td->td_frame->tf_eflags &= ~PSL_IOPL;
#elif defined(__amd64__)
td->td_frame->tf_rflags &= ~PSL_IOPL;
#endif
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 20):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSKBSTATE: /* set keyboard state (locks) */
if (*(int *)data & ~LOCK_MASK)
return EINVAL;
scp->status &= ~LOCK_MASK;
scp->status |= *(int *)data;
if (scp == sc->cur_scp)
update_kbd_state(scp, scp->status, LOCK_MASK);
return 0;
case KDGKBSTATE: /* get keyboard state (locks) */
if (scp == sc->cur_scp)
save_kbd_state(scp);
*(int *)data = scp->status & LOCK_MASK;
return 0;
case KDGETREPEAT: /* get keyboard repeat & delay rates */
case KDSETREPEAT: /* set keyboard repeat & delay rates (new) */
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 67):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSETRAD: /* set keyboard repeat & delay rates (old) */
if (*(int *)data & ~0x7f)
return EINVAL;
error = kbdd_ioctl(sc->kbd, KDSETRAD, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 7):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSKBMODE: /* set keyboard mode */
switch (*(int *)data) {
case K_XLATE: /* switch to XLT ascii mode */
case K_RAW: /* switch to RAW scancode mode */
case K_CODE: /* switch to CODE mode */
scp->kbd_mode = *(int *)data;
if (scp == sc->cur_scp)
(void)kbdd_ioctl(sc->kbd, KDSKBMODE, data);
return 0;
default:
return EINVAL;
}
/* NOT REACHED */
case KDGKBMODE: /* get keyboard mode */
*(int *)data = scp->kbd_mode;
return 0;
case KDGKBINFO:
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 8):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDMKTONE: /* sound the bell */
if (*(int*)data)
sc_bell(scp, (*(int*)data)&0xffff,
(((*(int*)data)>>16)&0xffff)*hz/1000);
else
sc_bell(scp, scp->bell_pitch, scp->bell_duration);
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 63):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KIOCSOUND: /* make tone (*data) hz */
if (scp == sc->cur_scp) {
if (*(int *)data)
return sc_tone(*(int *)data);
else
return sc_tone(0);
}
return 0;
case KDGKBTYPE: /* get keyboard type */
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL) {
/* always return something? XXX */
*(int *)data = 0;
}
return 0;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('K', 66):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case KDSETLED: /* set keyboard LED status */
if (*(int *)data & ~LED_MASK) /* FIXME: LOCK_MASK? */
return EINVAL;
scp->status &= ~LED_MASK;
scp->status |= *(int *)data;
if (scp == sc->cur_scp)
update_kbd_leds(scp, scp->status);
return 0;
case KDGETLED: /* get keyboard LED status */
if (scp == sc->cur_scp)
save_kbd_state(scp);
*(int *)data = scp->status & LED_MASK;
return 0;
case KBADDKBD: /* add/remove keyboard to/from mux */
case KBRELKBD:
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IO('c', 110):
ival = IOCPARM_IVAL(data);
data = (caddr_t)&ival;
/* FALLTHROUGH */
#endif
case CONS_SETKBD: /* set the new keyboard */
{
keyboard_t *newkbd;
s = spltty();
newkbd = kbd_get_keyboard(*(int *)data);
if (newkbd == NULL) {
splx(s);
return EINVAL;
}
error = 0;
if (sc->kbd != newkbd) {
i = kbd_allocate(newkbd->kb_name, newkbd->kb_unit,
(void *)&sc->keyboard, sckbdevent, sc);
/* i == newkbd->kb_index */
if (i >= 0) {
if (sc->kbd != NULL) {
save_kbd_state(sc->cur_scp);
kbd_release(sc->kbd, (void *)&sc->keyboard);
}
sc->kbd = kbd_get_keyboard(i); /* sc->kbd == newkbd */
sc->keyboard = i;
(void)kbdd_ioctl(sc->kbd, KDSKBMODE,
(caddr_t)&sc->cur_scp->kbd_mode);
update_kbd_state(sc->cur_scp, sc->cur_scp->status,
LOCK_MASK);
} else {
error = EPERM; /* XXX */
}
}
splx(s);
return error;
}
case CONS_RELKBD: /* release the current keyboard */
s = spltty();
error = 0;
if (sc->kbd != NULL) {
save_kbd_state(sc->cur_scp);
error = kbd_release(sc->kbd, (void *)&sc->keyboard);
if (error == 0) {
sc->kbd = NULL;
sc->keyboard = -1;
}
}
splx(s);
return error;
case CONS_GETTERM: /* get the current terminal emulator info */
{
sc_term_sw_t *sw;
if (((term_info_t *)data)->ti_index == 0) {
sw = scp->tsw;
} else {
sw = sc_term_match_by_number(((term_info_t *)data)->ti_index);
}
if (sw != NULL) {
strncpy(((term_info_t *)data)->ti_name, sw->te_name,
sizeof(((term_info_t *)data)->ti_name));
strncpy(((term_info_t *)data)->ti_desc, sw->te_desc,
sizeof(((term_info_t *)data)->ti_desc));
((term_info_t *)data)->ti_flags = 0;
return 0;
} else {
((term_info_t *)data)->ti_name[0] = '\0';
((term_info_t *)data)->ti_desc[0] = '\0';
((term_info_t *)data)->ti_flags = 0;
return EINVAL;
}
}
case CONS_SETTERM: /* set the current terminal emulator */
s = spltty();
error = sc_init_emulator(scp, ((term_info_t *)data)->ti_name);
/* FIXME: what if scp == sc_console! XXX */
splx(s);
return error;
case GIO_SCRNMAP: /* get output translation table */
bcopy(&sc->scr_map, data, sizeof(sc->scr_map));
return 0;
case PIO_SCRNMAP: /* set output translation table */
bcopy(data, &sc->scr_map, sizeof(sc->scr_map));
for (i=0; i<sizeof(sc->scr_map); i++) {
sc->scr_rmap[sc->scr_map[i]] = i;
}
return 0;
case GIO_KEYMAP: /* get keyboard translation table */
case PIO_KEYMAP: /* set keyboard translation table */
case OGIO_KEYMAP: /* get keyboard translation table (compat) */
case OPIO_KEYMAP: /* set keyboard translation table (compat) */
case GIO_DEADKEYMAP: /* get accent key translation table */
case PIO_DEADKEYMAP: /* set accent key translation table */
case GETFKEY: /* get function key string */
case SETFKEY: /* set function key string */
error = kbdd_ioctl(sc->kbd, cmd, data);
if (error == ENOIOCTL)
error = ENODEV;
return error;
#ifndef SC_NO_FONT_LOADING
case PIO_FONT8x8: /* set 8x8 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
bcopy(data, sc->font_8, 8*256);
sc->fonts_loaded |= FONT_8;
/*
* FONT KLUDGE
* Always use the font page #0. XXX
* Don't load if the current font size is not 8x8.
*/
if (ISTEXTSC(sc->cur_scp) && (sc->cur_scp->font_size < 14))
sc_load_font(sc->cur_scp, 0, 8, 8, sc->font_8, 0, 256);
return 0;
case GIO_FONT8x8: /* get 8x8 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
if (sc->fonts_loaded & FONT_8) {
bcopy(sc->font_8, data, 8*256);
return 0;
}
else
return ENXIO;
case PIO_FONT8x14: /* set 8x14 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
bcopy(data, sc->font_14, 14*256);
sc->fonts_loaded |= FONT_14;
/*
* FONT KLUDGE
* Always use the font page #0. XXX
* Don't load if the current font size is not 8x14.
*/
if (ISTEXTSC(sc->cur_scp)
&& (sc->cur_scp->font_size >= 14)
&& (sc->cur_scp->font_size < 16))
sc_load_font(sc->cur_scp, 0, 14, 8, sc->font_14, 0, 256);
return 0;
case GIO_FONT8x14: /* get 8x14 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
if (sc->fonts_loaded & FONT_14) {
bcopy(sc->font_14, data, 14*256);
return 0;
}
else
return ENXIO;
case PIO_FONT8x16: /* set 8x16 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
bcopy(data, sc->font_16, 16*256);
sc->fonts_loaded |= FONT_16;
/*
* FONT KLUDGE
* Always use the font page #0. XXX
* Don't load if the current font size is not 8x16.
*/
if (ISTEXTSC(sc->cur_scp) && (sc->cur_scp->font_size >= 16))
sc_load_font(sc->cur_scp, 0, 16, 8, sc->font_16, 0, 256);
return 0;
case GIO_FONT8x16: /* get 8x16 dot font */
if (!ISFONTAVAIL(sc->adp->va_flags))
return ENXIO;
if (sc->fonts_loaded & FONT_16) {
bcopy(sc->font_16, data, 16*256);
return 0;
}
else
return ENXIO;
#endif /* SC_NO_FONT_LOADING */
default:
break;
}
return (ENOIOCTL);
}
static int
consolectl_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
return sctty_ioctl(dev->si_drv1, cmd, data, td);
}
static void
sc_cnprobe(struct consdev *cp)
{
int unit;
int flags;
cp->cn_pri = sc_get_cons_priority(&unit, &flags);
/* a video card is always required */
if (!scvidprobe(unit, flags, TRUE))
cp->cn_pri = CN_DEAD;
/* syscons will become console even when there is no keyboard */
sckbdprobe(unit, flags, TRUE);
if (cp->cn_pri == CN_DEAD)
return;
/* initialize required fields */
strcpy(cp->cn_name, "ttyv0");
}
static void
sc_cninit(struct consdev *cp)
{
int unit;
int flags;
sc_get_cons_priority(&unit, &flags);
scinit(unit, flags | SC_KERNEL_CONSOLE);
sc_console_unit = unit;
sc_console = sc_get_stat(sc_get_softc(unit, SC_KERNEL_CONSOLE)->dev[0]);
sc_consptr = cp;
}
static void
sc_cnterm(struct consdev *cp)
{
/* we are not the kernel console any more, release everything */
if (sc_console_unit < 0)
return; /* shouldn't happen */
#if 0 /* XXX */
sc_clear_screen(sc_console);
sccnupdate(sc_console);
#endif
scterm(sc_console_unit, SC_KERNEL_CONSOLE);
sc_console_unit = -1;
sc_console = NULL;
}
static void
sc_cnputc(struct consdev *cd, int c)
{
u_char buf[1];
scr_stat *scp = sc_console;
#ifndef SC_NO_HISTORY
#if 0
struct tty *tp;
#endif
#endif /* !SC_NO_HISTORY */
int s;
/* assert(sc_console != NULL) */
#ifndef SC_NO_HISTORY
if (scp == scp->sc->cur_scp && scp->status & SLKED) {
scp->status &= ~SLKED;
update_kbd_state(scp, scp->status, SLKED);
if (scp->status & BUFFER_SAVED) {
if (!sc_hist_restore(scp))
sc_remove_cutmarking(scp);
scp->status &= ~BUFFER_SAVED;
scp->status |= CURSOR_ENABLED;
sc_draw_cursor_image(scp);
}
#if 0
/*
* XXX: Now that TTY's have their own locks, we cannot process
* any data after disabling scroll lock. cnputs already holds a
* spinlock.
*/
tp = SC_DEV(scp->sc, scp->index);
tty_lock(tp);
if (tty_opened(tp))
sctty_outwakeup(tp);
tty_unlock(tp);
#endif
}
#endif /* !SC_NO_HISTORY */
buf[0] = c;
sc_puts(scp, buf, 1, 1);
s = spltty(); /* block sckbdevent and scrn_timer */
sccnupdate(scp);
splx(s);
}
static int
sc_cngetc(struct consdev *cd)
{
static struct fkeytab fkey;
static int fkeycp;
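/*
 * The static fkey/fkeycp pair buffers a multi-byte function key string
 * across successive sc_cngetc() calls: the first byte is returned right
 * away and the remainder is drained one byte per call before the
 * keyboard is polled again.
 */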
scr_stat *scp;
const u_char *p;
int cur_mode;
int s = spltty(); /* block sckbdevent and scrn_timer while we poll */
int c;
/* assert(sc_console != NULL) */
/*
* Stop the screen saver and update the screen if necessary.
* What if we have been running in the screen saver code... XXX
*/
sc_touch_scrn_saver();
scp = sc_console->sc->cur_scp; /* XXX */
sccnupdate(scp);
if (fkeycp < fkey.len) {
splx(s);
return fkey.str[fkeycp++];
}
if (scp->sc->kbd == NULL) {
splx(s);
return -1;
}
/*
* Make sure the keyboard is accessible even when the kbd device
* driver is disabled.
*/
kbdd_enable(scp->sc->kbd);
/* we shall always use the keyboard in the XLATE mode here */
cur_mode = scp->kbd_mode;
scp->kbd_mode = K_XLATE;
(void)kbdd_ioctl(scp->sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
kbdd_poll(scp->sc->kbd, TRUE);
c = scgetc(scp->sc, SCGETC_CN | SCGETC_NONBLOCK);
kbdd_poll(scp->sc->kbd, FALSE);
scp->kbd_mode = cur_mode;
(void)kbdd_ioctl(scp->sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
kbdd_disable(scp->sc->kbd);
splx(s);
switch (KEYFLAGS(c)) {
case 0: /* normal char */
return KEYCHAR(c);
case FKEY: /* function key */
p = (*scp->tsw->te_fkeystr)(scp, c);
if (p != NULL) {
fkey.len = strlen(p);
bcopy(p, fkey.str, fkey.len);
fkeycp = 1;
return fkey.str[0];
}
p = kbdd_get_fkeystr(scp->sc->kbd, KEYCHAR(c), (size_t *)&fkeycp);
fkey.len = fkeycp;
if ((p != NULL) && (fkey.len > 0)) {
bcopy(p, fkey.str, fkey.len);
fkeycp = 1;
return fkey.str[0];
}
return c; /* XXX */
case NOKEY:
case ERRKEY:
default:
return -1;
}
/* NOT REACHED */
}
static void
sccnupdate(scr_stat *scp)
{
/* this is a cut-down version of scrn_timer()... */
if (suspend_in_progress || scp->sc->font_loading_in_progress)
return;
if (debugger > 0 || panicstr || shutdown_in_progress) {
sc_touch_scrn_saver();
} else if (scp != scp->sc->cur_scp) {
return;
}
if (!run_scrn_saver)
scp->sc->flags &= ~SC_SCRN_IDLE;
#ifdef DEV_SPLASH
if ((saver_mode != CONS_LKM_SAVER) || !(scp->sc->flags & SC_SCRN_IDLE))
if (scp->sc->flags & SC_SCRN_BLANKED)
stop_scrn_saver(scp->sc, current_saver);
#endif
if (scp != scp->sc->cur_scp || scp->sc->blink_in_progress
|| scp->sc->switch_in_progress)
return;
/*
* FIXME: unlike scrn_timer(), we call scrn_update() from here even
* when write_in_progress is non-zero. XXX
*/
if (!ISGRAPHSC(scp) && !(scp->sc->flags & SC_SCRN_BLANKED))
scrn_update(scp, TRUE);
}
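/*
 * scrn_timer() is the periodic housekeeping callout: it re-arms itself
 * at hz/25 (hz/10 while the console is busy), optionally auto-attaches
 * a keyboard, maintains the screen saver idle state and flushes the
 * current vty via scrn_update().
 */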
static void
scrn_timer(void *arg)
{
#ifndef PC98
static int kbd_interval = 0;
#endif
struct timeval tv;
sc_softc_t *sc;
scr_stat *scp;
int again;
int s;
again = (arg != NULL);
if (arg != NULL)
sc = (sc_softc_t *)arg;
else if (sc_console != NULL)
sc = sc_console->sc;
else
return;
/* don't do anything when we are performing some I/O operations */
if (suspend_in_progress || sc->font_loading_in_progress) {
if (again)
timeout(scrn_timer, sc, hz / 10);
return;
}
s = spltty();
#ifndef PC98
if ((sc->kbd == NULL) && (sc->config & SC_AUTODETECT_KBD)) {
/* try to allocate a keyboard automatically */
if (++kbd_interval >= 25) {
sc->keyboard = sc_allocate_keyboard(sc, -1);
if (sc->keyboard >= 0) {
sc->kbd = kbd_get_keyboard(sc->keyboard);
(void)kbdd_ioctl(sc->kbd, KDSKBMODE,
(caddr_t)&sc->cur_scp->kbd_mode);
update_kbd_state(sc->cur_scp, sc->cur_scp->status,
LOCK_MASK);
}
kbd_interval = 0;
}
}
#endif /* PC98 */
/* find the vty to update */
scp = sc->cur_scp;
/* should we stop the screen saver? */
getmicrouptime(&tv);
if (debugger > 0 || panicstr || shutdown_in_progress)
sc_touch_scrn_saver();
if (run_scrn_saver) {
if (tv.tv_sec > sc->scrn_time_stamp + scrn_blank_time)
sc->flags |= SC_SCRN_IDLE;
else
sc->flags &= ~SC_SCRN_IDLE;
} else {
sc->scrn_time_stamp = tv.tv_sec;
sc->flags &= ~SC_SCRN_IDLE;
if (scrn_blank_time > 0)
run_scrn_saver = TRUE;
}
#ifdef DEV_SPLASH
if ((saver_mode != CONS_LKM_SAVER) || !(sc->flags & SC_SCRN_IDLE))
if (sc->flags & SC_SCRN_BLANKED)
stop_scrn_saver(sc, current_saver);
#endif
/* should we just return ? */
if (sc->blink_in_progress || sc->switch_in_progress
|| sc->write_in_progress) {
if (again)
timeout(scrn_timer, sc, hz / 10);
splx(s);
return;
}
/* Update the screen */
scp = sc->cur_scp; /* cur_scp may have changed... */
if (!ISGRAPHSC(scp) && !(sc->flags & SC_SCRN_BLANKED))
scrn_update(scp, TRUE);
#ifdef DEV_SPLASH
/* should we activate the screen saver? */
if ((saver_mode == CONS_LKM_SAVER) && (sc->flags & SC_SCRN_IDLE))
if (!ISGRAPHSC(scp) || (sc->flags & SC_SCRN_BLANKED))
(*current_saver)(sc, TRUE);
#endif
if (again)
timeout(scrn_timer, sc, hz / 25);
splx(s);
}
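/*
 * and_region() intersects the region [*s1, *e1] with [s2, e2] in place:
 * it returns FALSE when the two do not overlap, otherwise it clips
 * *s1/*e1 to the overlap and returns TRUE.  For example, intersecting
 * [10, 80] with [50, 200] leaves [50, 80].
 */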
static int
and_region(int *s1, int *e1, int s2, int e2)
{
if (*e1 < s2 || e2 < *s1)
return FALSE;
*s1 = imax(*s1, s2);
*e1 = imin(*e1, e2);
return TRUE;
}
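/*
 * scrn_update() flushes the dirty region [scp->start, scp->end] of the
 * current vty to the display, redrawing the cut marking, the cursor and
 * the pseudo mouse pointer as needed.  On return the dirty region is
 * reset to the empty state (start > end), so later output only repaints
 * what has changed since.
 */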
static void
scrn_update(scr_stat *scp, int show_cursor)
{
int start;
int end;
int s;
int e;
/* assert(scp == scp->sc->cur_scp) */
SC_VIDEO_LOCK(scp->sc);
#ifndef SC_NO_CUTPASTE
/* remove the previous mouse pointer image if necessary */
if (scp->status & MOUSE_VISIBLE) {
s = scp->mouse_pos;
e = scp->mouse_pos + scp->xsize + 1;
if ((scp->status & (MOUSE_MOVED | MOUSE_HIDDEN))
|| and_region(&s, &e, scp->start, scp->end)
|| ((scp->status & CURSOR_ENABLED) &&
(scp->cursor_pos != scp->cursor_oldpos) &&
(and_region(&s, &e, scp->cursor_pos, scp->cursor_pos)
|| and_region(&s, &e, scp->cursor_oldpos, scp->cursor_oldpos)))) {
sc_remove_mouse_image(scp);
if (scp->end >= scp->xsize*scp->ysize)
scp->end = scp->xsize*scp->ysize - 1;
}
}
#endif /* !SC_NO_CUTPASTE */
#if 1
/* debug: XXX */
if (scp->end >= scp->xsize*scp->ysize) {
printf("scrn_update(): scp->end %d > size_of_screen!!\n", scp->end);
scp->end = scp->xsize*scp->ysize - 1;
}
if (scp->start < 0) {
printf("scrn_update(): scp->start %d < 0\n", scp->start);
scp->start = 0;
}
#endif
/* update screen image */
if (scp->start <= scp->end) {
if (scp->mouse_cut_end >= 0) {
/* there is a marked region for cut & paste */
if (scp->mouse_cut_start <= scp->mouse_cut_end) {
start = scp->mouse_cut_start;
end = scp->mouse_cut_end;
} else {
start = scp->mouse_cut_end;
end = scp->mouse_cut_start - 1;
}
s = start;
e = end;
/* does the cut-mark region overlap with the update region? */
if (and_region(&s, &e, scp->start, scp->end)) {
(*scp->rndr->draw)(scp, s, e - s + 1, TRUE);
s = 0;
e = start - 1;
if (and_region(&s, &e, scp->start, scp->end))
(*scp->rndr->draw)(scp, s, e - s + 1, FALSE);
s = end + 1;
e = scp->xsize*scp->ysize - 1;
if (and_region(&s, &e, scp->start, scp->end))
(*scp->rndr->draw)(scp, s, e - s + 1, FALSE);
} else {
(*scp->rndr->draw)(scp, scp->start,
scp->end - scp->start + 1, FALSE);
}
} else {
(*scp->rndr->draw)(scp, scp->start,
scp->end - scp->start + 1, FALSE);
}
}
/* we are not to show the cursor and the mouse pointer... */
if (!show_cursor) {
scp->end = 0;
scp->start = scp->xsize*scp->ysize - 1;
SC_VIDEO_UNLOCK(scp->sc);
return;
}
/* update cursor image */
if (scp->status & CURSOR_ENABLED) {
s = scp->start;
e = scp->end;
/* did cursor move since last time ? */
if (scp->cursor_pos != scp->cursor_oldpos) {
/* do we need to remove old cursor image ? */
if (!and_region(&s, &e, scp->cursor_oldpos, scp->cursor_oldpos))
sc_remove_cursor_image(scp);
sc_draw_cursor_image(scp);
} else {
if (and_region(&s, &e, scp->cursor_pos, scp->cursor_pos))
/* cursor didn't move, but has been overwritten */
sc_draw_cursor_image(scp);
else if (scp->curs_attr.flags & CONS_BLINK_CURSOR)
/* if it's a blinking cursor, update it */
(*scp->rndr->blink_cursor)(scp, scp->cursor_pos,
sc_inside_cutmark(scp,
scp->cursor_pos));
}
}
#ifndef SC_NO_CUTPASTE
/* update "pseudo" mouse pointer image */
if (scp->sc->flags & SC_MOUSE_ENABLED) {
if (!(scp->status & (MOUSE_VISIBLE | MOUSE_HIDDEN))) {
scp->status &= ~MOUSE_MOVED;
sc_draw_mouse_image(scp);
}
}
#endif /* SC_NO_CUTPASTE */
scp->end = 0;
scp->start = scp->xsize*scp->ysize - 1;
SC_VIDEO_UNLOCK(scp->sc);
}
#ifdef DEV_SPLASH
static int
scsplash_callback(int event, void *arg)
{
sc_softc_t *sc;
int error;
sc = (sc_softc_t *)arg;
switch (event) {
case SPLASH_INIT:
if (add_scrn_saver(scsplash_saver) == 0) {
sc->flags &= ~SC_SAVER_FAILED;
run_scrn_saver = TRUE;
if (cold && !(boothowto & RB_VERBOSE)) {
scsplash_stick(TRUE);
(*current_saver)(sc, TRUE);
}
}
return 0;
case SPLASH_TERM:
if (current_saver == scsplash_saver) {
scsplash_stick(FALSE);
error = remove_scrn_saver(scsplash_saver);
if (error)
return error;
}
return 0;
default:
return EINVAL;
}
}
static void
scsplash_saver(sc_softc_t *sc, int show)
{
static int busy = FALSE;
scr_stat *scp;
if (busy)
return;
busy = TRUE;
scp = sc->cur_scp;
if (show) {
if (!(sc->flags & SC_SAVER_FAILED)) {
if (!(sc->flags & SC_SCRN_BLANKED))
set_scrn_saver_mode(scp, -1, NULL, 0);
switch (splash(sc->adp, TRUE)) {
case 0: /* succeeded */
break;
case EAGAIN: /* try later */
restore_scrn_saver_mode(scp, FALSE);
sc_touch_scrn_saver(); /* XXX */
break;
default:
sc->flags |= SC_SAVER_FAILED;
scsplash_stick(FALSE);
restore_scrn_saver_mode(scp, TRUE);
printf("scsplash_saver(): failed to put up the image\n");
break;
}
}
} else if (!sticky_splash) {
if ((sc->flags & SC_SCRN_BLANKED) && (splash(sc->adp, FALSE) == 0))
restore_scrn_saver_mode(scp, TRUE);
}
busy = FALSE;
}
static int
add_scrn_saver(void (*this_saver)(sc_softc_t *, int))
{
#if 0
int error;
if (current_saver != none_saver) {
error = remove_scrn_saver(current_saver);
if (error)
return error;
}
#endif
if (current_saver != none_saver)
return EBUSY;
run_scrn_saver = FALSE;
saver_mode = CONS_LKM_SAVER;
current_saver = this_saver;
return 0;
}
static int
remove_scrn_saver(void (*this_saver)(sc_softc_t *, int))
{
if (current_saver != this_saver)
return EINVAL;
#if 0
/*
* In order to prevent `current_saver' from being called by
* the timeout routine `scrn_timer()' while we manipulate
* the saver list, we shall set `current_saver' to `none_saver'
* before stopping the current saver, rather than blocking by `splXX()'.
*/
current_saver = none_saver;
if (scrn_blanked)
stop_scrn_saver(this_saver);
#endif
/* unblank all blanked screens */
wait_scrn_saver_stop(NULL);
if (scrn_blanked)
return EBUSY;
current_saver = none_saver;
return 0;
}
static int
set_scrn_saver_mode(scr_stat *scp, int mode, u_char *pal, int border)
{
int s;
/* assert(scp == scp->sc->cur_scp) */
s = spltty();
if (!ISGRAPHSC(scp))
sc_remove_cursor_image(scp);
scp->splash_save_mode = scp->mode;
scp->splash_save_status = scp->status & (GRAPHICS_MODE | PIXEL_MODE);
scp->status &= ~(GRAPHICS_MODE | PIXEL_MODE);
scp->status |= (UNKNOWN_MODE | SAVER_RUNNING);
scp->sc->flags |= SC_SCRN_BLANKED;
++scrn_blanked;
splx(s);
if (mode < 0)
return 0;
scp->mode = mode;
if (set_mode(scp) == 0) {
if (scp->sc->adp->va_info.vi_flags & V_INFO_GRAPHICS)
scp->status |= GRAPHICS_MODE;
#ifndef SC_NO_PALETTE_LOADING
if (pal != NULL)
vidd_load_palette(scp->sc->adp, pal);
#endif
sc_set_border(scp, border);
return 0;
} else {
s = spltty();
scp->mode = scp->splash_save_mode;
scp->status &= ~(UNKNOWN_MODE | SAVER_RUNNING);
scp->status |= scp->splash_save_status;
splx(s);
return 1;
}
}
static int
restore_scrn_saver_mode(scr_stat *scp, int changemode)
{
int mode;
int status;
int s;
/* assert(scp == scp->sc->cur_scp) */
s = spltty();
mode = scp->mode;
status = scp->status;
scp->mode = scp->splash_save_mode;
scp->status &= ~(UNKNOWN_MODE | SAVER_RUNNING);
scp->status |= scp->splash_save_status;
scp->sc->flags &= ~SC_SCRN_BLANKED;
if (!changemode) {
if (!ISGRAPHSC(scp))
sc_draw_cursor_image(scp);
--scrn_blanked;
splx(s);
return 0;
}
if (set_mode(scp) == 0) {
#ifndef SC_NO_PALETTE_LOADING
#ifdef SC_PIXEL_MODE
if (scp->sc->adp->va_info.vi_mem_model == V_INFO_MM_DIRECT)
vidd_load_palette(scp->sc->adp, scp->sc->palette2);
else
#endif
vidd_load_palette(scp->sc->adp, scp->sc->palette);
#endif
--scrn_blanked;
splx(s);
return 0;
} else {
scp->mode = mode;
scp->status = status;
splx(s);
return 1;
}
}
static void
stop_scrn_saver(sc_softc_t *sc, void (*saver)(sc_softc_t *, int))
{
(*saver)(sc, FALSE);
run_scrn_saver = FALSE;
/* the screen saver may have chosen not to stop after all... */
if (sc->flags & SC_SCRN_BLANKED)
return;
mark_all(sc->cur_scp);
if (sc->delayed_next_scr)
sc_switch_scr(sc, sc->delayed_next_scr - 1);
if (debugger == 0)
wakeup(&scrn_blanked);
}
static int
wait_scrn_saver_stop(sc_softc_t *sc)
{
int error = 0;
while (scrn_blanked > 0) {
run_scrn_saver = FALSE;
if (sc && !(sc->flags & SC_SCRN_BLANKED)) {
error = 0;
break;
}
error = tsleep(&scrn_blanked, PZERO | PCATCH, "scrsav", 0);
if ((error != 0) && (error != ERESTART))
break;
}
run_scrn_saver = FALSE;
return error;
}
#endif /* DEV_SPLASH */
void
sc_touch_scrn_saver(void)
{
scsplash_stick(FALSE);
run_scrn_saver = FALSE;
}
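/*
 * vty switching overview: a vty in VT_PROCESS mode is switched through a
 * two-step handshake with its controlling process.  signal_vt_rel()
 * delivers the release signal and sets SWITCH_WAIT_REL; once the process
 * acknowledges (typically via the VT_RELDISP ioctl), finish_vt_rel() and
 * do_switch_scr() perform the actual exchange_scr().  signal_vt_acq()
 * then delivers the acquire signal and sets SWITCH_WAIT_ACQ, which
 * finish_vt_acq() clears when the new owner acknowledges.  Vtys in
 * VT_AUTO mode are switched directly.
 */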
int
sc_switch_scr(sc_softc_t *sc, u_int next_scr)
{
scr_stat *cur_scp;
struct tty *tp;
struct proc *p;
int s;
DPRINTF(5, ("sc0: sc_switch_scr() %d ", next_scr + 1));
if (sc->cur_scp == NULL)
return (0);
/* prevent switch if previously requested */
if (sc->flags & SC_SCRN_VTYLOCK) {
sc_bell(sc->cur_scp, sc->cur_scp->bell_pitch,
sc->cur_scp->bell_duration);
return EPERM;
}
/* delay switch if the screen is blanked or being updated */
if ((sc->flags & SC_SCRN_BLANKED) || sc->write_in_progress
|| sc->blink_in_progress) {
sc->delayed_next_scr = next_scr + 1;
sc_touch_scrn_saver();
DPRINTF(5, ("switch delayed\n"));
return 0;
}
sc->delayed_next_scr = 0;
s = spltty();
cur_scp = sc->cur_scp;
/* we are in the middle of the vty switching process... */
if (sc->switch_in_progress
&& (cur_scp->smode.mode == VT_PROCESS)
&& cur_scp->proc) {
p = pfind(cur_scp->pid);
if (cur_scp->proc != p) {
if (p)
PROC_UNLOCK(p);
/*
* The controlling process has died.  Do some cleanup.
* NOTE:`cur_scp->proc' and `cur_scp->smode.mode'
* are not reset here yet; they will be cleared later.
*/
DPRINTF(5, ("cur_scp controlling process %d died, ",
cur_scp->pid));
if (cur_scp->status & SWITCH_WAIT_REL) {
/*
* Force the previous switch to finish, but return now
* with error.
*/
DPRINTF(5, ("reset WAIT_REL, "));
finish_vt_rel(cur_scp, TRUE, &s);
splx(s);
DPRINTF(5, ("finishing previous switch\n"));
return EINVAL;
} else if (cur_scp->status & SWITCH_WAIT_ACQ) {
/* let's assume screen switch has been completed. */
DPRINTF(5, ("reset WAIT_ACQ, "));
finish_vt_acq(cur_scp);
} else {
/*
* We are in between screen release and acquisition, and
* reached here via scgetc() or scrn_timer() which has
* interrupted exchange_scr(). Don't do anything stupid.
*/
DPRINTF(5, ("waiting nothing, "));
}
} else {
if (p)
PROC_UNLOCK(p);
/*
* The controlling process is alive, but not responding...
* It is either buggy or it may be just taking time.
* The following code is a gross kludge to cope with this
* problem for which there is no clean solution. XXX
*/
if (cur_scp->status & SWITCH_WAIT_REL) {
switch (sc->switch_in_progress++) {
case 1:
break;
case 2:
DPRINTF(5, ("sending relsig again, "));
signal_vt_rel(cur_scp);
break;
case 3:
break;
case 4:
default:
/*
* Act as if the controlling program returned
* VT_FALSE.
*/
DPRINTF(5, ("force reset WAIT_REL, "));
finish_vt_rel(cur_scp, FALSE, &s);
splx(s);
DPRINTF(5, ("act as if VT_FALSE was seen\n"));
return EINVAL;
}
} else if (cur_scp->status & SWITCH_WAIT_ACQ) {
switch (sc->switch_in_progress++) {
case 1:
break;
case 2:
DPRINTF(5, ("sending acqsig again, "));
signal_vt_acq(cur_scp);
break;
case 3:
break;
case 4:
default:
/* clear the flag and finish the previous switch */
DPRINTF(5, ("force reset WAIT_ACQ, "));
finish_vt_acq(cur_scp);
break;
}
}
}
}
/*
* Return error if an invalid argument is given, or vty switch
* is still in progress.
*/
if ((next_scr < sc->first_vty) || (next_scr >= sc->first_vty + sc->vtys)
|| sc->switch_in_progress) {
splx(s);
sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION);
DPRINTF(5, ("error 1\n"));
return EINVAL;
}
/*
* Don't allow switching away from the graphics mode vty
* if the switch mode is VT_AUTO, unless the next vty is the same
* as the current one, or the current vty has been closed (but is still showing).
*/
tp = SC_DEV(sc, cur_scp->index);
if ((cur_scp->index != next_scr)
&& tty_opened(tp)
&& (cur_scp->smode.mode == VT_AUTO)
&& ISGRAPHSC(cur_scp)) {
splx(s);
sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION);
DPRINTF(5, ("error, graphics mode\n"));
return EINVAL;
}
/*
* Is the wanted vty open? Don't allow switching to a closed vty.
* If we are in DDB, don't switch to a vty in the VT_PROCESS mode.
* Note that we always allow the user to switch to the kernel
* console even if it is closed.
*/
if ((sc_console == NULL) || (next_scr != sc_console->index)) {
tp = SC_DEV(sc, next_scr);
if (!tty_opened(tp)) {
splx(s);
sc_bell(cur_scp, bios_value.bell_pitch, BELL_DURATION);
DPRINTF(5, ("error 2, requested vty isn't open!\n"));
return EINVAL;
}
if ((debugger > 0) && (SC_STAT(tp)->smode.mode == VT_PROCESS)) {
splx(s);
DPRINTF(5, ("error 3, requested vty is in the VT_PROCESS mode\n"));
return EINVAL;
}
}
/* this is the start of vty switching process... */
++sc->switch_in_progress;
sc->old_scp = cur_scp;
sc->new_scp = sc_get_stat(SC_DEV(sc, next_scr));
if (sc->new_scp == sc->old_scp) {
sc->switch_in_progress = 0;
/*
* XXX wakeup() locks the scheduler lock which will hang if
* the lock is in an in-between state, e.g., when we stop at
* a breakpoint at fork_exit. It has always been wrong to call
* wakeup() when the debugger is active. In RELENG_4, wakeup()
* is supposed to be locked by splhigh(), but the debugger may
* be invoked at splhigh().
*/
if (debugger == 0)
wakeup(VTY_WCHAN(sc,next_scr));
splx(s);
DPRINTF(5, ("switch done (new == old)\n"));
return 0;
}
/* has controlling process died? */
vt_proc_alive(sc->old_scp);
vt_proc_alive(sc->new_scp);
/* wait for the controlling process to release the screen, if necessary */
if (signal_vt_rel(sc->old_scp)) {
splx(s);
return 0;
}
/* go set up the new vty screen */
splx(s);
exchange_scr(sc);
s = spltty();
/* wake up processes waiting for this vty */
if (debugger == 0)
wakeup(VTY_WCHAN(sc,next_scr));
/* wait for the controlling process to acknowledge, if necessary */
if (signal_vt_acq(sc->cur_scp)) {
splx(s);
return 0;
}
sc->switch_in_progress = 0;
if (sc->unit == sc_console_unit)
cnavailable(sc_consptr, TRUE);
splx(s);
DPRINTF(5, ("switch done\n"));
return 0;
}
static int
do_switch_scr(sc_softc_t *sc, int s)
{
vt_proc_alive(sc->new_scp);
splx(s);
exchange_scr(sc);
s = spltty();
/* sc->cur_scp == sc->new_scp */
wakeup(VTY_WCHAN(sc,sc->cur_scp->index));
/* wait for the controlling process to acknowledge, if necessary */
if (!signal_vt_acq(sc->cur_scp)) {
sc->switch_in_progress = 0;
if (sc->unit == sc_console_unit)
cnavailable(sc_consptr, TRUE);
}
return s;
}
static int
vt_proc_alive(scr_stat *scp)
{
struct proc *p;
if (scp->proc) {
if ((p = pfind(scp->pid)) != NULL)
PROC_UNLOCK(p);
if (scp->proc == p)
return TRUE;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
DPRINTF(5, ("vt controlling process %d died\n", scp->pid));
}
return FALSE;
}
static int
signal_vt_rel(scr_stat *scp)
{
if (scp->smode.mode != VT_PROCESS)
return FALSE;
scp->status |= SWITCH_WAIT_REL;
PROC_LOCK(scp->proc);
- psignal(scp->proc, scp->smode.relsig);
+ kern_psignal(scp->proc, scp->smode.relsig);
PROC_UNLOCK(scp->proc);
DPRINTF(5, ("sending relsig to %d\n", scp->pid));
return TRUE;
}
static int
signal_vt_acq(scr_stat *scp)
{
if (scp->smode.mode != VT_PROCESS)
return FALSE;
if (scp->sc->unit == sc_console_unit)
cnavailable(sc_consptr, FALSE);
scp->status |= SWITCH_WAIT_ACQ;
PROC_LOCK(scp->proc);
- psignal(scp->proc, scp->smode.acqsig);
+ kern_psignal(scp->proc, scp->smode.acqsig);
PROC_UNLOCK(scp->proc);
DPRINTF(5, ("sending acqsig to %d\n", scp->pid));
return TRUE;
}
static int
finish_vt_rel(scr_stat *scp, int release, int *s)
{
if (scp == scp->sc->old_scp && scp->status & SWITCH_WAIT_REL) {
scp->status &= ~SWITCH_WAIT_REL;
if (release)
*s = do_switch_scr(scp->sc, *s);
else
scp->sc->switch_in_progress = 0;
return 0;
}
return EINVAL;
}
static int
finish_vt_acq(scr_stat *scp)
{
if (scp == scp->sc->new_scp && scp->status & SWITCH_WAIT_ACQ) {
scp->status &= ~SWITCH_WAIT_ACQ;
scp->sc->switch_in_progress = 0;
return 0;
}
return EINVAL;
}
static void
exchange_scr(sc_softc_t *sc)
{
scr_stat *scp;
/* save the current state of video and keyboard */
sc_move_cursor(sc->old_scp, sc->old_scp->xpos, sc->old_scp->ypos);
if (!ISGRAPHSC(sc->old_scp))
sc_remove_cursor_image(sc->old_scp);
if (sc->old_scp->kbd_mode == K_XLATE)
save_kbd_state(sc->old_scp);
/* set up the video for the new screen */
scp = sc->cur_scp = sc->new_scp;
#ifdef PC98
if (sc->old_scp->mode != scp->mode || ISUNKNOWNSC(sc->old_scp) || ISUNKNOWNSC(sc->new_scp))
#else
if (sc->old_scp->mode != scp->mode || ISUNKNOWNSC(sc->old_scp))
#endif
set_mode(scp);
#ifndef __sparc64__
else
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)sc->adp->va_window, FALSE);
#endif
scp->status |= MOUSE_HIDDEN;
sc_move_cursor(scp, scp->xpos, scp->ypos);
if (!ISGRAPHSC(scp))
sc_set_cursor_image(scp);
#ifndef SC_NO_PALETTE_LOADING
if (ISGRAPHSC(sc->old_scp)) {
#ifdef SC_PIXEL_MODE
if (sc->adp->va_info.vi_mem_model == V_INFO_MM_DIRECT)
vidd_load_palette(sc->adp, sc->palette2);
else
#endif
vidd_load_palette(sc->adp, sc->palette);
}
#endif
sc_set_border(scp, scp->border);
/* set up the keyboard for the new screen */
if (sc->old_scp->kbd_mode != scp->kbd_mode)
(void)kbdd_ioctl(sc->kbd, KDSKBMODE, (caddr_t)&scp->kbd_mode);
update_kbd_state(scp, scp->status, LOCK_MASK);
mark_all(scp);
}
void
sc_puts(scr_stat *scp, u_char *buf, int len, int kernel)
{
int need_unlock = 0;
#ifdef DEV_SPLASH
/* make screensaver happy */
if (!sticky_splash && scp == scp->sc->cur_scp && !sc_saver_keyb_only)
run_scrn_saver = FALSE;
#endif
if (scp->tsw) {
if (!kdb_active && !mtx_owned(&scp->scr_lock)) {
need_unlock = 1;
mtx_lock_spin(&scp->scr_lock);
}
(*scp->tsw->te_puts)(scp, buf, len, kernel);
if (need_unlock)
mtx_unlock_spin(&scp->scr_lock);
}
if (scp->sc->delayed_next_scr)
sc_switch_scr(scp->sc, scp->sc->delayed_next_scr - 1);
}
void
sc_draw_cursor_image(scr_stat *scp)
{
/* assert(scp == scp->sc->cur_scp); */
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_cursor)(scp, scp->cursor_pos,
scp->curs_attr.flags & CONS_BLINK_CURSOR, TRUE,
sc_inside_cutmark(scp, scp->cursor_pos));
scp->cursor_oldpos = scp->cursor_pos;
SC_VIDEO_UNLOCK(scp->sc);
}
void
sc_remove_cursor_image(scr_stat *scp)
{
/* assert(scp == scp->sc->cur_scp); */
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_cursor)(scp, scp->cursor_oldpos,
scp->curs_attr.flags & CONS_BLINK_CURSOR, FALSE,
sc_inside_cutmark(scp, scp->cursor_oldpos));
SC_VIDEO_UNLOCK(scp->sc);
}
static void
update_cursor_image(scr_stat *scp)
{
/* assert(scp == scp->sc->cur_scp); */
sc_remove_cursor_image(scp);
sc_set_cursor_image(scp);
sc_draw_cursor_image(scp);
}
void
sc_set_cursor_image(scr_stat *scp)
{
scp->curs_attr.flags = scp->curr_curs_attr.flags;
if (scp->curs_attr.flags & CONS_HIDDEN_CURSOR) {
/* hidden cursor is internally represented as zero-height underline */
scp->curs_attr.flags = CONS_CHAR_CURSOR;
scp->curs_attr.base = scp->curs_attr.height = 0;
} else if (scp->curs_attr.flags & CONS_CHAR_CURSOR) {
scp->curs_attr.base = imin(scp->curr_curs_attr.base,
scp->font_size - 1);
scp->curs_attr.height = imin(scp->curr_curs_attr.height,
scp->font_size - scp->curs_attr.base);
} else { /* block cursor */
scp->curs_attr.base = 0;
scp->curs_attr.height = scp->font_size;
}
/* assert(scp == scp->sc->cur_scp); */
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->set_cursor)(scp, scp->curs_attr.base, scp->curs_attr.height,
scp->curs_attr.flags & CONS_BLINK_CURSOR);
SC_VIDEO_UNLOCK(scp->sc);
}
static void
change_cursor_shape(scr_stat *scp, int flags, int base, int height)
{
if ((scp == scp->sc->cur_scp) && !ISGRAPHSC(scp))
sc_remove_cursor_image(scp);
if (base >= 0)
scp->curr_curs_attr.base = base;
if (height >= 0)
scp->curr_curs_attr.height = height;
if (flags & CONS_RESET_CURSOR)
scp->curr_curs_attr = scp->dflt_curs_attr;
else
scp->curr_curs_attr.flags = flags & CONS_CURSOR_ATTRS;
if ((scp == scp->sc->cur_scp) && !ISGRAPHSC(scp)) {
sc_set_cursor_image(scp);
sc_draw_cursor_image(scp);
}
}
void
sc_change_cursor_shape(scr_stat *scp, int flags, int base, int height)
{
sc_softc_t *sc;
struct tty *tp;
int s;
int i;
s = spltty();
if ((flags != -1) && (flags & CONS_LOCAL_CURSOR)) {
/* local (per vty) change */
change_cursor_shape(scp, flags, base, height);
splx(s);
return;
}
/* global change */
sc = scp->sc;
if (base >= 0)
sc->curs_attr.base = base;
if (height >= 0)
sc->curs_attr.height = height;
if (flags != -1) {
if (flags & CONS_RESET_CURSOR)
sc->curs_attr = sc->dflt_curs_attr;
else
sc->curs_attr.flags = flags & CONS_CURSOR_ATTRS;
}
for (i = sc->first_vty; i < sc->first_vty + sc->vtys; ++i) {
if ((tp = SC_DEV(sc, i)) == NULL)
continue;
if ((scp = sc_get_stat(tp)) == NULL)
continue;
scp->dflt_curs_attr = sc->curs_attr;
change_cursor_shape(scp, CONS_RESET_CURSOR, -1, -1);
}
splx(s);
}
static void
scinit(int unit, int flags)
{
/*
* When syscons is being initialized as the kernel console, malloc()
* is not yet functional, because various kernel structures have not been
* fully initialized yet. Therefore, we need to declare the following
* static buffers for the console. This is less than ideal,
* but is a necessary evil for the time being. XXX
*/
#ifdef PC98
static u_short sc_buffer[ROW*COL*2];/* XXX */
#else
static u_short sc_buffer[ROW*COL]; /* XXX */
#endif
#ifndef SC_NO_FONT_LOADING
static u_char font_8[256*8];
static u_char font_14[256*14];
static u_char font_16[256*16];
#endif
sc_softc_t *sc;
scr_stat *scp;
video_adapter_t *adp;
int col;
int row;
int i;
/* one time initialization */
if (init_done == COLD)
sc_get_bios_values(&bios_value);
init_done = WARM;
/*
* Allocate resources. Even if we are being called for the second
* time, we must allocate them again, because they might have
* disappeared...
*/
sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE);
if ((sc->flags & SC_INIT_DONE) == 0)
SC_VIDEO_LOCKINIT(sc);
adp = NULL;
if (sc->adapter >= 0) {
vid_release(sc->adp, (void *)&sc->adapter);
adp = sc->adp;
sc->adp = NULL;
}
if (sc->keyboard >= 0) {
DPRINTF(5, ("sc%d: releasing kbd%d\n", unit, sc->keyboard));
i = kbd_release(sc->kbd, (void *)&sc->keyboard);
DPRINTF(5, ("sc%d: kbd_release returned %d\n", unit, i));
if (sc->kbd != NULL) {
DPRINTF(5, ("sc%d: kbd != NULL!, index:%d, unit:%d, flags:0x%x\n",
unit, sc->kbd->kb_index, sc->kbd->kb_unit, sc->kbd->kb_flags));
}
sc->kbd = NULL;
}
sc->adapter = vid_allocate("*", unit, (void *)&sc->adapter);
sc->adp = vid_get_adapter(sc->adapter);
/* assert((sc->adapter >= 0) && (sc->adp != NULL)) */
sc->keyboard = sc_allocate_keyboard(sc, unit);
DPRINTF(1, ("sc%d: keyboard %d\n", unit, sc->keyboard));
sc->kbd = kbd_get_keyboard(sc->keyboard);
if (sc->kbd != NULL) {
DPRINTF(1, ("sc%d: kbd index:%d, unit:%d, flags:0x%x\n",
unit, sc->kbd->kb_index, sc->kbd->kb_unit, sc->kbd->kb_flags));
}
if (!(sc->flags & SC_INIT_DONE) || (adp != sc->adp)) {
sc->initial_mode = sc->adp->va_initial_mode;
#ifndef SC_NO_FONT_LOADING
if (flags & SC_KERNEL_CONSOLE) {
sc->font_8 = font_8;
sc->font_14 = font_14;
sc->font_16 = font_16;
} else if (sc->font_8 == NULL) {
/* assert(sc_malloc) */
sc->font_8 = malloc(sizeof(font_8), M_DEVBUF, M_WAITOK);
sc->font_14 = malloc(sizeof(font_14), M_DEVBUF, M_WAITOK);
sc->font_16 = malloc(sizeof(font_16), M_DEVBUF, M_WAITOK);
}
#endif
/* extract the hardware cursor location and hide the cursor for now */
vidd_read_hw_cursor(sc->adp, &col, &row);
vidd_set_hw_cursor(sc->adp, -1, -1);
/* set up the first console */
sc->first_vty = unit*MAXCONS;
sc->vtys = MAXCONS; /* XXX: should be configurable */
if (flags & SC_KERNEL_CONSOLE) {
/*
* Set up the devs structure but don't use it yet; calling make_dev()
* might panic the kernel. Wait for sc_attach_unit() to actually
* create the devices.
*/
sc->dev = main_devs;
scp = &main_console;
init_scp(sc, sc->first_vty, scp);
sc_vtb_init(&scp->vtb, VTB_MEMORY, scp->xsize, scp->ysize,
(void *)sc_buffer, FALSE);
/* move cursors to the initial positions */
if (col >= scp->xsize)
col = 0;
if (row >= scp->ysize)
row = scp->ysize - 1;
scp->xpos = col;
scp->ypos = row;
scp->cursor_pos = scp->cursor_oldpos = row*scp->xsize + col;
if (sc_init_emulator(scp, SC_DFLT_TERM))
sc_init_emulator(scp, "*");
(*scp->tsw->te_default_attr)(scp,
user_default.std_color,
user_default.rev_color);
} else {
/* assert(sc_malloc) */
sc->dev = malloc(sizeof(struct tty *)*sc->vtys, M_DEVBUF,
M_WAITOK|M_ZERO);
sc->dev[0] = sc_alloc_tty(0, unit * MAXCONS);
scp = alloc_scp(sc, sc->first_vty);
SC_STAT(sc->dev[0]) = scp;
}
sc->cur_scp = scp;
#ifndef __sparc64__
/* copy screen to temporary buffer */
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)scp->sc->adp->va_window, FALSE);
if (ISTEXTSC(scp))
sc_vtb_copy(&scp->scr, 0, &scp->vtb, 0, scp->xsize*scp->ysize);
#endif
if (bios_value.cursor_end < scp->font_size)
sc->dflt_curs_attr.base = scp->font_size -
bios_value.cursor_end - 1;
else
sc->dflt_curs_attr.base = 0;
i = bios_value.cursor_end - bios_value.cursor_start + 1;
sc->dflt_curs_attr.height = imin(i, scp->font_size);
sc->dflt_curs_attr.flags = 0;
sc->curs_attr = sc->dflt_curs_attr;
scp->curr_curs_attr = scp->dflt_curs_attr = sc->curs_attr;
#ifndef SC_NO_SYSMOUSE
sc_mouse_move(scp, scp->xpixel/2, scp->ypixel/2);
#endif
if (!ISGRAPHSC(scp)) {
sc_set_cursor_image(scp);
sc_draw_cursor_image(scp);
}
/* save font and palette */
#ifndef SC_NO_FONT_LOADING
sc->fonts_loaded = 0;
if (ISFONTAVAIL(sc->adp->va_flags)) {
#ifdef SC_DFLT_FONT
bcopy(dflt_font_8, sc->font_8, sizeof(dflt_font_8));
bcopy(dflt_font_14, sc->font_14, sizeof(dflt_font_14));
bcopy(dflt_font_16, sc->font_16, sizeof(dflt_font_16));
sc->fonts_loaded = FONT_16 | FONT_14 | FONT_8;
if (scp->font_size < 14) {
sc_load_font(scp, 0, 8, 8, sc->font_8, 0, 256);
} else if (scp->font_size >= 16) {
sc_load_font(scp, 0, 16, 8, sc->font_16, 0, 256);
} else {
sc_load_font(scp, 0, 14, 8, sc->font_14, 0, 256);
}
#else /* !SC_DFLT_FONT */
if (scp->font_size < 14) {
sc_save_font(scp, 0, 8, 8, sc->font_8, 0, 256);
sc->fonts_loaded = FONT_8;
} else if (scp->font_size >= 16) {
sc_save_font(scp, 0, 16, 8, sc->font_16, 0, 256);
sc->fonts_loaded = FONT_16;
} else {
sc_save_font(scp, 0, 14, 8, sc->font_14, 0, 256);
sc->fonts_loaded = FONT_14;
}
#endif /* SC_DFLT_FONT */
/* FONT KLUDGE: always use the font page #0. XXX */
sc_show_font(scp, 0);
}
#endif /* !SC_NO_FONT_LOADING */
#ifndef SC_NO_PALETTE_LOADING
vidd_save_palette(sc->adp, sc->palette);
#ifdef SC_PIXEL_MODE
for (i = 0; i < sizeof(sc->palette2); i++)
sc->palette2[i] = i / 3;
#endif
#endif
#ifdef DEV_SPLASH
if (!(sc->flags & SC_SPLASH_SCRN)) {
/* we are ready to put up the splash image! */
splash_init(sc->adp, scsplash_callback, sc);
sc->flags |= SC_SPLASH_SCRN;
}
#endif
}
/* the rest is not necessary, if we have done it once */
if (sc->flags & SC_INIT_DONE)
return;
/* initialize mapscrn arrays to a one to one map */
for (i = 0; i < sizeof(sc->scr_map); i++)
sc->scr_map[i] = sc->scr_rmap[i] = i;
#ifdef PC98
sc->scr_map[0x5c] = (u_char)0xfc; /* for backslash */
#endif
sc->flags |= SC_INIT_DONE;
}
static void
scterm(int unit, int flags)
{
sc_softc_t *sc;
scr_stat *scp;
sc = sc_get_softc(unit, flags & SC_KERNEL_CONSOLE);
if (sc == NULL)
return; /* shouldn't happen */
#ifdef DEV_SPLASH
/* this console is no longer available for the splash screen */
if (sc->flags & SC_SPLASH_SCRN) {
splash_term(sc->adp);
sc->flags &= ~SC_SPLASH_SCRN;
}
#endif
#if 0 /* XXX */
/* move the hardware cursor to the upper-left corner */
vidd_set_hw_cursor(sc->adp, 0, 0);
#endif
/* release the keyboard and the video card */
if (sc->keyboard >= 0)
kbd_release(sc->kbd, &sc->keyboard);
if (sc->adapter >= 0)
vid_release(sc->adp, &sc->adapter);
/* stop the terminal emulator, if any */
scp = sc_get_stat(sc->dev[0]);
if (scp->tsw)
(*scp->tsw->te_term)(scp, &scp->ts);
if (scp->ts != NULL)
free(scp->ts, M_DEVBUF);
mtx_destroy(&scp->scr_lock);
/* clear the structure */
if (!(flags & SC_KERNEL_CONSOLE)) {
/* XXX: We need delete_dev() for this */
free(sc->dev, M_DEVBUF);
#if 0
/* XXX: We need a ttyunregister for this */
free(sc->tty, M_DEVBUF);
#endif
#ifndef SC_NO_FONT_LOADING
free(sc->font_8, M_DEVBUF);
free(sc->font_14, M_DEVBUF);
free(sc->font_16, M_DEVBUF);
#endif
/* XXX vtb, history */
}
bzero(sc, sizeof(*sc));
sc->keyboard = -1;
sc->adapter = -1;
}
static void
scshutdown(__unused void *arg, __unused int howto)
{
KASSERT(sc_console != NULL, ("sc_console != NULL"));
KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL"));
KASSERT(sc_console->sc->cur_scp != NULL,
("sc_console->sc->cur_scp != NULL"));
sc_touch_scrn_saver();
if (!cold &&
sc_console->sc->cur_scp->index != sc_console->index &&
sc_console->sc->cur_scp->smode.mode == VT_AUTO &&
sc_console->smode.mode == VT_AUTO)
sc_switch_scr(sc_console->sc, sc_console->index);
shutdown_in_progress = TRUE;
}
static void
scsuspend(__unused void *arg)
{
int retry;
KASSERT(sc_console != NULL, ("sc_console != NULL"));
KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL"));
KASSERT(sc_console->sc->cur_scp != NULL,
("sc_console->sc->cur_scp != NULL"));
sc_susp_scr = sc_console->sc->cur_scp->index;
if (sc_no_suspend_vtswitch ||
sc_susp_scr == sc_console->index) {
sc_touch_scrn_saver();
sc_susp_scr = -1;
return;
}
for (retry = 0; retry < 10; retry++) {
sc_switch_scr(sc_console->sc, sc_console->index);
if (!sc_console->sc->switch_in_progress)
break;
pause("scsuspend", hz);
}
suspend_in_progress = TRUE;
}
static void
scresume(__unused void *arg)
{
KASSERT(sc_console != NULL, ("sc_console != NULL"));
KASSERT(sc_console->sc != NULL, ("sc_console->sc != NULL"));
KASSERT(sc_console->sc->cur_scp != NULL,
("sc_console->sc->cur_scp != NULL"));
suspend_in_progress = FALSE;
if (sc_susp_scr < 0) {
mark_all(sc_console->sc->cur_scp);
return;
}
sc_switch_scr(sc_console->sc, sc_susp_scr);
}
int
sc_clean_up(scr_stat *scp)
{
#ifdef DEV_SPLASH
int error;
#endif
if (scp->sc->flags & SC_SCRN_BLANKED) {
sc_touch_scrn_saver();
#ifdef DEV_SPLASH
if ((error = wait_scrn_saver_stop(scp->sc)))
return error;
#endif
}
scp->status |= MOUSE_HIDDEN;
sc_remove_mouse_image(scp);
sc_remove_cutmarking(scp);
return 0;
}
void
sc_alloc_scr_buffer(scr_stat *scp, int wait, int discard)
{
sc_vtb_t new;
sc_vtb_t old;
old = scp->vtb;
sc_vtb_init(&new, VTB_MEMORY, scp->xsize, scp->ysize, NULL, wait);
if (!discard && (old.vtb_flags & VTB_VALID)) {
/* retain the current cursor position and buffer contents */
scp->cursor_oldpos = scp->cursor_pos;
/*
* This works only if the old buffer has the same size as or larger
* than the new one. XXX
*/
sc_vtb_copy(&old, 0, &new, 0, scp->xsize*scp->ysize);
scp->vtb = new;
} else {
scp->vtb = new;
sc_vtb_destroy(&old);
}
#ifndef SC_NO_SYSMOUSE
/* move the mouse cursor to the center of the screen */
sc_mouse_move(scp, scp->xpixel / 2, scp->ypixel / 2);
#endif
}
static scr_stat
*alloc_scp(sc_softc_t *sc, int vty)
{
scr_stat *scp;
/* assert(sc_malloc) */
scp = (scr_stat *)malloc(sizeof(scr_stat), M_DEVBUF, M_WAITOK);
init_scp(sc, vty, scp);
sc_alloc_scr_buffer(scp, TRUE, TRUE);
if (sc_init_emulator(scp, SC_DFLT_TERM))
sc_init_emulator(scp, "*");
#ifndef SC_NO_CUTPASTE
sc_alloc_cut_buffer(scp, TRUE);
#endif
#ifndef SC_NO_HISTORY
sc_alloc_history_buffer(scp, 0, 0, TRUE);
#endif
return scp;
}
static void
init_scp(sc_softc_t *sc, int vty, scr_stat *scp)
{
video_info_t info;
bzero(scp, sizeof(*scp));
scp->index = vty;
scp->sc = sc;
scp->status = 0;
scp->mode = sc->initial_mode;
vidd_get_info(sc->adp, scp->mode, &info);
if (info.vi_flags & V_INFO_GRAPHICS) {
scp->status |= GRAPHICS_MODE;
scp->xpixel = info.vi_width;
scp->ypixel = info.vi_height;
scp->xsize = info.vi_width/info.vi_cwidth;
scp->ysize = info.vi_height/info.vi_cheight;
scp->font_size = 0;
scp->font = NULL;
} else {
scp->xsize = info.vi_width;
scp->ysize = info.vi_height;
scp->xpixel = scp->xsize*info.vi_cwidth;
scp->ypixel = scp->ysize*info.vi_cheight;
}
scp->font_size = info.vi_cheight;
scp->font_width = info.vi_cwidth;
#ifndef SC_NO_FONT_LOADING
if (info.vi_cheight < 14)
scp->font = sc->font_8;
else if (info.vi_cheight >= 16)
scp->font = sc->font_16;
else
scp->font = sc->font_14;
#else
scp->font = NULL;
#endif
sc_vtb_init(&scp->vtb, VTB_MEMORY, 0, 0, NULL, FALSE);
#ifndef __sparc64__
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, 0, 0, NULL, FALSE);
#endif
scp->xoff = scp->yoff = 0;
scp->xpos = scp->ypos = 0;
scp->start = scp->xsize * scp->ysize - 1;
scp->end = 0;
scp->tsw = NULL;
scp->ts = NULL;
scp->rndr = NULL;
scp->border = (SC_NORM_ATTR >> 4) & 0x0f;
scp->curr_curs_attr = scp->dflt_curs_attr = sc->curs_attr;
scp->mouse_cut_start = scp->xsize*scp->ysize;
scp->mouse_cut_end = -1;
scp->mouse_signal = 0;
scp->mouse_pid = 0;
scp->mouse_proc = NULL;
scp->kbd_mode = K_XLATE;
scp->bell_pitch = bios_value.bell_pitch;
scp->bell_duration = BELL_DURATION;
scp->status |= (bios_value.shift_state & NLKED);
scp->status |= CURSOR_ENABLED | MOUSE_HIDDEN;
scp->pid = 0;
scp->proc = NULL;
scp->smode.mode = VT_AUTO;
scp->history = NULL;
scp->history_pos = 0;
scp->history_size = 0;
mtx_init(&scp->scr_lock, "scrlock", NULL, MTX_SPIN);
}
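/*
 * sc_init_emulator() attaches a terminal emulator and a renderer to the
 * vty.  Re-selecting the emulator already in use takes the warm-init
 * path and keeps the existing emulator state; switching emulators does a
 * cold init into a freshly allocated state buffer and releases the old
 * one.
 */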
int
sc_init_emulator(scr_stat *scp, char *name)
{
sc_term_sw_t *sw;
sc_rndr_sw_t *rndr;
void *p;
int error;
if (name == NULL) /* if no name is given, use the current emulator */
sw = scp->tsw;
else /* ...otherwise find the named emulator */
sw = sc_term_match(name);
if (sw == NULL)
return EINVAL;
rndr = NULL;
if (strcmp(sw->te_renderer, "*") != 0) {
rndr = sc_render_match(scp, sw->te_renderer,
scp->status & (GRAPHICS_MODE | PIXEL_MODE));
}
if (rndr == NULL) {
rndr = sc_render_match(scp, scp->sc->adp->va_name,
scp->status & (GRAPHICS_MODE | PIXEL_MODE));
if (rndr == NULL)
return ENODEV;
}
if (sw == scp->tsw) {
error = (*sw->te_init)(scp, &scp->ts, SC_TE_WARM_INIT);
scp->rndr = rndr;
scp->rndr->init(scp);
sc_clear_screen(scp);
/* assert(error == 0); */
return error;
}
if (sc_malloc && (sw->te_size > 0))
p = malloc(sw->te_size, M_DEVBUF, M_NOWAIT);
else
p = NULL;
error = (*sw->te_init)(scp, &p, SC_TE_COLD_INIT);
if (error)
return error;
if (scp->tsw)
(*scp->tsw->te_term)(scp, &scp->ts);
if (scp->ts != NULL)
free(scp->ts, M_DEVBUF);
scp->tsw = sw;
scp->ts = p;
scp->rndr = rndr;
scp->rndr->init(scp);
/* XXX */
(*sw->te_default_attr)(scp, user_default.std_color, user_default.rev_color);
sc_clear_screen(scp);
return 0;
}
/*
* scgetc(flags) - get character from keyboard.
* If flags & SCGETC_CN, then avoid harmful side effects.
* If flags & SCGETC_NONBLOCK, return NOKEY immediately when no key is
* waiting; otherwise block until a key is pressed.
*/
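/*
 * The value returned by scgetc() is the keyboard layer's encoded code,
 * not a plain character: callers such as sc_cngetc() extract the
 * character with KEYCHAR() and inspect KEYFLAGS() for RELKEY, SPCLKEY
 * and FKEY, or compare against NOKEY/ERRKEY.
 */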
static u_int
scgetc(sc_softc_t *sc, u_int flags)
{
scr_stat *scp;
#ifndef SC_NO_HISTORY
struct tty *tp;
#endif
u_int c;
int this_scr;
int f;
int i;
if (sc->kbd == NULL)
return NOKEY;
next_code:
#if 1
/* I don't like this, but... XXX */
if (flags & SCGETC_CN)
sccnupdate(sc->cur_scp);
#endif
scp = sc->cur_scp;
/* first see if there is something in the keyboard port */
for (;;) {
c = kbdd_read_char(sc->kbd, !(flags & SCGETC_NONBLOCK));
if (c == ERRKEY) {
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
} else if (c == NOKEY)
return c;
else
break;
}
/* make screensaver happy */
if (!(c & RELKEY))
sc_touch_scrn_saver();
if (!(flags & SCGETC_CN))
random_harvest(&c, sizeof(c), 1, 0, RANDOM_KEYBOARD);
if (scp->kbd_mode != K_XLATE)
return KEYCHAR(c);
/* if scroll-lock pressed allow history browsing */
if (!ISGRAPHSC(scp) && scp->history && scp->status & SLKED) {
scp->status &= ~CURSOR_ENABLED;
sc_remove_cursor_image(scp);
#ifndef SC_NO_HISTORY
if (!(scp->status & BUFFER_SAVED)) {
scp->status |= BUFFER_SAVED;
sc_hist_save(scp);
}
switch (c) {
/* FIXME: key codes */
case SPCLKEY | FKEY | F(49): /* home key */
sc_remove_cutmarking(scp);
sc_hist_home(scp);
goto next_code;
case SPCLKEY | FKEY | F(57): /* end key */
sc_remove_cutmarking(scp);
sc_hist_end(scp);
goto next_code;
case SPCLKEY | FKEY | F(50): /* up arrow key */
sc_remove_cutmarking(scp);
if (sc_hist_up_line(scp))
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
goto next_code;
case SPCLKEY | FKEY | F(58): /* down arrow key */
sc_remove_cutmarking(scp);
if (sc_hist_down_line(scp))
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
goto next_code;
case SPCLKEY | FKEY | F(51): /* page up key */
sc_remove_cutmarking(scp);
for (i=0; i<scp->ysize; i++)
if (sc_hist_up_line(scp)) {
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
break;
}
goto next_code;
case SPCLKEY | FKEY | F(59): /* page down key */
sc_remove_cutmarking(scp);
for (i=0; i<scp->ysize; i++)
if (sc_hist_down_line(scp)) {
if (!(flags & SCGETC_CN))
sc_bell(scp, bios_value.bell_pitch, BELL_DURATION);
break;
}
goto next_code;
}
#endif /* SC_NO_HISTORY */
}
/*
* Process and consume special keys here. Return a plain char code
* or a char code with the META flag or a function key code.
*/
if (c & RELKEY) {
/* key released */
/* goto next_code */
} else {
/* key pressed */
if (c & SPCLKEY) {
c &= ~SPCLKEY;
switch (KEYCHAR(c)) {
/* LOCKING KEYS */
case NLK: case CLK: case ALK:
break;
case SLK:
(void)kbdd_ioctl(sc->kbd, KDGKBSTATE, (caddr_t)&f);
if (f & SLKED) {
scp->status |= SLKED;
} else {
if (scp->status & SLKED) {
scp->status &= ~SLKED;
#ifndef SC_NO_HISTORY
if (scp->status & BUFFER_SAVED) {
if (!sc_hist_restore(scp))
sc_remove_cutmarking(scp);
scp->status &= ~BUFFER_SAVED;
scp->status |= CURSOR_ENABLED;
sc_draw_cursor_image(scp);
}
tp = SC_DEV(sc, scp->index);
if (!kdb_active && tty_opened(tp))
sctty_outwakeup(tp);
#endif
}
}
break;
case PASTE:
#ifndef SC_NO_CUTPASTE
sc_mouse_paste(scp);
#endif
break;
/* NON-LOCKING KEYS */
case NOP:
case LSH: case RSH: case LCTR: case RCTR:
case LALT: case RALT: case ASH: case META:
break;
case BTAB:
if (!(sc->flags & SC_SCRN_BLANKED))
return c;
break;
case SPSC:
#ifdef DEV_SPLASH
/* force activation/deactivation of the screen saver */
if (!(sc->flags & SC_SCRN_BLANKED)) {
run_scrn_saver = TRUE;
sc->scrn_time_stamp -= scrn_blank_time;
}
if (cold) {
/*
* While devices are being probed, the screen saver needs
* to be invoked explicitly. XXX
*/
if (sc->flags & SC_SCRN_BLANKED) {
scsplash_stick(FALSE);
stop_scrn_saver(sc, current_saver);
} else {
if (!ISGRAPHSC(scp)) {
scsplash_stick(TRUE);
(*current_saver)(sc, TRUE);
}
}
}
#endif /* DEV_SPLASH */
break;
case RBT:
#ifndef SC_DISABLE_REBOOT
if (enable_reboot)
shutdown_nice(0);
#endif
break;
case HALT:
#ifndef SC_DISABLE_REBOOT
if (enable_reboot)
shutdown_nice(RB_HALT);
#endif
break;
case PDWN:
#ifndef SC_DISABLE_REBOOT
if (enable_reboot)
shutdown_nice(RB_HALT|RB_POWEROFF);
#endif
break;
case SUSP:
power_pm_suspend(POWER_SLEEP_STATE_SUSPEND);
break;
case STBY:
power_pm_suspend(POWER_SLEEP_STATE_STANDBY);
break;
case DBG:
#ifndef SC_DISABLE_KDBKEY
if (enable_kdbkey)
kdb_break();
#endif
break;
case PNC:
if (enable_panic_key)
panic("Forced by the panic key");
break;
case NEXT:
this_scr = scp->index;
for (i = (this_scr - sc->first_vty + 1)%sc->vtys;
sc->first_vty + i != this_scr;
i = (i + 1)%sc->vtys) {
struct tty *tp = SC_DEV(sc, sc->first_vty + i);
if (tty_opened(tp)) {
sc_switch_scr(scp->sc, sc->first_vty + i);
break;
}
}
break;
case PREV:
this_scr = scp->index;
for (i = (this_scr - sc->first_vty + sc->vtys - 1)%sc->vtys;
sc->first_vty + i != this_scr;
i = (i + sc->vtys - 1)%sc->vtys) {
struct tty *tp = SC_DEV(sc, sc->first_vty + i);
if (tty_opened(tp)) {
sc_switch_scr(scp->sc, sc->first_vty + i);
break;
}
}
break;
default:
if (KEYCHAR(c) >= F_SCR && KEYCHAR(c) <= L_SCR) {
sc_switch_scr(scp->sc, sc->first_vty + KEYCHAR(c) - F_SCR);
break;
}
/* assert(c & FKEY) */
if (!(sc->flags & SC_SCRN_BLANKED))
return c;
break;
}
/* goto next_code */
} else {
/* regular keys (maybe MKEY is set) */
#if !defined(SC_DISABLE_KDBKEY) && defined(KDB)
if (enable_kdbkey)
kdb_alt_break(c, &sc->sc_altbrk);
#endif
if (!(sc->flags & SC_SCRN_BLANKED))
return c;
}
}
goto next_code;
}
static int
sctty_mmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
{
scr_stat *scp;
scp = sc_get_stat(tp);
if (scp != scp->sc->cur_scp)
return -1;
return vidd_mmap(scp->sc->adp, offset, paddr, nprot, memattr);
}
static int
save_kbd_state(scr_stat *scp)
{
int state;
int error;
error = kbdd_ioctl(scp->sc->kbd, KDGKBSTATE, (caddr_t)&state);
if (error == ENOIOCTL)
error = ENODEV;
if (error == 0) {
scp->status &= ~LOCK_MASK;
scp->status |= state;
}
return error;
}
static int
update_kbd_state(scr_stat *scp, int new_bits, int mask)
{
int state;
int error;
if (mask != LOCK_MASK) {
error = kbdd_ioctl(scp->sc->kbd, KDGKBSTATE, (caddr_t)&state);
if (error == ENOIOCTL)
error = ENODEV;
if (error)
return error;
state &= ~mask;
state |= new_bits & mask;
} else {
state = new_bits & LOCK_MASK;
}
error = kbdd_ioctl(scp->sc->kbd, KDSKBSTATE, (caddr_t)&state);
if (error == ENOIOCTL)
error = ENODEV;
return error;
}
static int
update_kbd_leds(scr_stat *scp, int which)
{
int error;
which &= LOCK_MASK;
error = kbdd_ioctl(scp->sc->kbd, KDSETLED, (caddr_t)&which);
if (error == ENOIOCTL)
error = ENODEV;
return error;
}
int
set_mode(scr_stat *scp)
{
video_info_t info;
/* reject unsupported mode */
if (vidd_get_info(scp->sc->adp, scp->mode, &info))
return 1;
/* if this vty is not currently showing, do nothing */
if (scp != scp->sc->cur_scp)
return 0;
/* setup video hardware for the given mode */
vidd_set_mode(scp->sc->adp, scp->mode);
scp->rndr->init(scp);
#ifndef __sparc64__
sc_vtb_init(&scp->scr, VTB_FRAMEBUFFER, scp->xsize, scp->ysize,
(void *)scp->sc->adp->va_window, FALSE);
#endif
#ifndef SC_NO_FONT_LOADING
/* load appropriate font */
if (!(scp->status & GRAPHICS_MODE)) {
if (!(scp->status & PIXEL_MODE) && ISFONTAVAIL(scp->sc->adp->va_flags)) {
if (scp->font_size < 14) {
if (scp->sc->fonts_loaded & FONT_8)
sc_load_font(scp, 0, 8, 8, scp->sc->font_8, 0, 256);
} else if (scp->font_size >= 16) {
if (scp->sc->fonts_loaded & FONT_16)
sc_load_font(scp, 0, 16, 8, scp->sc->font_16, 0, 256);
} else {
if (scp->sc->fonts_loaded & FONT_14)
sc_load_font(scp, 0, 14, 8, scp->sc->font_14, 0, 256);
}
/*
* FONT KLUDGE:
* This is an interim kludge to display the correct font.
* Always use the font page #0 on the video plane 2.
* Somehow we cannot show the font in other font pages on
* some video cards... XXX
*/
sc_show_font(scp, 0);
}
mark_all(scp);
}
#endif /* !SC_NO_FONT_LOADING */
sc_set_border(scp, scp->border);
sc_set_cursor_image(scp);
return 0;
}
void
sc_set_border(scr_stat *scp, int color)
{
SC_VIDEO_LOCK(scp->sc);
(*scp->rndr->draw_border)(scp, color);
SC_VIDEO_UNLOCK(scp->sc);
}
#ifndef SC_NO_FONT_LOADING
void
sc_load_font(scr_stat *scp, int page, int size, int width, u_char *buf,
int base, int count)
{
sc_softc_t *sc;
sc = scp->sc;
sc->font_loading_in_progress = TRUE;
vidd_load_font(sc->adp, page, size, width, buf, base, count);
sc->font_loading_in_progress = FALSE;
}
void
sc_save_font(scr_stat *scp, int page, int size, int width, u_char *buf,
int base, int count)
{
sc_softc_t *sc;
sc = scp->sc;
sc->font_loading_in_progress = TRUE;
vidd_save_font(sc->adp, page, size, width, buf, base, count);
sc->font_loading_in_progress = FALSE;
}
void
sc_show_font(scr_stat *scp, int page)
{
vidd_show_font(scp->sc->adp, page);
}
#endif /* !SC_NO_FONT_LOADING */
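/*
 * sc_paste() feeds a cut & paste buffer into the tty of the currently
 * visible vty as if it had been typed: each byte is mapped back through
 * the reverse screen map (scr_rmap) and handed to the line discipline.
 * sc_respond() similarly injects bytes (e.g. terminal query responses)
 * into the same tty.
 */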
void
sc_paste(scr_stat *scp, const u_char *p, int count)
{
struct tty *tp;
u_char *rmap;
tp = SC_DEV(scp->sc, scp->sc->cur_scp->index);
if (!tty_opened(tp))
return;
rmap = scp->sc->scr_rmap;
for (; count > 0; --count)
ttydisc_rint(tp, rmap[*p++], 0);
ttydisc_rint_done(tp);
}
void
sc_respond(scr_stat *scp, const u_char *p, int count, int wakeup)
{
struct tty *tp;
tp = SC_DEV(scp->sc, scp->sc->cur_scp->index);
if (!tty_opened(tp))
return;
ttydisc_rint_simple(tp, p, count);
if (wakeup) {
/* XXX: we can't always call ttydisc_rint_done() here! */
ttydisc_rint_done(tp);
}
}
void
sc_bell(scr_stat *scp, int pitch, int duration)
{
if (cold || shutdown_in_progress || !enable_bell)
return;
if (scp != scp->sc->cur_scp && (scp->sc->flags & SC_QUIET_BELL))
return;
if (scp->sc->flags & SC_VISUAL_BELL) {
if (scp->sc->blink_in_progress)
return;
scp->sc->blink_in_progress = 3;
if (scp != scp->sc->cur_scp)
scp->sc->blink_in_progress += 2;
blink_screen(scp->sc->cur_scp);
} else if (duration != 0 && pitch != 0) {
if (scp != scp->sc->cur_scp)
pitch *= 2;
sysbeep(1193182 / pitch, duration);
}
}
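/*
 * blink_screen() implements the visual bell: it redraws the whole vty
 * with alternating attributes on successive callouts (hz/10 apart)
 * while blink_in_progress counts down, then restores the screen and
 * completes any delayed vty switch.
 */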
static void
blink_screen(void *arg)
{
scr_stat *scp = arg;
struct tty *tp;
if (ISGRAPHSC(scp) || (scp->sc->blink_in_progress <= 1)) {
scp->sc->blink_in_progress = 0;
mark_all(scp);
tp = SC_DEV(scp->sc, scp->index);
if (tty_opened(tp))
sctty_outwakeup(tp);
if (scp->sc->delayed_next_scr)
sc_switch_scr(scp->sc, scp->sc->delayed_next_scr - 1);
}
else {
(*scp->rndr->draw)(scp, 0, scp->xsize*scp->ysize,
scp->sc->blink_in_progress & 1);
scp->sc->blink_in_progress--;
timeout(blink_screen, scp, hz / 10);
}
}
/*
* Until sc_attach_unit() gets called no dev structures will be available
* to store the per-screen current status. This is the case when the
* kernel is initially booting and needs access to its console. During
* this early phase of booting the console's current status is kept in
* one statically defined scr_stat structure, and any pointers to the
* dev structures will be NULL.
*/
static scr_stat *
sc_get_stat(struct tty *tp)
{
if (tp == NULL)
return (&main_console);
return (SC_STAT(tp));
}
/*
* Allocate active keyboard. Try to allocate "kbdmux" keyboard first, and,
* if found, add all non-busy keyboards to "kbdmux". Otherwise look for
* any keyboard.
*/
static int
sc_allocate_keyboard(sc_softc_t *sc, int unit)
{
int idx0, idx;
keyboard_t *k0, *k;
keyboard_info_t ki;
idx0 = kbd_allocate("kbdmux", -1, (void *)&sc->keyboard, sckbdevent, sc);
if (idx0 != -1) {
k0 = kbd_get_keyboard(idx0);
for (idx = kbd_find_keyboard2("*", -1, 0);
idx != -1;
idx = kbd_find_keyboard2("*", -1, idx + 1)) {
k = kbd_get_keyboard(idx);
if (idx == idx0 || KBD_IS_BUSY(k))
continue;
bzero(&ki, sizeof(ki));
strcpy(ki.kb_name, k->kb_name);
ki.kb_unit = k->kb_unit;
(void)kbdd_ioctl(k0, KBADDKBD, (caddr_t) &ki);
}
} else
idx0 = kbd_allocate("*", unit, (void *)&sc->keyboard, sckbdevent, sc);
return (idx0);
}
Index: head/sys/dev/usb/usb_dev.c
===================================================================
--- head/sys/dev/usb/usb_dev.c (revision 225616)
+++ head/sys/dev/usb/usb_dev.c (revision 225617)
@@ -1,2295 +1,2295 @@
/* $FreeBSD$ */
/*-
* Copyright (c) 2006-2008 Hans Petter Selasky. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*
* usb_dev.c - An abstraction layer for creating devices under /dev/...
*/
#include <sys/stdint.h>
#include <sys/stddef.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/callout.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <dev/usb/usb.h>
#include <dev/usb/usb_ioctl.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#define USB_DEBUG_VAR usb_fifo_debug
#include <dev/usb/usb_core.h>
#include <dev/usb/usb_dev.h>
#include <dev/usb/usb_mbuf.h>
#include <dev/usb/usb_process.h>
#include <dev/usb/usb_device.h>
#include <dev/usb/usb_debug.h>
#include <dev/usb/usb_busdma.h>
#include <dev/usb/usb_generic.h>
#include <dev/usb/usb_dynamic.h>
#include <dev/usb/usb_util.h>
#include <dev/usb/usb_controller.h>
#include <dev/usb/usb_bus.h>
#include <sys/filio.h>
#include <sys/ttycom.h>
#include <sys/syscallsubr.h>
#include <machine/stdarg.h>
#if USB_HAVE_UGEN
#ifdef USB_DEBUG
static int usb_fifo_debug = 0;
SYSCTL_NODE(_hw_usb, OID_AUTO, dev, CTLFLAG_RW, 0, "USB device");
SYSCTL_INT(_hw_usb_dev, OID_AUTO, debug, CTLFLAG_RW,
&usb_fifo_debug, 0, "Debug Level");
TUNABLE_INT("hw.usb.dev.debug", &usb_fifo_debug);
#endif
#if ((__FreeBSD_version >= 700001) || (__FreeBSD_version == 0) || \
((__FreeBSD_version >= 600034) && (__FreeBSD_version < 700000)))
#define USB_UCRED struct ucred *ucred,
#else
#define USB_UCRED
#endif
/* prototypes */
static int usb_fifo_open(struct usb_cdev_privdata *,
struct usb_fifo *, int);
static void usb_fifo_close(struct usb_fifo *, int);
static void usb_dev_init(void *);
static void usb_dev_init_post(void *);
static void usb_dev_uninit(void *);
static int usb_fifo_uiomove(struct usb_fifo *, void *, int,
struct uio *);
static void usb_fifo_check_methods(struct usb_fifo_methods *);
static struct usb_fifo *usb_fifo_alloc(void);
static struct usb_endpoint *usb_dev_get_ep(struct usb_device *, uint8_t,
uint8_t);
static void usb_loc_fill(struct usb_fs_privdata *,
struct usb_cdev_privdata *);
static void usb_close(void *);
static usb_error_t usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *, int);
static usb_error_t usb_usb_ref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static void usb_unref_device(struct usb_cdev_privdata *, struct usb_cdev_refdata *);
static d_open_t usb_open;
static d_ioctl_t usb_ioctl;
static d_read_t usb_read;
static d_write_t usb_write;
static d_poll_t usb_poll;
static d_ioctl_t usb_static_ioctl;
static usb_fifo_open_t usb_fifo_dummy_open;
static usb_fifo_close_t usb_fifo_dummy_close;
static usb_fifo_ioctl_t usb_fifo_dummy_ioctl;
static usb_fifo_cmd_t usb_fifo_dummy_cmd;
/* character device structure used for devices (/dev/ugenX.Y and /dev/uXXX) */
struct cdevsw usb_devsw = {
.d_version = D_VERSION,
.d_open = usb_open,
.d_ioctl = usb_ioctl,
.d_name = "usbdev",
.d_flags = D_TRACKCLOSE,
.d_read = usb_read,
.d_write = usb_write,
.d_poll = usb_poll
};
static struct cdev* usb_dev = NULL;
/* character device structure used for /dev/usb */
static struct cdevsw usb_static_devsw = {
.d_version = D_VERSION,
.d_ioctl = usb_static_ioctl,
.d_name = "usb"
};
static TAILQ_HEAD(, usb_symlink) usb_sym_head;
static struct sx usb_sym_lock;
struct mtx usb_ref_lock;
/*------------------------------------------------------------------------*
* usb_loc_fill
*
* This is used to fill out a usb_cdev_privdata structure based on the
* device's address as contained in usb_fs_privdata.
*------------------------------------------------------------------------*/
static void
usb_loc_fill(struct usb_fs_privdata* pd, struct usb_cdev_privdata *cpd)
{
cpd->bus_index = pd->bus_index;
cpd->dev_index = pd->dev_index;
cpd->ep_addr = pd->ep_addr;
cpd->fifo_index = pd->fifo_index;
}
/*------------------------------------------------------------------------*
* usb_ref_device
*
* This function is used to atomically reference a USB device by its
* device location. If this function returns success the USB device
* will not disappear until the USB device is unreferenced.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd, int need_uref)
{
struct usb_fifo **ppf;
struct usb_fifo *f;
DPRINTFN(2, "cpd=%p need uref=%d\n", cpd, need_uref);
/* clear all refs */
memset(crd, 0, sizeof(*crd));
mtx_lock(&usb_ref_lock);
cpd->bus = devclass_get_softc(usb_devclass_ptr, cpd->bus_index);
if (cpd->bus == NULL) {
DPRINTFN(2, "no bus at %u\n", cpd->bus_index);
goto error;
}
cpd->udev = cpd->bus->devices[cpd->dev_index];
if (cpd->udev == NULL) {
DPRINTFN(2, "no device at %u\n", cpd->dev_index);
goto error;
}
if (cpd->udev->refcount == USB_DEV_REF_MAX) {
DPRINTFN(2, "no dev ref\n");
goto error;
}
if (need_uref) {
DPRINTFN(2, "ref udev - needed\n");
cpd->udev->refcount++;
mtx_unlock(&usb_ref_lock);
/*
* We need to grab the sx-lock before grabbing the
* FIFO refs to avoid deadlock at detach!
*/
usbd_enum_lock(cpd->udev);
mtx_lock(&usb_ref_lock);
/*
* Set "is_uref" after grabbing the default SX lock
*/
crd->is_uref = 1;
}
/* check if we are doing an open */
if (cpd->fflags == 0) {
/* use zero defaults */
} else {
/* check for write */
if (cpd->fflags & FWRITE) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_TX];
crd->txfifo = f;
crd->is_write = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
/* check for read */
if (cpd->fflags & FREAD) {
ppf = cpd->udev->fifo;
f = ppf[cpd->fifo_index + USB_FIFO_RX];
crd->rxfifo = f;
crd->is_read = 1; /* ref */
if (f == NULL || f->refcount == USB_FIFO_REF_MAX)
goto error;
if (f->curr_cpd != cpd)
goto error;
/* check if USB-FS is active */
if (f->fs_ep_max != 0) {
crd->is_usbfs = 1;
}
}
}
/* when everything is OK we increment the refcounts */
if (crd->is_write) {
DPRINTFN(2, "ref write\n");
crd->txfifo->refcount++;
}
if (crd->is_read) {
DPRINTFN(2, "ref read\n");
crd->rxfifo->refcount++;
}
mtx_unlock(&usb_ref_lock);
return (0);
error:
if (crd->is_uref) {
usbd_enum_unlock(cpd->udev);
if (--(cpd->udev->refcount) == 0) {
cv_signal(&cpd->udev->ref_cv);
}
}
mtx_unlock(&usb_ref_lock);
DPRINTFN(2, "fail\n");
return (USB_ERR_INVAL);
}
/*------------------------------------------------------------------------*
* usb_usb_ref_device
*
* This function is used to upgrade a USB reference to include the
* USB device reference on a USB location.
*
* Return values:
* 0: Success, refcount incremented on the given USB device.
* Else: Failure.
*------------------------------------------------------------------------*/
static usb_error_t
usb_usb_ref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
/*
* Check if we already have a USB reference on this location:
*/
if (crd->is_uref)
return (0); /* success */
/*
* To avoid deadlock at detach we need to drop the FIFO ref
* and re-acquire a new ref!
*/
usb_unref_device(cpd, crd);
return (usb_ref_device(cpd, crd, 1 /* need uref */));
}
/*------------------------------------------------------------------------*
* usb_unref_device
*
* This function will release the reference count by one unit for the
* given USB device.
*------------------------------------------------------------------------*/
static void
usb_unref_device(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
DPRINTFN(2, "cpd=%p is_uref=%d\n", cpd, crd->is_uref);
if (crd->is_uref)
usbd_enum_unlock(cpd->udev);
mtx_lock(&usb_ref_lock);
if (crd->is_read) {
if (--(crd->rxfifo->refcount) == 0) {
cv_signal(&crd->rxfifo->cv_drain);
}
crd->is_read = 0;
}
if (crd->is_write) {
if (--(crd->txfifo->refcount) == 0) {
cv_signal(&crd->txfifo->cv_drain);
}
crd->is_write = 0;
}
if (crd->is_uref) {
if (--(cpd->udev->refcount) == 0) {
cv_signal(&cpd->udev->ref_cv);
}
crd->is_uref = 0;
}
mtx_unlock(&usb_ref_lock);
}
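/*
* A minimal sketch of the reference pattern built on the two functions
* above, as followed by the cdev callbacks later in this file:
*
*	struct usb_cdev_refdata refs;
*
*	if (usb_ref_device(cpd, &refs, 0) != 0)
*		return (ENXIO);
*	... access refs.rxfifo and refs.txfifo ...
*	usb_unref_device(cpd, &refs);
*
* Passing a non-zero "need_uref" additionally takes the enumeration
* lock; usb_usb_ref_device() upgrades an existing FIFO-only reference
* to include it.
*/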
static struct usb_fifo *
usb_fifo_alloc(void)
{
struct usb_fifo *f;
f = malloc(sizeof(*f), M_USBDEV, M_WAITOK | M_ZERO);
if (f) {
cv_init(&f->cv_io, "FIFO-IO");
cv_init(&f->cv_drain, "FIFO-DRAIN");
f->refcount = 1;
}
return (f);
}
/*------------------------------------------------------------------------*
* usb_fifo_create
*------------------------------------------------------------------------*/
static int
usb_fifo_create(struct usb_cdev_privdata *cpd,
struct usb_cdev_refdata *crd)
{
struct usb_device *udev = cpd->udev;
struct usb_fifo *f;
struct usb_endpoint *ep;
uint8_t n;
uint8_t is_tx;
uint8_t is_rx;
uint8_t no_null;
uint8_t is_busy;
int e = cpd->ep_addr;
is_tx = (cpd->fflags & FWRITE) ? 1 : 0;
is_rx = (cpd->fflags & FREAD) ? 1 : 0;
no_null = 1;
is_busy = 0;
/* Preallocated FIFO */
if (e < 0) {
DPRINTFN(5, "Preallocated FIFO\n");
if (is_tx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_TX];
if (f == NULL)
return (EINVAL);
crd->txfifo = f;
}
if (is_rx) {
f = udev->fifo[cpd->fifo_index + USB_FIFO_RX];
if (f == NULL)
return (EINVAL);
crd->rxfifo = f;
}
return (0);
}
KASSERT(e >= 0 && e <= 15, ("endpoint %d out of range", e));
/* search for a free FIFO slot */
DPRINTFN(5, "Endpoint device, searching for 0x%02x\n", e);
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
if (no_null) {
no_null = 0;
n = 0;
} else {
/* end of FIFOs reached */
DPRINTFN(5, "out of FIFOs\n");
return (ENOMEM);
}
}
/* Check for TX FIFO */
if (is_tx) {
f = udev->fifo[n + USB_FIFO_TX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
/* Check for RX FIFO */
if (is_rx) {
f = udev->fifo[n + USB_FIFO_RX];
if (f != NULL) {
if (f->dev_ep_index != e) {
/* wrong endpoint index */
continue;
}
if (f->curr_cpd != NULL) {
/* FIFO is opened */
is_busy = 1;
continue;
}
} else if (no_null) {
continue;
}
}
break;
}
if (no_null == 0) {
if (e >= (USB_EP_MAX / 2)) {
/* we don't create any endpoints in this range */
DPRINTFN(5, "ep out of range\n");
return (is_busy ? EBUSY : EINVAL);
}
}
if ((e != 0) && is_busy) {
/*
* Only the default control endpoint is allowed to be
* opened multiple times!
*/
DPRINTFN(5, "busy\n");
return (EBUSY);
}
/* Check TX FIFO */
if (is_tx &&
(udev->fifo[n + USB_FIFO_TX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_TX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_TX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc();
if (f == NULL) {
DPRINTFN(5, "could not alloc tx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_TX;
f->dev_ep_index = e;
f->priv_mtx = &udev->device_mtx;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_TX] = f;
mtx_unlock(&usb_ref_lock);
}
/* Check RX FIFO */
if (is_rx &&
(udev->fifo[n + USB_FIFO_RX] == NULL)) {
ep = usb_dev_get_ep(udev, e, USB_FIFO_RX);
DPRINTFN(5, "dev_get_endpoint(%d, 0x%x)\n", e, USB_FIFO_RX);
if (ep == NULL) {
DPRINTFN(5, "dev_get_endpoint returned NULL\n");
return (EINVAL);
}
f = usb_fifo_alloc();
if (f == NULL) {
DPRINTFN(5, "could not alloc rx fifo\n");
return (ENOMEM);
}
/* update some fields */
f->fifo_index = n + USB_FIFO_RX;
f->dev_ep_index = e;
f->priv_mtx = &udev->device_mtx;
f->priv_sc0 = ep;
f->methods = &usb_ugen_methods;
f->iface_index = ep->iface_index;
f->udev = udev;
mtx_lock(&usb_ref_lock);
udev->fifo[n + USB_FIFO_RX] = f;
mtx_unlock(&usb_ref_lock);
}
if (is_tx) {
crd->txfifo = udev->fifo[n + USB_FIFO_TX];
}
if (is_rx) {
crd->rxfifo = udev->fifo[n + USB_FIFO_RX];
}
/* fill out fifo index */
DPRINTFN(5, "fifo index = %d\n", n);
cpd->fifo_index = n;
/* complete */
return (0);
}
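/*
* Sketch of the udev->fifo[] layout assumed by the function above: the
* array is organized as TX/RX pairs, so once a slot "n" has been
* selected, the two halves of a duplex FIFO are reached as
*
*	f_tx = udev->fifo[n + USB_FIFO_TX];
*	f_rx = udev->fifo[n + USB_FIFO_RX];
*
* For a preallocated FIFO (ep_addr < 0) the slot is taken directly from
* cpd->fifo_index; otherwise a free pair matching the endpoint is
* searched for, preferring slots that already carry a matching FIFO.
*/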
void
usb_fifo_free(struct usb_fifo *f)
{
uint8_t n;
if (f == NULL) {
/* be NULL safe */
return;
}
/* destroy symlink devices, if any */
for (n = 0; n != 2; n++) {
if (f->symlink[n]) {
usb_free_symlink(f->symlink[n]);
f->symlink[n] = NULL;
}
}
mtx_lock(&usb_ref_lock);
/* delink ourselves to stop calls from userland */
if ((f->fifo_index < USB_FIFO_MAX) &&
(f->udev != NULL) &&
(f->udev->fifo[f->fifo_index] == f)) {
f->udev->fifo[f->fifo_index] = NULL;
} else {
DPRINTFN(0, "USB FIFO %p has not been linked\n", f);
}
/* decrease refcount */
f->refcount--;
/* prevent any write flush */
f->flag_iserror = 1;
/* need to wait until all callers have exited */
while (f->refcount != 0) {
mtx_unlock(&usb_ref_lock); /* avoid LOR */
mtx_lock(f->priv_mtx);
/* get I/O thread out of any sleep state */
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
mtx_unlock(f->priv_mtx);
mtx_lock(&usb_ref_lock);
/* wait for sync */
cv_wait(&f->cv_drain, &usb_ref_lock);
}
mtx_unlock(&usb_ref_lock);
/* take care of closing the device here, if any */
usb_fifo_close(f, 0);
cv_destroy(&f->cv_io);
cv_destroy(&f->cv_drain);
free(f, M_USBDEV);
}
static struct usb_endpoint *
usb_dev_get_ep(struct usb_device *udev, uint8_t ep_index, uint8_t dir)
{
struct usb_endpoint *ep;
uint8_t ep_dir;
if (ep_index == 0) {
ep = &udev->ctrl_ep;
} else {
if (dir == USB_FIFO_RX) {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_IN;
} else {
ep_dir = UE_DIR_OUT;
}
} else {
if (udev->flags.usb_mode == USB_MODE_HOST) {
ep_dir = UE_DIR_OUT;
} else {
ep_dir = UE_DIR_IN;
}
}
ep = usbd_get_ep_by_addr(udev, ep_index | ep_dir);
}
if (ep == NULL) {
/* if the endpoint does not exist then return */
return (NULL);
}
if (ep->edesc == NULL) {
/* invalid endpoint */
return (NULL);
}
return (ep); /* success */
}
/*------------------------------------------------------------------------*
* usb_fifo_open
*
* Returns:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
static int
usb_fifo_open(struct usb_cdev_privdata *cpd,
struct usb_fifo *f, int fflags)
{
int err;
if (f == NULL) {
/* no FIFO there */
DPRINTFN(2, "no FIFO\n");
return (ENXIO);
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* set correct file flags */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
fflags |= FWRITE;
} else {
fflags |= FREAD;
}
/* check if we are already opened */
/* we don't need any locks when checking this variable */
if (f->curr_cpd != NULL) {
err = EBUSY;
goto done;
}
/* reset short flag before open */
f->flag_short = 0;
/* call open method */
err = (f->methods->f_open) (f, fflags);
if (err) {
goto done;
}
mtx_lock(f->priv_mtx);
/* reset sleep flag */
f->flag_sleeping = 0;
/* reset error flag */
f->flag_iserror = 0;
/* reset complete flag */
f->flag_iscomplete = 0;
/* reset select flag */
f->flag_isselect = 0;
/* reset flushing flag */
f->flag_flushing = 0;
/* reset ASYNC proc flag */
f->async_p = NULL;
mtx_lock(&usb_ref_lock);
/* flag the fifo as opened to prevent others */
f->curr_cpd = cpd;
mtx_unlock(&usb_ref_lock);
/* reset queue */
usb_fifo_reset(f);
mtx_unlock(f->priv_mtx);
done:
return (err);
}
/*------------------------------------------------------------------------*
* usb_fifo_reset
*------------------------------------------------------------------------*/
void
usb_fifo_reset(struct usb_fifo *f)
{
struct usb_mbuf *m;
if (f == NULL) {
return;
}
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
USB_IF_ENQUEUE(&f->free_q, m);
} else {
break;
}
}
/* reset have fragment flag */
f->flag_have_fragment = 0;
}
/*------------------------------------------------------------------------*
* usb_fifo_close
*------------------------------------------------------------------------*/
static void
usb_fifo_close(struct usb_fifo *f, int fflags)
{
int err;
/* check if we are not opened */
if (f->curr_cpd == NULL) {
/* nothing to do - already closed */
return;
}
mtx_lock(f->priv_mtx);
/* clear current cdev private data pointer */
f->curr_cpd = NULL;
/* check if we are selected */
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
/* check if a thread wants SIGIO */
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
- psignal(f->async_p, SIGIO);
+ kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
f->async_p = NULL;
}
/* remove FWRITE and FREAD flags */
fflags &= ~(FWRITE | FREAD);
/* flush written data, if any */
if ((f->fifo_index & 1) == USB_FIFO_TX) {
if (!f->flag_iserror) {
/* set flushing flag */
f->flag_flushing = 1;
/* get the last packet in */
if (f->flag_have_fragment) {
struct usb_mbuf *m;
f->flag_have_fragment = 0;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_IF_ENQUEUE(&f->used_q, m);
}
}
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
/* check if flushed already */
while (f->flag_flushing &&
(!f->flag_iserror)) {
/* wait until all data has been written */
f->flag_sleeping = 1;
err = cv_wait_sig(&f->cv_io, f->priv_mtx);
if (err) {
DPRINTF("signal received\n");
break;
}
}
}
fflags |= FWRITE;
/* stop write transfer, if not already stopped */
(f->methods->f_stop_write) (f);
} else {
fflags |= FREAD;
/* stop read transfer, if not already stopped */
(f->methods->f_stop_read) (f);
}
/* check if we are sleeping */
if (f->flag_sleeping) {
DPRINTFN(2, "Sleeping at close!\n");
}
mtx_unlock(f->priv_mtx);
/* call close method */
(f->methods->f_close) (f, fflags);
DPRINTF("closed\n");
}
/*------------------------------------------------------------------------*
* usb_open - cdev callback
*------------------------------------------------------------------------*/
static int
usb_open(struct cdev *dev, int fflags, int devtype, struct thread *td)
{
struct usb_fs_privdata* pd = (struct usb_fs_privdata*)dev->si_drv1;
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd;
int err, ep;
DPRINTFN(2, "%s fflags=0x%08x\n", dev->si_name, fflags);
KASSERT(fflags & (FREAD|FWRITE), ("invalid open flags"));
if (((fflags & FREAD) && !(pd->mode & FREAD)) ||
((fflags & FWRITE) && !(pd->mode & FWRITE))) {
DPRINTFN(2, "access mode not supported\n");
return (EPERM);
}
cpd = malloc(sizeof(*cpd), M_USBDEV, M_WAITOK | M_ZERO);
ep = cpd->ep_addr = pd->ep_addr;
usb_loc_fill(pd, cpd);
err = usb_ref_device(cpd, &refs, 1);
if (err) {
DPRINTFN(2, "cannot ref device\n");
free(cpd, M_USBDEV);
return (ENXIO);
}
cpd->fflags = fflags; /* access mode for open lifetime */
/* create FIFOs, if any */
err = usb_fifo_create(cpd, &refs);
/* check for error */
if (err) {
DPRINTFN(2, "cannot create fifo\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
if (fflags & FREAD) {
err = usb_fifo_open(cpd, refs.rxfifo, fflags);
if (err) {
DPRINTFN(2, "read open failed\n");
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
if (fflags & FWRITE) {
err = usb_fifo_open(cpd, refs.txfifo, fflags);
if (err) {
DPRINTFN(2, "write open failed\n");
if (fflags & FREAD) {
usb_fifo_close(refs.rxfifo, fflags);
}
usb_unref_device(cpd, &refs);
free(cpd, M_USBDEV);
return (err);
}
}
usb_unref_device(cpd, &refs);
devfs_set_cdevpriv(cpd, usb_close);
return (0);
}
/*------------------------------------------------------------------------*
* usb_close - cdev callback
*------------------------------------------------------------------------*/
static void
usb_close(void *arg)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata *cpd = arg;
int err;
DPRINTFN(2, "cpd=%p\n", cpd);
err = usb_ref_device(cpd, &refs, 0);
if (err)
goto done;
/*
* If this function is not called directly from the root HUB
* thread, there is usually a need to lock the enumeration
* lock. Check this.
*/
if (!usbd_enum_is_locked(cpd->udev)) {
DPRINTFN(2, "Locking enumeration\n");
/* reference device */
err = usb_usb_ref_device(cpd, &refs);
if (err)
goto done;
}
if (cpd->fflags & FREAD) {
usb_fifo_close(refs.rxfifo, cpd->fflags);
}
if (cpd->fflags & FWRITE) {
usb_fifo_close(refs.txfifo, cpd->fflags);
}
usb_unref_device(cpd, &refs);
done:
free(cpd, M_USBDEV);
}
static void
usb_dev_init(void *arg)
{
mtx_init(&usb_ref_lock, "USB ref mutex", NULL, MTX_DEF);
sx_init(&usb_sym_lock, "USB sym mutex");
TAILQ_INIT(&usb_sym_head);
/* check the UGEN methods */
usb_fifo_check_methods(&usb_ugen_methods);
}
SYSINIT(usb_dev_init, SI_SUB_KLD, SI_ORDER_FIRST, usb_dev_init, NULL);
static void
usb_dev_init_post(void *arg)
{
/*
* Create /dev/usb - this is needed for usbconfig(8), which
* needs a well-known device name to access.
*/
usb_dev = make_dev(&usb_static_devsw, 0, UID_ROOT, GID_OPERATOR,
0644, USB_DEVICE_NAME);
if (usb_dev == NULL) {
DPRINTFN(0, "Could not create usb bus device\n");
}
}
SYSINIT(usb_dev_init_post, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, usb_dev_init_post, NULL);
static void
usb_dev_uninit(void *arg)
{
if (usb_dev != NULL) {
destroy_dev(usb_dev);
usb_dev = NULL;
}
mtx_destroy(&usb_ref_lock);
sx_destroy(&usb_sym_lock);
}
SYSUNINIT(usb_dev_uninit, SI_SUB_KICK_SCHEDULER, SI_ORDER_ANY, usb_dev_uninit, NULL);
static int
usb_ioctl_f_sub(struct usb_fifo *f, u_long cmd, void *addr,
struct thread *td)
{
int error = 0;
switch (cmd) {
case FIODTYPE:
*(int *)addr = 0; /* character device */
break;
case FIONBIO:
/* handled by upper FS layer */
break;
case FIOASYNC:
if (*(int *)addr) {
if (f->async_p != NULL) {
error = EBUSY;
break;
}
f->async_p = USB_TD_GET_PROC(td);
} else {
f->async_p = NULL;
}
break;
/* XXX this is not the most general solution */
case TIOCSPGRP:
if (f->async_p == NULL) {
error = EINVAL;
break;
}
if (*(int *)addr != USB_PROC_GET_GID(f->async_p)) {
error = EPERM;
break;
}
break;
default:
return (ENOIOCTL);
}
DPRINTFN(3, "cmd 0x%lx = %d\n", cmd, error);
return (error);
}
/*------------------------------------------------------------------------*
* usb_ioctl - cdev callback
*------------------------------------------------------------------------*/
static int
usb_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int fflag, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
int fflags;
int err;
DPRINTFN(2, "cmd=0x%lx\n", cmd);
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
/*
* Performance optimisation: We try to check for IOCTLs that
* don't need the USB reference first. Then we grab the USB
* reference if we need it!
*/
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err)
return (ENXIO);
fflags = cpd->fflags;
f = NULL; /* set default value */
err = ENOIOCTL; /* set default value */
if (fflags & FWRITE) {
f = refs.txfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
if (fflags & FREAD) {
f = refs.rxfifo;
err = usb_ioctl_f_sub(f, cmd, addr, td);
}
KASSERT(f != NULL, ("fifo not found"));
if (err != ENOIOCTL)
goto done;
err = (f->methods->f_ioctl) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl cmd 0x%lx = %d\n", cmd, err);
if (err != ENOIOCTL)
goto done;
if (usb_usb_ref_device(cpd, &refs)) {
err = ENXIO;
goto done;
}
err = (f->methods->f_ioctl_post) (f, cmd, addr, fflags);
DPRINTFN(2, "f_ioctl_post cmd 0x%lx = %d\n", cmd, err);
if (err == ENOIOCTL)
err = ENOTTY;
if (err)
goto done;
/* Wait for re-enumeration, if any */
while (f->udev->re_enumerate_wait != 0) {
usb_unref_device(cpd, &refs);
usb_pause_mtx(NULL, hz / 128);
if (usb_ref_device(cpd, &refs, 1 /* need uref */)) {
err = ENXIO;
goto done;
}
}
done:
usb_unref_device(cpd, &refs);
return (err);
}
/* ARGSUSED */
static int
usb_poll(struct cdev* dev, int events, struct thread* td)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
int fflags, revents;
if (devfs_get_cdevpriv((void **)&cpd) != 0 ||
usb_ref_device(cpd, &refs, 0) != 0)
return (events &
(POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
fflags = cpd->fflags;
/* Figure out who needs service */
revents = 0;
if ((events & (POLLOUT | POLLWRNORM)) &&
(fflags & FWRITE)) {
f = refs.txfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we got an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start write transfer, if not
* already started
*/
(f->methods->f_start_write) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->free_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLOUT | POLLWRNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
}
mtx_unlock(f->priv_mtx);
}
if ((events & (POLLIN | POLLRDNORM)) &&
(fflags & FREAD)) {
f = refs.rxfifo;
mtx_lock(f->priv_mtx);
if (!refs.is_usbfs) {
if (f->flag_iserror) {
/* we have an error */
m = (void *)1;
} else {
if (f->queue_data == NULL) {
/*
* start read transfer, if not
* already started
*/
(f->methods->f_start_read) (f);
}
/* check if any packets are available */
USB_IF_POLL(&f->used_q, m);
}
} else {
if (f->flag_iscomplete) {
m = (void *)1;
} else {
m = NULL;
}
}
if (m) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
f->flag_isselect = 1;
selrecord(td, &f->selinfo);
if (!refs.is_usbfs) {
/* start reading data */
(f->methods->f_start_read) (f);
}
}
mtx_unlock(f->priv_mtx);
}
usb_unref_device(cpd, &refs);
return (revents);
}
static int
usb_read(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
int fflags;
int resid;
int io_len;
int err;
uint8_t tr_data = 0;
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err) {
return (ENXIO);
}
fflags = cpd->fflags;
f = refs.rxfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
resid = uio->uio_resid;
mtx_lock(f->priv_mtx);
/* check for permanent read error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
while (uio->uio_resid > 0) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m == NULL) {
/* start read transfer, if not already started */
(f->methods->f_start_read) (f);
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
if (f->methods->f_filter_read) {
/*
* Sometimes it is convenient to process data at the
* expense of a userland process instead of a kernel
* process.
*/
(f->methods->f_filter_read) (f, m);
}
tr_data = 1;
io_len = MIN(m->cur_data_len, uio->uio_resid);
DPRINTFN(2, "transfer %d bytes from %p\n",
io_len, m->cur_data_ptr);
err = usb_fifo_uiomove(f,
m->cur_data_ptr, io_len, uio);
m->cur_data_len -= io_len;
m->cur_data_ptr += io_len;
if (m->cur_data_len == 0) {
uint8_t last_packet;
last_packet = m->last_packet;
USB_IF_ENQUEUE(&f->free_q, m);
if (last_packet) {
/* keep framing */
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
if (err) {
break;
}
}
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
static int
usb_write(struct cdev *dev, struct uio *uio, int ioflag)
{
struct usb_cdev_refdata refs;
struct usb_cdev_privdata* cpd;
struct usb_fifo *f;
struct usb_mbuf *m;
uint8_t *pdata;
int fflags;
int resid;
int io_len;
int err;
uint8_t tr_data = 0;
DPRINTFN(2, "\n");
err = devfs_get_cdevpriv((void **)&cpd);
if (err != 0)
return (err);
err = usb_ref_device(cpd, &refs, 0 /* no uref */ );
if (err) {
return (ENXIO);
}
fflags = cpd->fflags;
f = refs.txfifo;
if (f == NULL) {
/* should not happen */
usb_unref_device(cpd, &refs);
return (EPERM);
}
resid = uio->uio_resid;
mtx_lock(f->priv_mtx);
/* check for permanent write error */
if (f->flag_iserror) {
err = EIO;
goto done;
}
/* check if USB-FS interface is active */
if (refs.is_usbfs) {
/*
* The queue is used for events that should be
* retrieved using the "USB_FS_COMPLETE" ioctl.
*/
err = EINVAL;
goto done;
}
if (f->queue_data == NULL) {
/* start write transfer, if not already started */
(f->methods->f_start_write) (f);
}
/* we allow writing zero length data */
do {
USB_IF_DEQUEUE(&f->free_q, m);
if (m == NULL) {
if (ioflag & IO_NDELAY) {
if (tr_data) {
/* return length before error */
break;
}
err = EWOULDBLOCK;
break;
}
DPRINTF("sleeping\n");
err = usb_fifo_wait(f);
if (err) {
break;
}
continue;
}
tr_data = 1;
if (f->flag_have_fragment == 0) {
USB_MBUF_RESET(m);
io_len = m->cur_data_len;
pdata = m->cur_data_ptr;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len = io_len;
} else {
io_len = m->max_data_len - m->cur_data_len;
pdata = m->cur_data_ptr + m->cur_data_len;
if (io_len > uio->uio_resid)
io_len = uio->uio_resid;
m->cur_data_len += io_len;
}
DPRINTFN(2, "transfer %d bytes to %p\n",
io_len, pdata);
err = usb_fifo_uiomove(f, pdata, io_len, uio);
if (err) {
f->flag_have_fragment = 0;
USB_IF_ENQUEUE(&f->free_q, m);
break;
}
/* check if the buffer is ready to be transmitted */
if ((f->flag_write_defrag == 0) ||
(m->cur_data_len == m->max_data_len)) {
f->flag_have_fragment = 0;
/*
* Check for write filter:
*
* Sometimes it is convenient to process data
* at the expense of a userland process
* instead of a kernel process.
*/
if (f->methods->f_filter_write) {
(f->methods->f_filter_write) (f, m);
}
/* Put USB mbuf in the used queue */
USB_IF_ENQUEUE(&f->used_q, m);
/* Start writing data, if not already started */
(f->methods->f_start_write) (f);
} else {
/* Wait for more data or close */
f->flag_have_fragment = 1;
USB_IF_PREPEND(&f->free_q, m);
}
} while (uio->uio_resid > 0);
done:
mtx_unlock(f->priv_mtx);
usb_unref_device(cpd, &refs);
return (err);
}
int
usb_static_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
union {
struct usb_read_dir *urd;
void* data;
} u;
int err;
u.data = data;
switch (cmd) {
case USB_READ_DIR:
err = usb_read_symlink(u.urd->urd_data,
u.urd->urd_startentry, u.urd->urd_maxlen);
break;
case USB_DEV_QUIRK_GET:
case USB_QUIRK_NAME_GET:
case USB_DEV_QUIRK_ADD:
case USB_DEV_QUIRK_REMOVE:
err = usb_quirk_ioctl_p(cmd, data, fflag, td);
break;
case USB_GET_TEMPLATE:
*(int *)data = usb_template;
err = 0;
break;
case USB_SET_TEMPLATE:
err = priv_check(curthread, PRIV_DRIVER);
if (err)
break;
usb_template = *(int *)data;
break;
default:
err = ENOTTY;
break;
}
return (err);
}
static int
usb_fifo_uiomove(struct usb_fifo *f, void *cp,
int n, struct uio *uio)
{
int error;
mtx_unlock(f->priv_mtx);
/*
* "uiomove()" can sleep so one needs to make a wrapper,
* exiting the mutex and checking things:
*/
error = uiomove(cp, n, uio);
mtx_lock(f->priv_mtx);
return (error);
}
int
usb_fifo_wait(struct usb_fifo *f)
{
int err;
mtx_assert(f->priv_mtx, MA_OWNED);
if (f->flag_iserror) {
/* we are gone */
return (EIO);
}
f->flag_sleeping = 1;
err = cv_wait_sig(&f->cv_io, f->priv_mtx);
if (f->flag_iserror) {
/* we are gone */
err = EIO;
}
return (err);
}
void
usb_fifo_signal(struct usb_fifo *f)
{
if (f->flag_sleeping) {
f->flag_sleeping = 0;
cv_broadcast(&f->cv_io);
}
}
void
usb_fifo_wakeup(struct usb_fifo *f)
{
usb_fifo_signal(f);
if (f->flag_isselect) {
selwakeup(&f->selinfo);
f->flag_isselect = 0;
}
if (f->async_p != NULL) {
PROC_LOCK(f->async_p);
- psignal(f->async_p, SIGIO);
+ kern_psignal(f->async_p, SIGIO);
PROC_UNLOCK(f->async_p);
}
}
static int
usb_fifo_dummy_open(struct usb_fifo *fifo, int fflags)
{
return (0);
}
static void
usb_fifo_dummy_close(struct usb_fifo *fifo, int fflags)
{
return;
}
static int
usb_fifo_dummy_ioctl(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags)
{
return (ENOIOCTL);
}
static void
usb_fifo_dummy_cmd(struct usb_fifo *fifo)
{
fifo->flag_flushing = 0; /* not flushing */
}
static void
usb_fifo_check_methods(struct usb_fifo_methods *pm)
{
/* check that all callback functions are OK */
if (pm->f_open == NULL)
pm->f_open = &usb_fifo_dummy_open;
if (pm->f_close == NULL)
pm->f_close = &usb_fifo_dummy_close;
if (pm->f_ioctl == NULL)
pm->f_ioctl = &usb_fifo_dummy_ioctl;
if (pm->f_ioctl_post == NULL)
pm->f_ioctl_post = &usb_fifo_dummy_ioctl;
if (pm->f_start_read == NULL)
pm->f_start_read = &usb_fifo_dummy_cmd;
if (pm->f_stop_read == NULL)
pm->f_stop_read = &usb_fifo_dummy_cmd;
if (pm->f_start_write == NULL)
pm->f_start_write = &usb_fifo_dummy_cmd;
if (pm->f_stop_write == NULL)
pm->f_stop_write = &usb_fifo_dummy_cmd;
}
/*------------------------------------------------------------------------*
* usb_fifo_attach
*
* The following function will create a duplex FIFO.
*
* Return values:
* 0: Success.
* Else: Failure.
*------------------------------------------------------------------------*/
int
usb_fifo_attach(struct usb_device *udev, void *priv_sc,
struct mtx *priv_mtx, struct usb_fifo_methods *pm,
struct usb_fifo_sc *f_sc, uint16_t unit, uint16_t subunit,
uint8_t iface_index, uid_t uid, gid_t gid, int mode)
{
struct usb_fifo *f_tx;
struct usb_fifo *f_rx;
char devname[32];
uint8_t n;
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
if (pm == NULL)
return (EINVAL);
/* check the methods */
usb_fifo_check_methods(pm);
if (priv_mtx == NULL)
priv_mtx = &Giant;
/* search for a free FIFO slot */
for (n = 0;; n += 2) {
if (n == USB_FIFO_MAX) {
/* end of FIFOs reached */
return (ENOMEM);
}
/* Check for TX FIFO */
if (udev->fifo[n + USB_FIFO_TX] != NULL) {
continue;
}
/* Check for RX FIFO */
if (udev->fifo[n + USB_FIFO_RX] != NULL) {
continue;
}
break;
}
f_tx = usb_fifo_alloc();
f_rx = usb_fifo_alloc();
if ((f_tx == NULL) || (f_rx == NULL)) {
usb_fifo_free(f_tx);
usb_fifo_free(f_rx);
return (ENOMEM);
}
/* initialise FIFO structures */
f_tx->fifo_index = n + USB_FIFO_TX;
f_tx->dev_ep_index = -1;
f_tx->priv_mtx = priv_mtx;
f_tx->priv_sc0 = priv_sc;
f_tx->methods = pm;
f_tx->iface_index = iface_index;
f_tx->udev = udev;
f_rx->fifo_index = n + USB_FIFO_RX;
f_rx->dev_ep_index = -1;
f_rx->priv_mtx = priv_mtx;
f_rx->priv_sc0 = priv_sc;
f_rx->methods = pm;
f_rx->iface_index = iface_index;
f_rx->udev = udev;
f_sc->fp[USB_FIFO_TX] = f_tx;
f_sc->fp[USB_FIFO_RX] = f_rx;
mtx_lock(&usb_ref_lock);
udev->fifo[f_tx->fifo_index] = f_tx;
udev->fifo[f_rx->fifo_index] = f_rx;
mtx_unlock(&usb_ref_lock);
for (n = 0; n != 4; n++) {
if (pm->basename[n] == NULL) {
continue;
}
if (subunit == 0xFFFF) {
if (snprintf(devname, sizeof(devname),
"%s%u%s", pm->basename[n],
unit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
} else {
if (snprintf(devname, sizeof(devname),
"%s%u.%u%s", pm->basename[n],
unit, subunit, pm->postfix[n] ?
pm->postfix[n] : "")) {
/* ignore */
}
}
/*
* Distribute the symbolic links into two FIFO structures:
*/
if (n & 1) {
f_rx->symlink[n / 2] =
usb_alloc_symlink(devname);
} else {
f_tx->symlink[n / 2] =
usb_alloc_symlink(devname);
}
/* Create the device */
f_sc->dev = usb_make_dev(udev, devname, -1,
f_tx->fifo_index & f_rx->fifo_index,
FREAD|FWRITE, uid, gid, mode);
}
DPRINTFN(2, "attached %p/%p\n", f_tx, f_rx);
return (0);
}
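/*
* A hypothetical usage sketch (driver-side names are placeholders, not
* taken from this file): a driver normally pairs usb_fifo_attach() with
* usb_fifo_alloc_buffer() and a matching usb_fifo_detach():
*
*	static struct usb_fifo_methods my_methods = { ... };
*	struct usb_fifo_sc my_fifo_sc;
*
*	error = usb_fifo_attach(udev, sc, &sc_mtx, &my_methods,
*	    &my_fifo_sc, unit, 0xFFFF, iface_index,
*	    UID_ROOT, GID_OPERATOR, 0644);
*	if (error == 0)
*		error = usb_fifo_alloc_buffer(my_fifo_sc.fp[USB_FIFO_RX],
*		    bufsize, nbuf);
*	...
*	usb_fifo_detach(&my_fifo_sc);
*
* A subunit of 0xFFFF selects the "unit only" device name format used
* above.
*/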
/*------------------------------------------------------------------------*
* usb_fifo_alloc_buffer
*
* Return values:
* 0: Success
* Else failure
*------------------------------------------------------------------------*/
int
usb_fifo_alloc_buffer(struct usb_fifo *f, usb_size_t bufsize,
uint16_t nbuf)
{
usb_fifo_free_buffer(f);
/* allocate queue buffers */
f->free_q.ifq_maxlen = nbuf;
f->used_q.ifq_maxlen = nbuf;
f->queue_data = usb_alloc_mbufs(
M_USBDEV, &f->free_q, bufsize, nbuf);
if ((f->queue_data == NULL) && bufsize && nbuf) {
return (ENOMEM);
}
return (0); /* success */
}
/*------------------------------------------------------------------------*
* usb_fifo_free_buffer
*
* This function will free the buffers associated with a FIFO. This
* function can be called multiple times in a row.
*------------------------------------------------------------------------*/
void
usb_fifo_free_buffer(struct usb_fifo *f)
{
if (f->queue_data) {
/* free old buffer */
free(f->queue_data, M_USBDEV);
f->queue_data = NULL;
}
/* reset queues */
bzero(&f->free_q, sizeof(f->free_q));
bzero(&f->used_q, sizeof(f->used_q));
}
void
usb_fifo_detach(struct usb_fifo_sc *f_sc)
{
if (f_sc == NULL) {
return;
}
usb_fifo_free(f_sc->fp[USB_FIFO_TX]);
usb_fifo_free(f_sc->fp[USB_FIFO_RX]);
f_sc->fp[USB_FIFO_TX] = NULL;
f_sc->fp[USB_FIFO_RX] = NULL;
usb_destroy_dev(f_sc->dev);
f_sc->dev = NULL;
DPRINTFN(2, "detached %p\n", f_sc);
}
usb_size_t
usb_fifo_put_bytes_max(struct usb_fifo *f)
{
struct usb_mbuf *m;
usb_size_t len;
USB_IF_POLL(&f->free_q, m);
if (m) {
len = m->max_data_len;
} else {
len = 0;
}
return (len);
}
/*------------------------------------------------------------------------*
* usb_fifo_put_data
*
* what:
* 0 - normal operation
* 1 - set last packet flag to enforce framing
*------------------------------------------------------------------------*/
void
usb_fifo_put_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
usbd_copy_out(pc, offset, m->cur_data_ptr, io_len);
m->cur_data_len = io_len;
offset += io_len;
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
void
usb_fifo_put_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
while (len || (what == 1)) {
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
USB_MBUF_RESET(m);
io_len = MIN(len, m->cur_data_len);
bcopy(ptr, m->cur_data_ptr, io_len);
m->cur_data_len = io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
len -= io_len;
if ((len == 0) && (what == 1)) {
m->last_packet = 1;
}
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
if ((len == 0) || (what == 1)) {
break;
}
} else {
break;
}
}
}
uint8_t
usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len)
{
struct usb_mbuf *m;
USB_IF_DEQUEUE(&f->free_q, m);
if (m) {
m->cur_data_len = len;
m->cur_data_ptr = ptr;
USB_IF_ENQUEUE(&f->used_q, m);
usb_fifo_wakeup(f);
return (1);
}
return (0);
}
void
usb_fifo_put_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_fifo_get_data
*
* what:
* 0 - normal operation
* 1 - only get one "usb_mbuf"
*
* returns:
* 0 - no more data
* 1 - data in buffer
*------------------------------------------------------------------------*/
uint8_t
usb_fifo_get_data(struct usb_fifo *f, struct usb_page_cache *pc,
usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen,
uint8_t what)
{
struct usb_mbuf *m;
usb_frlength_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
usbd_copy_in(pc, offset, m->cur_data_ptr, io_len);
len -= io_len;
offset += io_len;
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
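/*
* Sketch of the "what" argument used by the queueing functions above:
* usb_fifo_put_data(f, pc, off, len, 1) marks the final usb_mbuf with
* last_packet so that a reader stops at the packet boundary, while
* usb_fifo_get_data(f, pc, off, len, &actlen, 1) consumes at most one
* usb_mbuf per call. Passing 0 gives plain streaming behaviour.
*/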
uint8_t
usb_fifo_get_data_linear(struct usb_fifo *f, void *ptr,
usb_size_t len, usb_size_t *actlen, uint8_t what)
{
struct usb_mbuf *m;
usb_size_t io_len;
uint8_t tr_data = 0;
actlen[0] = 0;
while (1) {
USB_IF_DEQUEUE(&f->used_q, m);
if (m) {
tr_data = 1;
io_len = MIN(len, m->cur_data_len);
bcopy(m->cur_data_ptr, ptr, io_len);
len -= io_len;
ptr = USB_ADD_BYTES(ptr, io_len);
actlen[0] += io_len;
m->cur_data_ptr += io_len;
m->cur_data_len -= io_len;
if ((m->cur_data_len == 0) || (what == 1)) {
USB_IF_ENQUEUE(&f->free_q, m);
usb_fifo_wakeup(f);
if (what == 1) {
break;
}
} else {
USB_IF_PREPEND(&f->used_q, m);
}
} else {
if (tr_data) {
/* wait for data to be written out */
break;
}
if (f->flag_flushing) {
/* check if we should send a short packet */
if (f->flag_short != 0) {
f->flag_short = 0;
tr_data = 1;
break;
}
/* flushing complete */
f->flag_flushing = 0;
usb_fifo_wakeup(f);
}
break;
}
if (len == 0) {
break;
}
}
return (tr_data);
}
uint8_t
usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, usb_size_t *plen)
{
struct usb_mbuf *m;
USB_IF_POLL(&f->used_q, m);
if (m) {
*plen = m->cur_data_len;
*pptr = m->cur_data_ptr;
return (1);
}
return (0);
}
void
usb_fifo_get_data_error(struct usb_fifo *f)
{
f->flag_iserror = 1;
usb_fifo_wakeup(f);
}
/*------------------------------------------------------------------------*
* usb_alloc_symlink
*
* Return values:
* NULL: Failure
* Else: Pointer to symlink entry
*------------------------------------------------------------------------*/
struct usb_symlink *
usb_alloc_symlink(const char *target)
{
struct usb_symlink *ps;
ps = malloc(sizeof(*ps), M_USBDEV, M_WAITOK);
if (ps == NULL) {
return (ps);
}
/* XXX no longer needed */
strlcpy(ps->src_path, target, sizeof(ps->src_path));
ps->src_len = strlen(ps->src_path);
strlcpy(ps->dst_path, target, sizeof(ps->dst_path));
ps->dst_len = strlen(ps->dst_path);
sx_xlock(&usb_sym_lock);
TAILQ_INSERT_TAIL(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
return (ps);
}
/*------------------------------------------------------------------------*
* usb_free_symlink
*------------------------------------------------------------------------*/
void
usb_free_symlink(struct usb_symlink *ps)
{
if (ps == NULL) {
return;
}
sx_xlock(&usb_sym_lock);
TAILQ_REMOVE(&usb_sym_head, ps, sym_entry);
sx_unlock(&usb_sym_lock);
free(ps, M_USBDEV);
}
/*------------------------------------------------------------------------*
* usb_read_symlink
*
* Return value:
* 0: Success
* Else: Failure
*------------------------------------------------------------------------*/
int
usb_read_symlink(uint8_t *user_ptr, uint32_t startentry, uint32_t user_len)
{
struct usb_symlink *ps;
uint32_t temp;
uint32_t delta = 0;
uint8_t len;
int error = 0;
sx_xlock(&usb_sym_lock);
TAILQ_FOREACH(ps, &usb_sym_head, sym_entry) {
/*
* Compute total length of source and destination symlink
* strings plus one length byte and two NUL bytes:
*/
temp = ps->src_len + ps->dst_len + 3;
if (temp > 255) {
/*
* Skip entry because this length cannot fit
* into one byte:
*/
continue;
}
if (startentry != 0) {
/* decrement read offset */
startentry--;
continue;
}
if (temp > user_len) {
/* out of buffer space */
break;
}
len = temp;
/* copy out total length */
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out source string */
error = copyout(ps->src_path,
USB_ADD_BYTES(user_ptr, delta), ps->src_len);
if (error) {
break;
}
len = 0;
delta += ps->src_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
/* copy out destination string */
error = copyout(ps->dst_path,
USB_ADD_BYTES(user_ptr, delta), ps->dst_len);
if (error) {
break;
}
len = 0;
delta += ps->dst_len;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
if (error) {
break;
}
delta += 1;
user_len -= temp;
}
/* a zero length entry indicates the end */
if ((user_len != 0) && (error == 0)) {
len = 0;
error = copyout(&len,
USB_ADD_BYTES(user_ptr, delta), 1);
}
sx_unlock(&usb_sym_lock);
return (error);
}
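/*
* Layout of the buffer filled in above, per entry (a worked example;
* the strings shown are hypothetical):
*
*	[len][src string]['\0'][dst string]['\0']
*
* where "len" is a single byte counting the whole entry, i.e.
* src_len + dst_len + 3. For src "ugen0.2" (7 bytes) and dst
* "usb/0.2.0" (9 bytes) the length byte would be 19. A single zero
* length byte terminates the list when buffer space remains.
*/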
void
usb_fifo_set_close_zlp(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* send a Zero Length Packet, ZLP, before close */
f->flag_short = onoff;
}
void
usb_fifo_set_write_defrag(struct usb_fifo *f, uint8_t onoff)
{
if (f == NULL)
return;
/* defrag written data */
f->flag_write_defrag = onoff;
/* reset defrag state */
f->flag_have_fragment = 0;
}
void *
usb_fifo_softc(struct usb_fifo *f)
{
return (f->priv_sc0);
}
#endif /* USB_HAVE_UGEN */
Index: head/sys/fs/nfsserver/nfs_nfsdport.c
===================================================================
--- head/sys/fs/nfsserver/nfs_nfsdport.c (revision 225616)
+++ head/sys/fs/nfsserver/nfs_nfsdport.c (revision 225617)
@@ -1,3331 +1,3331 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/capability.h>
/*
* Functions that perform the vfs operations required by the routines in
* nfsd_serv.c. It is hoped that this change will make the server more
* portable.
*/
#include <fs/nfs/nfsport.h>
#include <sys/hash.h>
#include <sys/sysctl.h>
#include <nlm/nlm_prot.h>
#include <nlm/nlm.h>
FEATURE(nfsd, "NFSv4 server");
extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
extern int nfsrv_useacl;
extern int newnfs_numnfsd;
extern struct mount nfsv4root_mnt;
extern struct nfsrv_stablefirst nfsrv_stablefirst;
extern void (*nfsd_call_servertimer)(void);
extern SVCPOOL *nfsrvd_pool;
struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
NFSDLOCKMUTEX;
struct mtx nfs_cache_mutex;
struct mtx nfs_v4root_mutex;
struct nfsrvfh nfs_rootfh, nfs_pubfh;
int nfs_pubfhset = 0, nfs_rootfhset = 0;
struct proc *nfsd_master_proc = NULL;
static pid_t nfsd_master_pid = (pid_t)-1;
static char nfsd_master_comm[MAXCOMLEN + 1];
static struct timeval nfsd_master_start;
static uint32_t nfsv4_sysid = 0;
static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
struct ucred *);
int nfsrv_enable_crossmntpt = 1;
static int nfs_commit_blks;
static int nfs_commit_miss;
extern int nfsrv_issuedelegs;
extern int nfsrv_dolocallocks;
SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW, 0, "New NFS server");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW,
&nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss,
0, "");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW,
&nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations");
SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW,
&nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files");
#define NUM_HEURISTIC 1017
#define NHUSE_INIT 64
#define NHUSE_INC 16
#define NHUSE_MAX 2048
static struct nfsheur {
struct vnode *nh_vp; /* vp to match (unreferenced pointer) */
off_t nh_nextr; /* next offset for sequential detection */
int nh_use; /* use count for selection */
int nh_seqcount; /* heuristic */
} nfsheur[NUM_HEURISTIC];
/*
* Get attributes into nfsvattr structure.
*/
int
nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, int vpislocked)
{
int error, lockedit = 0;
if (vpislocked == 0) {
/*
* When vpislocked == 0, the vnode is either exclusively
* locked by this thread or not locked by this thread.
* As such, shared lock it, if not exclusively locked.
*/
if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
lockedit = 1;
NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
}
}
error = VOP_GETATTR(vp, &nvap->na_vattr, cred);
if (lockedit != 0)
NFSVOPUNLOCK(vp, 0);
NFSEXITCODE(error);
return (error);
}
/*
* Get a file handle for a vnode.
*/
int
nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p)
{
int error;
NFSBZERO((caddr_t)fhp, sizeof(fhandle_t));
fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fhp->fh_fid);
NFSEXITCODE(error);
return (error);
}
/*
* Perform access checking for vnodes obtained from file handles that would
* refer to files already opened by a Unix client. You cannot just use
* vn_writechk() and VOP_ACCESSX() for two reasons.
* 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
* case.
* 2 - The owner is to be given access irrespective of mode bits for some
* operations, so that processes that chmod after opening a file don't
* break.
*/
int
nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred,
struct nfsexstuff *exp, struct thread *p, int override, int vpislocked,
u_int32_t *supportedtypep)
{
struct vattr vattr;
int error = 0, getret = 0;
if (vpislocked == 0) {
if (NFSVOPLOCK(vp, LK_SHARED) != 0) {
error = EPERM;
goto out;
}
}
if (accmode & VWRITE) {
/* Just vn_writechk() changed to check rdonly */
/*
* Disallow write attempts on read-only file systems;
* unless the file is a socket or a block or character
* device resident on the file system.
*/
if (NFSVNO_EXRDONLY(exp) ||
(vp->v_mount->mnt_flag & MNT_RDONLY)) {
switch (vp->v_type) {
case VREG:
case VDIR:
case VLNK:
error = EROFS;
default:
break;
}
}
/*
* If there's shared text associated with
* the inode, try to free it up once. If
* we fail, we can't allow writing.
*/
if ((vp->v_vflag & VV_TEXT) != 0 && error == 0)
error = ETXTBSY;
}
if (error != 0) {
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
goto out;
}
/*
* Should the override still be applied when ACLs are enabled?
*/
error = VOP_ACCESSX(vp, accmode, cred, p);
if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) {
/*
* Try again with VEXPLICIT_DENY, to see if the test for
* deletion is supported.
*/
error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p);
if (error == 0) {
if (vp->v_type == VDIR) {
accmode &= ~(VDELETE | VDELETE_CHILD);
accmode |= VWRITE;
error = VOP_ACCESSX(vp, accmode, cred, p);
} else if (supportedtypep != NULL) {
*supportedtypep &= ~NFSACCESS_DELETE;
}
}
}
/*
* Allow certain operations for the owner (reads and writes
* on files that are already open).
*/
if (override != NFSACCCHK_NOOVERRIDE &&
(error == EPERM || error == EACCES)) {
if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT))
error = 0;
else if (override & NFSACCCHK_ALLOWOWNER) {
getret = VOP_GETATTR(vp, &vattr, cred);
if (getret == 0 && cred->cr_uid == vattr.va_uid)
error = 0;
}
}
if (vpislocked == 0)
NFSVOPUNLOCK(vp, 0);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Set attribute(s) vnop.
*/
int
nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
int error;
error = VOP_SETATTR(vp, &nvap->na_vattr, cred);
NFSEXITCODE(error);
return (error);
}
/*
* Set up nameidata for a lookup() call and do it
* For the cases where we are crossing mount points
* (looking up the public fh path or the v4 root path when
* not using a pseudo-root fs), set/release the Giant lock,
* as required.
*/
int
nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p,
struct vnode **retdirp)
{
struct componentname *cnp = &ndp->ni_cnd;
int i;
struct iovec aiov;
struct uio auio;
int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen;
int error = 0, crossmnt;
char *cp;
*retdirp = NULL;
cnp->cn_nameptr = cnp->cn_pnbuf;
ndp->ni_strictrelative = 0;
/*
* Extract and set starting directory.
*/
if (dp->v_type != VDIR) {
if (islocked)
vput(dp);
else
vrele(dp);
nfsvno_relpathbuf(ndp);
error = ENOTDIR;
goto out1;
}
if (islocked)
NFSVOPUNLOCK(dp, 0);
VREF(dp);
*retdirp = dp;
if (NFSVNO_EXRDONLY(exp))
cnp->cn_flags |= RDONLY;
ndp->ni_segflg = UIO_SYSSPACE;
crossmnt = 1;
if (nd->nd_flag & ND_PUBLOOKUP) {
ndp->ni_loopcnt = 0;
if (cnp->cn_pnbuf[0] == '/') {
vrele(dp);
/*
* Check for degenerate pathnames here, since lookup()
* panics on them.
*/
for (i = 1; i < ndp->ni_pathlen; i++)
if (cnp->cn_pnbuf[i] != '/')
break;
if (i == ndp->ni_pathlen) {
error = NFSERR_ACCES;
goto out;
}
dp = rootvnode;
VREF(dp);
}
} else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) ||
(nd->nd_flag & ND_NFSV4) == 0) {
/*
* Only cross mount points for NFSv4 when doing a
* mount while traversing the file system above
* the mount point, unless nfsrv_enable_crossmntpt is set.
*/
cnp->cn_flags |= NOCROSSMOUNT;
crossmnt = 0;
}
/*
* Initialize for scan, set ni_startdir and bump ref on dp again
* because lookup() will dereference ni_startdir.
*/
cnp->cn_thread = p;
ndp->ni_startdir = dp;
ndp->ni_rootdir = rootvnode;
if (!lockleaf)
cnp->cn_flags |= LOCKLEAF;
for (;;) {
cnp->cn_nameptr = cnp->cn_pnbuf;
/*
* Call lookup() to do the real work. If an error occurs,
* ndp->ni_vp and ni_dvp are left uninitialized or NULL and
* we do not have to dereference anything before returning.
* In either case ni_startdir will be dereferenced and NULLed
* out.
*/
error = lookup(ndp);
if (error)
break;
/*
* Check for encountering a symbolic link. Trivial
* termination occurs if no symlink encountered.
*/
if ((cnp->cn_flags & ISSYMLINK) == 0) {
if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
nfsvno_relpathbuf(ndp);
if (ndp->ni_vp && !lockleaf)
NFSVOPUNLOCK(ndp->ni_vp, 0);
break;
}
/*
* Validate symlink
*/
if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
NFSVOPUNLOCK(ndp->ni_dvp, 0);
if (!(nd->nd_flag & ND_PUBLOOKUP)) {
error = EINVAL;
goto badlink2;
}
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
error = ELOOP;
goto badlink2;
}
if (ndp->ni_pathlen > 1)
cp = uma_zalloc(namei_zone, M_WAITOK);
else
cp = cnp->cn_pnbuf;
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = NULL;
auio.uio_resid = MAXPATHLEN;
error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
if (error) {
badlink1:
if (ndp->ni_pathlen > 1)
uma_zfree(namei_zone, cp);
badlink2:
vrele(ndp->ni_dvp);
vput(ndp->ni_vp);
break;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
error = ENOENT;
goto badlink1;
}
if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto badlink1;
}
/*
* Adjust or replace path
*/
if (ndp->ni_pathlen > 1) {
NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
uma_zfree(namei_zone, cnp->cn_pnbuf);
cnp->cn_pnbuf = cp;
} else
cnp->cn_pnbuf[linklen] = '\0';
ndp->ni_pathlen += linklen;
/*
* Cleanup refs for next loop and check if root directory
* should replace current directory. Normally ni_dvp
* becomes the new base directory and is cleaned up when
* we loop. Explicitly null pointers after invalidation
* to clarify operation.
*/
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
if (cnp->cn_pnbuf[0] == '/') {
vrele(ndp->ni_dvp);
ndp->ni_dvp = ndp->ni_rootdir;
VREF(ndp->ni_dvp);
}
ndp->ni_startdir = ndp->ni_dvp;
ndp->ni_dvp = NULL;
}
if (!lockleaf)
cnp->cn_flags &= ~LOCKLEAF;
out:
if (error) {
uma_zfree(namei_zone, cnp->cn_pnbuf);
ndp->ni_vp = NULL;
ndp->ni_dvp = NULL;
ndp->ni_startdir = NULL;
cnp->cn_flags &= ~HASBUF;
} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
ndp->ni_dvp = NULL;
}
out1:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Set up a pathname buffer and return a pointer to it and, optionally,
* set a hash pointer.
*/
void
nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp)
{
struct componentname *cnp = &ndp->ni_cnd;
cnp->cn_flags |= (NOMACCHECK | HASBUF);
cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
if (hashpp != NULL)
*hashpp = NULL;
*bufpp = cnp->cn_pnbuf;
}
/*
* Release the above path buffer, if not released by nfsvno_namei().
*/
void
nfsvno_relpathbuf(struct nameidata *ndp)
{
if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
panic("nfsrelpath");
uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
ndp->ni_cnd.cn_flags &= ~HASBUF;
}
/*
* Readlink vnode op into an mbuf list.
*/
int
nfsvno_readlink(struct vnode *vp, struct ucred *cred, struct thread *p,
struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
{
struct iovec iv[(NFS_MAXPATHLEN+MLEN-1)/MLEN];
struct iovec *ivp = iv;
struct uio io, *uiop = &io;
struct mbuf *mp, *mp2 = NULL, *mp3 = NULL;
int i, len, tlen, error = 0;
len = 0;
i = 0;
while (len < NFS_MAXPATHLEN) {
NFSMGET(mp);
MCLGET(mp, M_WAIT);
mp->m_len = NFSMSIZ(mp);
if (len == 0) {
mp3 = mp2 = mp;
} else {
mp2->m_next = mp;
mp2 = mp;
}
if ((len + mp->m_len) > NFS_MAXPATHLEN) {
mp->m_len = NFS_MAXPATHLEN - len;
len = NFS_MAXPATHLEN;
} else {
len += mp->m_len;
}
ivp->iov_base = mtod(mp, caddr_t);
ivp->iov_len = mp->m_len;
i++;
ivp++;
}
uiop->uio_iov = iv;
uiop->uio_iovcnt = i;
uiop->uio_offset = 0;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = NULL;
error = VOP_READLINK(vp, uiop, cred);
if (error) {
m_freem(mp3);
*lenp = 0;
goto out;
}
if (uiop->uio_resid > 0) {
len -= uiop->uio_resid;
tlen = NFSM_RNDUP(len);
nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len);
}
*lenp = len;
*mpp = mp3;
*mpendp = mp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Read vnode op call into mbuf list.
*/
int
nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
{
struct mbuf *m;
int i;
struct iovec *iv;
struct iovec *iv2;
int error = 0, len, left, siz, tlen, ioflag = 0, hi, try = 32;
struct mbuf *m2 = NULL, *m3;
struct uio io, *uiop = &io;
struct nfsheur *nh;
/*
* Calculate seqcount for heuristic
*/
/*
* Locate best candidate
*/
hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
nh = &nfsheur[hi];
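/*
 * Probe up to "try" hash slots looking for this vnode, aging unused
 * entries and remembering the least-used slot as a replacement
 * candidate if the vnode is not found.
 */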
while (try--) {
if (nfsheur[hi].nh_vp == vp) {
nh = &nfsheur[hi];
break;
}
if (nfsheur[hi].nh_use > 0)
--nfsheur[hi].nh_use;
hi = (hi + 1) % NUM_HEURISTIC;
if (nfsheur[hi].nh_use < nh->nh_use)
nh = &nfsheur[hi];
}
if (nh->nh_vp != vp) {
nh->nh_vp = vp;
nh->nh_nextr = off;
nh->nh_use = NHUSE_INIT;
if (off == 0)
nh->nh_seqcount = 4;
else
nh->nh_seqcount = 1;
}
/*
* Calculate heuristic
*/
if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) {
if (++nh->nh_seqcount > IO_SEQMAX)
nh->nh_seqcount = IO_SEQMAX;
} else if (nh->nh_seqcount > 1) {
nh->nh_seqcount = 1;
} else {
nh->nh_seqcount = 0;
}
nh->nh_use += NHUSE_INC;
if (nh->nh_use > NHUSE_MAX)
nh->nh_use = NHUSE_MAX;
ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
len = left = NFSM_RNDUP(cnt);
m3 = NULL;
/*
* Generate the mbuf list with the uio_iov ref. to it.
*/
i = 0;
while (left > 0) {
NFSMGET(m);
MCLGET(m, M_WAIT);
m->m_len = 0;
siz = min(M_TRAILINGSPACE(m), left);
left -= siz;
i++;
if (m3)
m2->m_next = m;
else
m3 = m;
m2 = m;
}
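/*
 * Allocate an iovec for each mbuf and point it at the mbuf's free
 * space, so that VOP_READ() fills the mbuf chain directly.
 */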
MALLOC(iv, struct iovec *, i * sizeof (struct iovec),
M_TEMP, M_WAITOK);
uiop->uio_iov = iv2 = iv;
m = m3;
left = len;
i = 0;
while (left > 0) {
if (m == NULL)
panic("nfsvno_read iov");
siz = min(M_TRAILINGSPACE(m), left);
if (siz > 0) {
iv->iov_base = mtod(m, caddr_t) + m->m_len;
iv->iov_len = siz;
m->m_len += siz;
left -= siz;
iv++;
i++;
}
m = m->m_next;
}
uiop->uio_iovcnt = i;
uiop->uio_offset = off;
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
FREE((caddr_t)iv2, M_TEMP);
if (error) {
m_freem(m3);
*mpp = NULL;
goto out;
}
tlen = len - uiop->uio_resid;
cnt = cnt < tlen ? cnt : tlen;
tlen = NFSM_RNDUP(cnt);
if (tlen == 0) {
m_freem(m3);
m3 = NULL;
} else if (len != tlen || tlen != cnt)
nfsrv_adj(m3, len - tlen, tlen - cnt);
*mpp = m3;
*mpendp = m2;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Write vnode op from an mbuf list.
*/
int
nfsvno_write(struct vnode *vp, off_t off, int retlen, int cnt, int stable,
struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p)
{
struct iovec *ivp;
int i, len;
struct iovec *iv;
int ioflags, error;
struct uio io, *uiop = &io;
MALLOC(ivp, struct iovec *, cnt * sizeof (struct iovec), M_TEMP,
M_WAITOK);
uiop->uio_iov = iv = ivp;
uiop->uio_iovcnt = cnt;
i = mtod(mp, caddr_t) + mp->m_len - cp;
len = retlen;
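/*
 * Walk the mbuf chain, building an iovec for the data held in each
 * mbuf, starting at cp within the first mbuf.
 */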
while (len > 0) {
if (mp == NULL)
panic("nfsvno_write");
if (i > 0) {
i = min(i, len);
ivp->iov_base = cp;
ivp->iov_len = i;
ivp++;
len -= i;
}
mp = mp->m_next;
if (mp) {
i = mp->m_len;
cp = mtod(mp, caddr_t);
}
}
if (stable == NFSWRITE_UNSTABLE)
ioflags = IO_NODELOCKED;
else
ioflags = (IO_SYNC | IO_NODELOCKED);
uiop->uio_resid = retlen;
uiop->uio_rw = UIO_WRITE;
uiop->uio_segflg = UIO_SYSSPACE;
NFSUIOPROC(uiop, p);
uiop->uio_offset = off;
error = VOP_WRITE(vp, uiop, ioflags, cred);
FREE((caddr_t)iv, M_TEMP);
NFSEXITCODE(error);
return (error);
}
/*
* Common code for creating a regular file (plus special files for V2).
*/
int
nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp,
struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp,
int32_t *cverf, NFSDEV_T rdev, struct thread *p, struct nfsexstuff *exp)
{
u_quad_t tempsize;
int error;
error = nd->nd_repstat;
if (!error && ndp->ni_vp == NULL) {
if (nvap->na_type == VREG || nvap->na_type == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!error) {
if (*exclusive_flagp) {
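/*
 * For an exclusive create, stash the create verifier
 * in the file's atime via VOP_SETATTR(), so that a
 * retried create can be recognized (a common NFS
 * server convention).
 */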
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
error = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, nd->nd_cred);
}
}
/*
* NFS V2 Only. nfsrvd_mknod() does this for V3.
* (This implies, just get out on an error.)
*/
} else if (nvap->na_type == VCHR || nvap->na_type == VBLK ||
nvap->na_type == VFIFO) {
if (nvap->na_type == VCHR && rdev == 0xffffffff)
nvap->na_type = VFIFO;
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(nd->nd_cred,
PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
nvap->na_rdev = rdev;
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
if (error)
goto out;
} else {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = ENXIO;
goto out;
}
*vpp = ndp->ni_vp;
} else {
/*
* Handle cases where error is already set and/or
* the file exists.
* 1 - clean up the lookup
* 2 - iff !error and na_size set, truncate it
*/
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
*vpp = ndp->ni_vp;
if (ndp->ni_dvp == *vpp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (!error && nvap->na_size != VNOVAL) {
error = nfsvno_accchk(*vpp, VWRITE,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (!error) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
error = VOP_SETATTR(*vpp,
&nvap->na_vattr, nd->nd_cred);
}
}
if (error)
vput(*vpp);
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Do a mknod vnode op.
*/
int
nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred,
struct thread *p)
{
int error = 0;
enum vtype vtyp;
vtyp = nvap->na_type;
/*
* Iff doesn't exist, create it.
*/
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
error = NFSERR_BADTYPE;
goto out;
}
if (vtyp == VSOCK) {
vrele(ndp->ni_startdir);
error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
} else {
if (nvap->na_type != VFIFO &&
(error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV, 0))) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vput(ndp->ni_dvp);
goto out;
}
error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
vrele(ndp->ni_startdir);
/*
* Since VOP_MKNOD returns the ni_vp, I can't
* see any reason to do the lookup.
*/
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Mkdir vnode op.
*/
int
nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid,
struct ucred *cred, struct thread *p, struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp != NULL) {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
nfsvno_relpathbuf(ndp);
error = EEXIST;
goto out;
}
error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* symlink vnode op.
*/
int
nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp,
int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp)
{
int error = 0;
if (ndp->ni_vp) {
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vrele(ndp->ni_vp);
error = EEXIST;
goto out;
}
error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
&nvap->na_vattr, pathcp);
vput(ndp->ni_dvp);
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
/*
* Although FreeBSD still had the lookup code in
* it for 7/current, there doesn't seem to be any
* point, since VOP_SYMLINK() returns the ni_vp.
* Just vput it for v2.
*/
if (!not_v2 && !error)
vput(ndp->ni_vp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Parse symbolic link arguments.
* This function has an ugly side effect. It will MALLOC() an area for
* the symlink and set *pathcpp to point to it, but only if it succeeds.
* So, if it returns with *pathcpp != NULL, that area must
* be FREE'd later.
*/
int
nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap,
struct thread *p, char **pathcpp, int *lenp)
{
u_int32_t *tl;
char *pathcp = NULL;
int error = 0, len;
struct nfsv2_sattr *sp;
*pathcpp = NULL;
*lenp = 0;
if ((nd->nd_flag & ND_NFSV3) &&
(error = nfsrv_sattr(nd, nvap, NULL, NULL, p)))
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
len = fxdr_unsigned(int, *tl);
if (len > NFS_MAXPATHLEN || len <= 0) {
error = EBADRPC;
goto nfsmout;
}
MALLOC(pathcp, caddr_t, len + 1, M_TEMP, M_WAITOK);
error = nfsrv_mtostr(nd, pathcp, len);
if (error)
goto nfsmout;
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode);
}
*pathcpp = pathcp;
*lenp = len;
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
if (pathcp)
free(pathcp, M_TEMP);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Remove a non-directory object.
*/
int
nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type == VDIR)
error = NFSERR_ISDIR;
else if (is_v4)
error = nfsrv_checkremove(vp, 1, p);
if (!error)
error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
NFSEXITCODE(error);
return (error);
}
/*
* Remove a directory.
*/
int
nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *vp;
int error = 0;
vp = ndp->ni_vp;
if (vp->v_type != VDIR) {
error = ENOTDIR;
goto out;
}
/*
* No rmdir "." please.
*/
if (ndp->ni_dvp == vp) {
error = EINVAL;
goto out;
}
/*
* The root of a mounted filesystem cannot be deleted.
*/
if (vp->v_vflag & VV_ROOT)
error = EBUSY;
out:
if (!error)
error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd);
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
vput(vp);
NFSEXITCODE(error);
return (error);
}
/*
* Rename vnode op.
*/
int
nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p)
{
struct vnode *fvp, *tvp, *tdvp;
int error = 0;
fvp = fromndp->ni_vp;
if (ndstat) {
vrele(fromndp->ni_dvp);
vrele(fvp);
error = ndstat;
goto out1;
}
tdvp = tondp->ni_dvp;
tvp = tondp->ni_vp;
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST;
goto out;
} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST;
goto out;
}
if (tvp->v_type == VDIR && tvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
/*
* A rename to '.' or '..' results in a prematurely
* unlocked vnode on FreeBSD5, so I'm just going to fail that
* here.
*/
if ((tondp->ni_cnd.cn_namelen == 1 &&
tondp->ni_cnd.cn_nameptr[0] == '.') ||
(tondp->ni_cnd.cn_namelen == 2 &&
tondp->ni_cnd.cn_nameptr[0] == '.' &&
tondp->ni_cnd.cn_nameptr[1] == '.')) {
error = EINVAL;
goto out;
}
}
if (fvp->v_type == VDIR && fvp->v_mountedhere) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp->v_mount != tdvp->v_mount) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
goto out;
}
if (fvp == tdvp) {
error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL;
goto out;
}
if (fvp == tvp) {
/*
* If source and destination are the same, there is nothing to
* do. Set error to -1 to indicate this.
*/
error = -1;
goto out;
}
if (ndflag & ND_NFSV4) {
if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) {
error = nfsrv_checkremove(fvp, 0, p);
NFSVOPUNLOCK(fvp, 0);
} else
error = EPERM;
if (tvp && !error)
error = nfsrv_checkremove(tvp, 1, p);
} else {
/*
* For NFSv2 and NFSv3, try to get rid of the delegation, so
* that the NFSv4 client won't be confused by the rename.
* Since nfsd_recalldelegation() can only be called on an
* unlocked vnode at this point and fvp is the file that will
* still exist after the rename, just do fvp.
*/
nfsd_recalldelegation(fvp, p);
}
out:
if (!error) {
error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
&fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
&tondp->ni_cnd);
} else {
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
if (tvp)
vput(tvp);
vrele(fromndp->ni_dvp);
vrele(fvp);
if (error == -1)
error = 0;
}
vrele(tondp->ni_startdir);
nfsvno_relpathbuf(tondp);
out1:
vrele(fromndp->ni_startdir);
nfsvno_relpathbuf(fromndp);
NFSEXITCODE(error);
return (error);
}
/*
* Link vnode op.
*/
int
nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred,
struct thread *p, struct nfsexstuff *exp)
{
struct vnode *xp;
int error = 0;
xp = ndp->ni_vp;
if (xp != NULL) {
error = EEXIST;
} else {
xp = ndp->ni_dvp;
if (vp->v_mount != xp->v_mount)
error = EXDEV;
}
if (!error) {
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
if ((vp->v_iflag & VI_DOOMED) == 0)
error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd);
else
error = EPERM;
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
NFSVOPUNLOCK(vp, 0);
} else {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vrele(ndp->ni_vp);
}
nfsvno_relpathbuf(ndp);
NFSEXITCODE(error);
return (error);
}
/*
* Do the fsync() appropriate for the commit.
*/
int
nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred,
struct thread *td)
{
int error = 0;
if (cnt > MAX_COMMIT_COUNT) {
/*
* Give up and do the whole thing
*/
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
VM_OBJECT_UNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
} else {
/*
* Locate and synchronously write any buffers that fall
* into the requested range. Note: we are assuming that
* f_iosize is a power of 2.
*/
int iosize = vp->v_mount->mnt_stat.f_iosize;
int iomask = iosize - 1;
struct bufobj *bo;
daddr_t lblkno;
/*
* Align to iosize boundary, super-align to page boundary.
*/
if (off & iomask) {
cnt += off & iomask;
off &= ~(u_quad_t)iomask;
}
if (off & PAGE_MASK) {
cnt += off & PAGE_MASK;
off &= ~(u_quad_t)PAGE_MASK;
}
lblkno = off / iosize;
if (vp->v_object &&
(vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, off, off + cnt,
OBJPC_SYNC);
VM_OBJECT_UNLOCK(vp->v_object);
}
bo = &vp->v_bufobj;
BO_LOCK(bo);
while (cnt > 0) {
struct buf *bp;
/*
* If we have a buffer and it is marked B_DELWRI we
* have to lock and write it. Otherwise the prior
* write is assumed to have already been committed.
*
* gbincore() can return invalid buffers now so we
* have to check that bit as well (though B_DELWRI
* should not be set if B_INVAL is set there could be
* a race here since we haven't locked the buffer).
*/
if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
LK_INTERLOCK, BO_MTX(bo)) == ENOLCK) {
BO_LOCK(bo);
continue; /* retry */
}
if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
B_DELWRI) {
bremfree(bp);
bp->b_flags &= ~B_ASYNC;
bwrite(bp);
++nfs_commit_miss;
} else
BUF_UNLOCK(bp);
BO_LOCK(bo);
}
++nfs_commit_blks;
if (cnt < iosize)
break;
cnt -= iosize;
++lblkno;
}
BO_UNLOCK(bo);
}
NFSEXITCODE(error);
return (error);
}
/*
* Statfs vnode op.
*/
int
nfsvno_statfs(struct vnode *vp, struct statfs *sf)
{
int error;
error = VFS_STATFS(vp->v_mount, sf);
if (error == 0) {
/*
* Since NFS handles these values as unsigned on the
* wire, there is no way to represent negative values,
* so set them to 0. Without this, they will appear
* to be very large positive values for clients like
* Solaris10.
*/
if (sf->f_bavail < 0)
sf->f_bavail = 0;
if (sf->f_ffree < 0)
sf->f_ffree = 0;
}
NFSEXITCODE(error);
return (error);
}
/*
* Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but
* must handle nfsrv_opencheck() calls after any other access checks.
*/
void
nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp,
nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp,
int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create,
NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred, struct thread *p,
struct nfsexstuff *exp, struct vnode **vpp)
{
struct vnode *vp = NULL;
u_quad_t tempsize;
struct nfsexstuff nes;
if (ndp->ni_vp == NULL)
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, NULL, nd, p, nd->nd_repstat);
if (!nd->nd_repstat) {
if (ndp->ni_vp == NULL) {
vrele(ndp->ni_startdir);
nd->nd_repstat = VOP_CREATE(ndp->ni_dvp,
&ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
vput(ndp->ni_dvp);
nfsvno_relpathbuf(ndp);
if (!nd->nd_repstat) {
if (*exclusive_flagp) {
*exclusive_flagp = 0;
NFSVNO_ATTRINIT(nvap);
nvap->na_atime.tv_sec = cverf[0];
nvap->na_atime.tv_nsec = cverf[1];
nd->nd_repstat = VOP_SETATTR(ndp->ni_vp,
&nvap->na_vattr, cred);
} else {
nfsrv_fixattr(nd, ndp->ni_vp, nvap,
aclp, p, attrbitp, exp);
}
}
vp = ndp->ni_vp;
} else {
if (ndp->ni_startdir)
vrele(ndp->ni_startdir);
nfsvno_relpathbuf(ndp);
vp = ndp->ni_vp;
if (create == NFSV4OPEN_CREATE) {
if (ndp->ni_dvp == vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
}
if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) {
if (ndp->ni_cnd.cn_flags & RDONLY)
NFSVNO_SETEXRDONLY(&nes);
else
NFSVNO_EXINIT(&nes);
nd->nd_repstat = nfsvno_accchk(vp,
VWRITE, cred, &nes, p,
NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
if (!nd->nd_repstat) {
tempsize = nvap->na_size;
NFSVNO_ATTRINIT(nvap);
nvap->na_size = tempsize;
nd->nd_repstat = VOP_SETATTR(vp,
&nvap->na_vattr, cred);
}
} else if (vp->v_type == VREG) {
nd->nd_repstat = nfsrv_opencheck(clientid,
stateidp, stp, vp, nd, p, nd->nd_repstat);
}
}
} else {
if (ndp->ni_cnd.cn_flags & HASBUF)
nfsvno_relpathbuf(ndp);
if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) {
vrele(ndp->ni_startdir);
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
if (ndp->ni_vp)
vput(ndp->ni_vp);
}
}
*vpp = vp;
NFSEXITCODE2(0, nd);
}
/*
* Updates the file rev and sets the mtime and ctime
* to the current clock time, returning the va_filerev and va_Xtime
* values.
*/
void
nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap,
struct ucred *cred, struct thread *p)
{
struct vattr va;
VATTR_NULL(&va);
getnanotime(&va.va_mtime);
(void) VOP_SETATTR(vp, &va, cred);
(void) nfsvno_getattr(vp, nvap, cred, p, 1);
}
/*
* Glue routine to nfsv4_fillattr().
*/
int
nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
struct ucred *cred, struct thread *p, int isdgram, int reterr,
int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
{
int error;
error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
mounted_on_fileno);
NFSEXITCODE2(0, nd);
return (error);
}
/* Since the Readdir vnode ops vary, put the entire functions in here. */
/*
* nfs readdir service
* - mallocs what it thinks is enough to read
* count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR
* - calls VOP_READDIR()
* - loops around building the reply
* if the output generated exceeds count break out of loop
* The NFSM_CLGET macro is used here so that the reply will be packed
* tightly in mbuf clusters.
* - it trims out records with d_fileno == 0
* this doesn't matter for Unix clients, but such records might confuse
* clients of other OSes.
* - it trims out records with d_type == DT_WHT
* these cannot be seen through NFS (unless we extend the protocol)
* The alternate call nfsrvd_readdirplus() does lookups as well.
* PS: The NFS protocol spec does not clarify what the "count" byte
* argument is a count of: just the name strings and file ids, the
* entire reply rpc, or something else.
* I tried just file name and id sizes and it confused the Sun client,
* so I am using the full rpc size now. The "paranoia.." comment refers
* to including the status longwords that are not a part of the dir.
* "entry" structures, but are in the rpc.
*/
int
nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct nfsvattr at;
int nlen, error = 0, getret = 1;
int siz, cnt, fullsiz, eofflag, ncookies;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
struct uio io;
struct iovec iv;
int not_zfs;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
if (nd->nd_flag & ND_NFSV2) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
off = fxdr_unsigned(u_quad_t, *tl++);
} else {
NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
}
toff = off;
cnt = fxdr_unsigned(int, *tl);
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
fullsiz = siz;
if (nd->nd_flag & ND_NFSV3) {
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred,
p, 1);
#if 0
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
if (!nd->nd_repstat && toff && verf != at.na_filerev)
nd->nd_repstat = NFSERR_BAD_COOKIE;
#endif
}
if (nd->nd_repstat == 0 && cnt == 0) {
if (nd->nd_flag & ND_NFSV2)
/* NFSv2 does not have NFSERR_TOOSMALL */
nd->nd_repstat = EPERM;
else
nd->nd_repstat = NFSERR_TOOSMALL;
}
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
not_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs");
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
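/*
 * Set up the uio so that VOP_READDIR() reads a block of directory
 * entries into rbuf.
 */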
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (nd->nd_flag & ND_NFSV3) {
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat)
nd->nd_repstat = getret;
}
/*
* Handles the failed cases. nd->nd_repstat == 0 past here.
*/
if (nd->nd_repstat) {
vput(vp);
free((caddr_t)rbuf, M_TEMP);
if (cookies)
free((caddr_t)cookies, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV2) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
} else {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
}
*tl++ = newnfs_false;
*tl = newnfs_true;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
* Since the offset cookies don't monotonically increase for ZFS,
* this is not done when ZFS is the file system.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(not_zfs != 0 && ((u_quad_t)(*cookiep)) <= toff))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
vput(vp);
/*
* dirlen is the size of the reply, including all XDR, and must
* not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate
* if the XDR should be included in "count", but to be safe, we do.
* (Include the two booleans at the end of the reply in dirlen now.)
*/
if (nd->nd_flag & ND_NFSV3) {
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
} else {
dirlen = 2 * NFSX_UNSIGNED;
}
/* Loop through the records and build reply */
while (cpos < cend && ncookies > 0) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN) {
if (nd->nd_flag & ND_NFSV3)
dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
else
dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
if (dirlen > cnt) {
eofflag = 0;
break;
}
/*
* Build the directory record xdr from
* the dirent entry.
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
} else {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
}
*tl = txdr_unsigned(dp->d_fileno);
(void) nfsm_strtom(nd, dp->d_name, nlen);
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
} else
NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
*tl = txdr_unsigned(*cookiep);
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos < cend)
eofflag = 0;
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
FREE((caddr_t)rbuf, M_TEMP);
FREE((caddr_t)cookies, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Readdirplus for V3 and Readdir for V4.
*/
int
nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
struct vnode *vp, struct thread *p, struct nfsexstuff *exp)
{
struct dirent *dp;
u_int32_t *tl;
int dirlen;
char *cpos, *cend, *rbuf;
struct vnode *nvp;
fhandle_t nfh;
struct nfsvattr nva, at, *nvap = &nva;
struct mbuf *mb0, *mb1;
struct nfsreferral *refp;
int nlen, r, error = 0, getret = 1, usevget = 1;
int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
caddr_t bpos0, bpos1;
u_int64_t off, toff, verf;
u_long *cookies = NULL, *cookiep;
nfsattrbit_t attrbits, rderrbits, savbits;
struct uio io;
struct iovec iv;
struct componentname cn;
int at_root, needs_unbusy, not_zfs, supports_nfsv4acls;
struct mount *mp, *new_mp;
uint64_t mounted_on_fileno;
if (nd->nd_repstat) {
nfsrv_postopattr(nd, getret, &at);
goto out;
}
NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
off = fxdr_hyper(tl);
toff = off;
tl += 2;
verf = fxdr_hyper(tl);
tl += 2;
siz = fxdr_unsigned(int, *tl++);
cnt = fxdr_unsigned(int, *tl);
/*
* Use the server's maximum data transfer size as the upper bound
* on reply datalen.
*/
if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
cnt = NFS_SRVMAXDATA(nd);
/*
* siz is a "hint" of how much directory information (name, fileid,
* cookie) should be in the reply. At least one client "hints" 0,
* so I set it to cnt for that case. I also round it up to the
* next multiple of DIRBLKSIZ.
*/
if (siz <= 0)
siz = cnt;
siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
if (nd->nd_flag & ND_NFSV4) {
error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
if (error)
goto nfsmout;
NFSSET_ATTRBIT(&savbits, &attrbits);
NFSCLRNOTFILLABLE_ATTRBIT(&attrbits);
NFSZERO_ATTRBIT(&rderrbits);
NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR);
} else {
NFSZERO_ATTRBIT(&attrbits);
}
fullsiz = siz;
nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!nd->nd_repstat) {
if (off && verf != at.na_filerev) {
/*
* va_filerev is not sufficient as a cookie verifier,
* since it is not supposed to change when entries are
* removed/added unless the offset cookies returned to
* the client are no longer valid.
*/
#if 0
if (nd->nd_flag & ND_NFSV4) {
nd->nd_repstat = NFSERR_NOTSAME;
} else {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
#endif
} else if ((nd->nd_flag & ND_NFSV4) && off == 0 && verf != 0) {
nd->nd_repstat = NFSERR_BAD_COOKIE;
}
}
if (!nd->nd_repstat && vp->v_type != VDIR)
nd->nd_repstat = NFSERR_NOTDIR;
if (!nd->nd_repstat && cnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (!nd->nd_repstat)
nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
NFSACCCHK_VPISLOCKED, NULL);
if (nd->nd_repstat) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
not_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs");
MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
again:
eofflag = 0;
if (cookies) {
free((caddr_t)cookies, M_TEMP);
cookies = NULL;
}
iv.iov_base = rbuf;
iv.iov_len = siz;
io.uio_iov = &iv;
io.uio_iovcnt = 1;
io.uio_offset = (off_t)off;
io.uio_resid = siz;
io.uio_segflg = UIO_SYSSPACE;
io.uio_rw = UIO_READ;
io.uio_td = NULL;
nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
&cookies);
off = (u_int64_t)io.uio_offset;
if (io.uio_resid)
siz -= io.uio_resid;
getret = nfsvno_getattr(vp, &at, nd->nd_cred, p, 1);
if (!cookies && !nd->nd_repstat)
nd->nd_repstat = NFSERR_PERM;
if (!nd->nd_repstat)
nd->nd_repstat = getret;
if (nd->nd_repstat) {
vput(vp);
if (cookies)
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* If nothing read, return eof
* rpc reply
*/
if (siz == 0) {
vput(vp);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
txdr_hyper(at.na_filerev, tl);
tl += 2;
*tl++ = newnfs_false;
*tl = newnfs_true;
free((caddr_t)cookies, M_TEMP);
free((caddr_t)rbuf, M_TEMP);
goto out;
}
/*
* Check for degenerate cases of nothing useful read.
* If so go try again
*/
cpos = rbuf;
cend = rbuf + siz;
dp = (struct dirent *)cpos;
cookiep = cookies;
/*
* For some reason FreeBSD's ufs_readdir() chooses to back the
* directory offset up to a block boundary, so it is necessary to
* skip over the records that precede the requested offset. This
* requires the assumption that file offset cookies monotonically
* increase.
* Since the offset cookies don't monotonically increase for ZFS,
* this is not done when ZFS is the file system.
*/
while (cpos < cend && ncookies > 0 &&
(dp->d_fileno == 0 || dp->d_type == DT_WHT ||
(not_zfs != 0 && ((u_quad_t)(*cookiep)) <= toff) ||
((nd->nd_flag & ND_NFSV4) &&
((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
(dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) {
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
if (cpos >= cend || ncookies == 0) {
siz = fullsiz;
toff = off;
goto again;
}
/*
* Busy the file system so that the mount point won't go away
* and, as such, VFS_VGET() can be used safely.
*/
mp = vp->v_mount;
vfs_ref(mp);
NFSVOPUNLOCK(vp, 0);
nd->nd_repstat = vfs_busy(mp, 0);
vfs_rel(mp);
if (nd->nd_repstat != 0) {
vrele(vp);
free(cookies, M_TEMP);
free(rbuf, M_TEMP);
if (nd->nd_flag & ND_NFSV3)
nfsrv_postopattr(nd, getret, &at);
goto out;
}
/*
* Save this position, in case there is an error before one entry
* is created.
*/
mb0 = nd->nd_mb;
bpos0 = nd->nd_bpos;
/*
* Fill in the first part of the reply.
* dirlen is the reply length in bytes and cannot exceed cnt.
* (Include the two booleans at the end of the reply in dirlen now,
* so we recognize when we have exceeded cnt.)
*/
if (nd->nd_flag & ND_NFSV3) {
dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
nfsrv_postopattr(nd, getret, &at);
} else {
dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
}
NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
txdr_hyper(at.na_filerev, tl);
/*
* Save this position, in case there is an empty reply needed.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/* Loop through the records and build reply */
entrycnt = 0;
while (cpos < cend && ncookies > 0 && dirlen < cnt) {
nlen = dp->d_namlen;
if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
nlen <= NFS_MAXNAMLEN &&
((nd->nd_flag & ND_NFSV3) || nlen > 2 ||
(nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.'))
|| (nlen == 1 && dp->d_name[0] != '.'))) {
/*
* Save the current position in the reply, in case
* this entry exceeds cnt.
*/
mb1 = nd->nd_mb;
bpos1 = nd->nd_bpos;
/*
* For readdir_and_lookup get the vnode using
* the file number.
*/
nvp = NULL;
refp = NULL;
r = 0;
at_root = 0;
needs_unbusy = 0;
new_mp = mp;
mounted_on_fileno = (uint64_t)dp->d_fileno;
if ((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&savbits)) {
if (nd->nd_flag & ND_NFSV4)
refp = nfsv4root_getreferral(NULL,
vp, dp->d_fileno);
if (refp == NULL) {
if (usevget)
r = VFS_VGET(mp, dp->d_fileno,
LK_SHARED, &nvp);
else
r = EOPNOTSUPP;
if (r == EOPNOTSUPP) {
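/*
 * VFS_VGET() is not supported by
 * this file system, so fall back
 * to a VOP_LOOKUP() by name.
 */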
if (usevget) {
usevget = 0;
cn.cn_nameiop = LOOKUP;
cn.cn_lkflags =
LK_SHARED |
LK_RETRY;
cn.cn_cred =
nd->nd_cred;
cn.cn_thread = p;
}
cn.cn_nameptr = dp->d_name;
cn.cn_namelen = nlen;
cn.cn_flags = ISLASTCN |
NOFOLLOW | LOCKLEAF |
MPSAFE;
if (nlen == 2 &&
dp->d_name[0] == '.' &&
dp->d_name[1] == '.')
cn.cn_flags |=
ISDOTDOT;
if (NFSVOPLOCK(vp, LK_SHARED)
!= 0) {
nd->nd_repstat = EPERM;
break;
}
if ((vp->v_vflag & VV_ROOT) != 0
&& (cn.cn_flags & ISDOTDOT)
!= 0) {
vref(vp);
nvp = vp;
r = 0;
} else {
r = VOP_LOOKUP(vp, &nvp,
&cn);
if (vp != nvp)
NFSVOPUNLOCK(vp,
0);
}
}
/*
* For NFSv4, check to see if nvp is
* a mount point and get the mount
* point vnode, as required.
*/
if (r == 0 &&
nfsrv_enable_crossmntpt != 0 &&
(nd->nd_flag & ND_NFSV4) != 0 &&
nvp->v_type == VDIR &&
nvp->v_mountedhere != NULL) {
new_mp = nvp->v_mountedhere;
r = vfs_busy(new_mp, 0);
vput(nvp);
nvp = NULL;
if (r == 0) {
r = VFS_ROOT(new_mp,
LK_SHARED, &nvp);
needs_unbusy = 1;
if (r == 0)
at_root = 1;
}
}
}
if (!r) {
if (refp == NULL &&
((nd->nd_flag & ND_NFSV3) ||
NFSNONZERO_ATTRBIT(&attrbits))) {
r = nfsvno_getfh(nvp, &nfh, p);
if (!r)
r = nfsvno_getattr(nvp, nvap,
nd->nd_cred, p, 1);
}
} else {
nvp = NULL;
}
if (r) {
if (!NFSISSET_ATTRBIT(&attrbits,
NFSATTRBIT_RDATTRERROR)) {
if (nvp != NULL)
vput(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
nd->nd_repstat = r;
break;
}
}
}
/*
* Build the directory record xdr
*/
if (nd->nd_flag & ND_NFSV3) {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(dp->d_fileno);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
nfsrv_postopattr(nd, 0, nvap);
dirlen += nfsm_fhtom(nd,(u_int8_t *)&nfh,0,1);
dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR);
if (nvp != NULL)
vput(nvp);
} else {
NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
*tl++ = newnfs_true;
*tl++ = 0;
*tl = txdr_unsigned(*cookiep);
dirlen += nfsm_strtom(nd, dp->d_name, nlen);
if (nvp != NULL) {
supports_nfsv4acls =
nfs_supportsnfsv4acls(nvp);
NFSVOPUNLOCK(nvp, 0);
} else
supports_nfsv4acls = 0;
if (refp != NULL) {
dirlen += nfsrv_putreferralattr(nd,
&savbits, refp, 0,
&nd->nd_repstat);
if (nd->nd_repstat) {
if (nvp != NULL)
vrele(nvp);
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
break;
}
} else if (r) {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &rderrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
} else {
dirlen += nfsvno_fillattr(nd, new_mp,
nvp, nvap, &nfh, r, &attrbits,
nd->nd_cred, p, isdgram, 0,
supports_nfsv4acls, at_root,
mounted_on_fileno);
}
if (nvp != NULL)
vrele(nvp);
dirlen += (3 * NFSX_UNSIGNED);
}
if (needs_unbusy != 0)
vfs_unbusy(new_mp);
if (dirlen <= cnt)
entrycnt++;
}
cpos += dp->d_reclen;
dp = (struct dirent *)cpos;
cookiep++;
ncookies--;
}
vrele(vp);
vfs_unbusy(mp);
/*
* If dirlen > cnt, we must strip off the last entry. If that
* results in an empty reply, report NFSERR_TOOSMALL.
*/
if (dirlen > cnt || nd->nd_repstat) {
if (!nd->nd_repstat && entrycnt == 0)
nd->nd_repstat = NFSERR_TOOSMALL;
if (nd->nd_repstat)
newnfs_trimtrailing(nd, mb0, bpos0);
else
newnfs_trimtrailing(nd, mb1, bpos1);
eofflag = 0;
} else if (cpos < cend)
eofflag = 0;
if (!nd->nd_repstat) {
NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
*tl++ = newnfs_false;
if (eofflag)
*tl = newnfs_true;
else
*tl = newnfs_false;
}
FREE((caddr_t)cookies, M_TEMP);
FREE((caddr_t)rbuf, M_TEMP);
out:
NFSEXITCODE2(0, nd);
return (0);
nfsmout:
vput(vp);
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Get the settable attributes out of the mbuf list.
* (Return 0 or EBADRPC)
*/
int
nfsrv_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
struct nfsv2_sattr *sp;
struct timeval curtime;
int error = 0, toclient = 0;
switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
case ND_NFSV2:
NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
/*
* Some old clients didn't fill in the high order 16 bits.
* --> check the low order 2 bytes for 0xffff
*/
if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff)
nvap->na_mode = nfstov_mode(sp->sa_mode);
if (sp->sa_uid != newnfs_xdrneg1)
nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid);
if (sp->sa_gid != newnfs_xdrneg1)
nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid);
if (sp->sa_size != newnfs_xdrneg1)
nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size);
if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) {
#ifdef notyet
fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime);
#else
nvap->na_atime.tv_sec =
fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec);
nvap->na_atime.tv_nsec = 0;
#endif
}
if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1)
fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime);
break;
case ND_NFSV3:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_uid = fxdr_unsigned(uid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_gid = fxdr_unsigned(gid_t, *tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (*tl == newnfs_true) {
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
nvap->na_size = fxdr_hyper(tl);
}
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_atime);
toclient = 1;
break;
case NFSV3SATTRTIME_TOSERVER:
NFSGETTIME(&curtime);
nvap->na_atime.tv_sec = curtime.tv_sec;
nvap->na_atime.tv_nsec = curtime.tv_usec * 1000;
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
};
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
switch (fxdr_unsigned(int, *tl)) {
case NFSV3SATTRTIME_TOCLIENT:
NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
fxdr_nfsv3time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
break;
case NFSV3SATTRTIME_TOSERVER:
NFSGETTIME(&curtime);
nvap->na_mtime.tv_sec = curtime.tv_sec;
nvap->na_mtime.tv_nsec = curtime.tv_usec * 1000;
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
break;
};
break;
case ND_NFSV4:
error = nfsv4_sattr(nd, nvap, attrbitp, aclp, p);
};
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Handle the settable attributes for V4.
* Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise.
*/
int
nfsv4_sattr(struct nfsrv_descript *nd, struct nfsvattr *nvap,
nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
{
u_int32_t *tl;
int attrsum = 0;
int i, j;
int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0;
int toclient = 0;
u_char *cp, namestr[NFSV4_SMALLSTR + 1];
uid_t uid;
gid_t gid;
struct timeval curtime;
error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup);
if (error)
goto nfsmout;
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsize = fxdr_unsigned(int, *tl);
/*
* Loop around getting the settable attributes. If an unsupported
* one is found, set nd_repstat to NFSERR_ATTRNOTSUPP and return.
*/
if (retnotsup) {
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
bitpos = NFSATTRBIT_MAX;
} else {
bitpos = 0;
}
for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (NFSISSET_ATTRBIT(attrbitp, bitpos))
switch (bitpos) {
case NFSATTRBIT_SIZE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
nvap->na_size = fxdr_hyper(tl);
attrsum += NFSX_HYPER;
break;
case NFSATTRBIT_ACL:
error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize,
p);
if (error)
goto nfsmout;
if (aceerr && !nd->nd_repstat)
nd->nd_repstat = aceerr;
attrsum += aclsize;
break;
case NFSATTRBIT_ARCHIVE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_HIDDEN:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_MIMETYPE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
i = fxdr_unsigned(int, *tl);
error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
if (error)
goto nfsmout;
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
break;
case NFSATTRBIT_MODE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
nvap->na_mode = nfstov_mode(*tl);
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_OWNER:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtouid(cp,j,&uid,p);
if (!nd->nd_repstat)
nvap->na_uid = uid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_OWNERGROUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
j = fxdr_unsigned(int, *tl);
if (j < 0) {
error = NFSERR_BADXDR;
goto nfsmout;
}
if (j > NFSV4_SMALLSTR)
cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
else
cp = namestr;
error = nfsrv_mtostr(nd, cp, j);
if (error) {
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
goto nfsmout;
}
if (!nd->nd_repstat) {
nd->nd_repstat = nfsv4_strtogid(cp,j,&gid,p);
if (!nd->nd_repstat)
nvap->na_gid = gid;
}
if (j > NFSV4_SMALLSTR)
free(cp, M_NFSSTRING);
attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
break;
case NFSATTRBIT_SYSTEM:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_UNSIGNED;
break;
case NFSATTRBIT_TIMEACCESSSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_atime);
toclient = 1;
attrsum += NFSX_V4TIME;
} else {
NFSGETTIME(&curtime);
nvap->na_atime.tv_sec = curtime.tv_sec;
nvap->na_atime.tv_nsec = curtime.tv_usec * 1000;
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
case NFSATTRBIT_TIMEBACKUP:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMECREATE:
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
if (!nd->nd_repstat)
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
attrsum += NFSX_V4TIME;
break;
case NFSATTRBIT_TIMEMODIFYSET:
NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
attrsum += NFSX_UNSIGNED;
if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
fxdr_nfsv4time(tl, &nvap->na_mtime);
nvap->na_vaflags &= ~VA_UTIMES_NULL;
attrsum += NFSX_V4TIME;
} else {
NFSGETTIME(&curtime);
nvap->na_mtime.tv_sec = curtime.tv_sec;
nvap->na_mtime.tv_nsec = curtime.tv_usec * 1000;
if (!toclient)
nvap->na_vaflags |= VA_UTIMES_NULL;
}
break;
default:
nd->nd_repstat = NFSERR_ATTRNOTSUPP;
/*
* set bitpos so we drop out of the loop.
*/
bitpos = NFSATTRBIT_MAX;
break;
};
}
/*
* some clients pad the attrlist, so we need to skip over the
* padding.
*/
if (attrsum > attrsize) {
error = NFSERR_BADXDR;
} else {
attrsize = NFSM_RNDUP(attrsize);
if (attrsum < attrsize)
error = nfsm_advance(nd, attrsize - attrsum, -1);
}
nfsmout:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check/setup export credentials.
*/
int
nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp,
struct ucred *credanon)
{
int error = 0;
/*
* Check/setup credentials.
*/
if (nd->nd_flag & ND_GSS)
exp->nes_exflag &= ~MNT_EXPORTANON;
/*
* Check to see if the operation is allowed for this security flavor.
* RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed for
* AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS.
* Also, allow Secinfo, so that it can acquire the correct flavor(s).
*/
if (nfsvno_testexp(nd, exp) &&
nd->nd_procnum != NFSV4OP_SECINFO &&
nd->nd_procnum != NFSPROC_FSINFO) {
if (nd->nd_flag & ND_NFSV4)
error = NFSERR_WRONGSEC;
else
error = (NFSERR_AUTHERR | AUTH_TOOWEAK);
goto out;
}
/*
* Check to see if the file system is exported V4 only.
*/
if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) {
error = NFSERR_PROGNOTV4;
goto out;
}
/*
* Now, map the user credentials.
* (Note that ND_AUTHNONE will only be set for an NFSv3
* Fsinfo RPC. If set for anything else, this code might need
* to change.)
*/
if (NFSVNO_EXPORTED(exp) &&
((!(nd->nd_flag & ND_GSS) && nd->nd_cred->cr_uid == 0) ||
NFSVNO_EXPORTANON(exp) ||
(nd->nd_flag & ND_AUTHNONE))) {
nd->nd_cred->cr_uid = credanon->cr_uid;
nd->nd_cred->cr_gid = credanon->cr_gid;
crsetgroups(nd->nd_cred, credanon->cr_ngroups,
credanon->cr_groups);
}
out:
NFSEXITCODE2(error, nd);
return (error);
}
/*
* Check exports.
*/
int
nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
NFSEXITCODE(error);
return (error);
}
/*
* Get a vnode for a file handle and export stuff.
*/
int
nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
int lktype, struct vnode **vpp, struct nfsexstuff *exp,
struct ucred **credp)
{
int i, error, *secflavors;
*credp = NULL;
exp->nes_numsecflavor = 0;
if (VFS_NEEDSGIANT(mp))
error = ESTALE;
else
error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, vpp);
if (error != 0)
/* Make sure the server replies ESTALE to the client. */
error = ESTALE;
if (nam && !error) {
error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
&exp->nes_numsecflavor, &secflavors);
if (error) {
if (nfs_rootfhset) {
exp->nes_exflag = 0;
exp->nes_numsecflavor = 0;
error = 0;
} else {
vput(*vpp);
}
} else {
/* Copy the security flavors. */
for (i = 0; i < exp->nes_numsecflavor; i++)
exp->nes_secflavors[i] = secflavors[i];
}
}
if (error == 0 && lktype == LK_SHARED)
/*
* It would be much better to pass lktype to VFS_FHTOVP(),
* but this will have to do until VFS_FHTOVP() has a lock
* type argument like VFS_VGET().
*/
NFSVOPLOCK(*vpp, LK_DOWNGRADE | LK_RETRY);
NFSEXITCODE(error);
return (error);
}
/*
* nfsd_fhtovp() - convert a fh to a vnode ptr
* - look up fsid in mount list (if not found ret error)
* - get vp and export rights by calling nfsvno_fhtovp()
* - if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon
* for AUTH_SYS
* - if mpp != NULL, return the mount point so that it can
* be used for vn_finished_write() by the caller
*/
void
nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype,
struct vnode **vpp, struct nfsexstuff *exp,
struct mount **mpp, int startwrite, struct thread *p)
{
struct mount *mp;
struct ucred *credanon;
fhandle_t *fhp;
fhp = (fhandle_t *)nfp->nfsrvfh_data;
/*
* Check for the special case of the nfsv4root_fh.
*/
mp = vfs_busyfs(&fhp->fh_fsid);
if (mpp != NULL)
*mpp = mp;
if (mp == NULL) {
*vpp = NULL;
nd->nd_repstat = ESTALE;
goto out;
}
if (startwrite)
vn_start_write(NULL, mpp, V_WAIT);
nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp,
&credanon);
vfs_unbusy(mp);
/*
* For NFSv4 without a pseudo root fs, unexported file handles
* can be returned, so that Lookup works everywhere.
*/
if (!nd->nd_repstat && exp->nes_exflag == 0 &&
!(nd->nd_flag & ND_NFSV4)) {
vput(*vpp);
nd->nd_repstat = EACCES;
}
/*
* Personally, I've never seen any point in requiring a
* reserved port#, since only in the rare case where the
* clients are all boxes with secure system privileges,
* does it provide any enhanced security, but... some people
* believe it to be useful and keep putting this code back in.
* (There is also some "security checker" out there that
* complains if the nfs server doesn't enforce this.)
* However, note the following:
* RFC3530 (NFSv4) specifies that a reserved port# not be
* required.
* RFC2623 recommends that, if a reserved port# is checked for,
* that there be a way to turn that off--> ifdef'd.
*/
#ifdef NFS_REQRSVPORT
if (!nd->nd_repstat) {
struct sockaddr_in *saddr;
struct sockaddr_in6 *saddr6;
saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *);
if (!(nd->nd_flag & ND_NFSV4) &&
((saddr->sin_family == AF_INET &&
ntohs(saddr->sin_port) >= IPPORT_RESERVED) ||
(saddr6->sin6_family == AF_INET6 &&
ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) {
vput(*vpp);
nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
}
}
#endif /* NFS_REQRSVPORT */
/*
* Check/setup credentials.
*/
if (!nd->nd_repstat) {
nd->nd_saveduid = nd->nd_cred->cr_uid;
nd->nd_repstat = nfsd_excred(nd, exp, credanon);
if (nd->nd_repstat)
vput(*vpp);
}
if (credanon != NULL)
crfree(credanon);
if (nd->nd_repstat) {
if (startwrite)
vn_finished_write(mp);
*vpp = NULL;
if (mpp != NULL)
*mpp = NULL;
}
out:
NFSEXITCODE2(0, nd);
}
/*
* glue for fp.
*/
int
fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
{
struct filedesc *fdp;
struct file *fp;
int error = 0;
fdp = p->td_proc->p_fd;
if (fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL) {
error = EBADF;
goto out;
}
*fpp = fp;
out:
NFSEXITCODE(error);
return (error);
}
/*
* Called from nfssvc() to update the exports list. Just call
* vfs_export(). This has to be done, since the v4 root fake fs isn't
* in the mount list.
*/
int
nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
{
struct nfsex_args *nfsexargp = (struct nfsex_args *)argp;
int error = 0;
struct nameidata nd;
fhandle_t fh;
error = vfs_export(&nfsv4root_mnt, &nfsexargp->export);
if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
nfs_rootfhset = 0;
else if (error == 0) {
if (nfsexargp->fspec == NULL) {
error = EPERM;
goto out;
}
/*
* If fspec != NULL, this is the v4root path.
*/
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE,
nfsexargp->fspec, p);
if ((error = namei(&nd)) != 0)
goto out;
error = nfsvno_getfh(nd.ni_vp, &fh, p);
vrele(nd.ni_vp);
if (!error) {
nfs_rootfh.nfsrvfh_len = NFSX_MYFH;
NFSBCOPY((caddr_t)&fh,
nfs_rootfh.nfsrvfh_data,
sizeof (fhandle_t));
nfs_rootfhset = 1;
}
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* Get the tcp socket sequence numbers we need.
* (Maybe this should be moved to the tcp sources?)
*/
int
nfsrv_getsocksndseq(struct socket *so, tcp_seq *maxp, tcp_seq *unap)
{
struct inpcb *inp;
struct tcpcb *tp;
int error = 0;
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("nfsrv_getsocksndseq: inp == NULL"));
INP_RLOCK(inp);
if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
INP_RUNLOCK(inp);
error = EPIPE;
goto out;
}
tp = intotcpcb(inp);
if (tp->t_state != TCPS_ESTABLISHED) {
INP_RUNLOCK(inp);
error = EPIPE;
goto out;
}
*maxp = tp->snd_max;
*unap = tp->snd_una;
INP_RUNLOCK(inp);
out:
NFSEXITCODE(error);
return (error);
}
/*
* This function needs to test to see if the system is near its limit
* for memory allocation via malloc() or mget() and return True iff
* either of these resources is near its limit.
* XXX (For now, this is just a stub.)
*/
int nfsrv_testmalloclimit = 0;
int
nfsrv_mallocmget_limit(void)
{
static int printmesg = 0;
static int testval = 1;
if (nfsrv_testmalloclimit && (testval++ % 1000) == 0) {
if ((printmesg++ % 100) == 0)
printf("nfsd: malloc/mget near limit\n");
return (1);
}
return (0);
}
/*
* BSD specific initialization of a mount point.
*/
void
nfsd_mntinit(void)
{
static int inited = 0;
if (inited)
return;
inited = 1;
nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
nfsv4root_mnt.mnt_export = NULL;
TAILQ_INIT(&nfsv4root_opt);
TAILQ_INIT(&nfsv4root_newopt);
nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
nfsv4root_mnt.mnt_nvnodelistsize = 0;
}
/*
* Get a vnode for a file handle, without checking exports, etc.
*/
struct vnode *
nfsvno_getvp(fhandle_t *fhp)
{
struct mount *mp;
struct vnode *vp;
int error;
mp = vfs_busyfs(&fhp->fh_fsid);
if (mp == NULL)
return (NULL);
error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error)
return (NULL);
return (vp);
}
/*
* Do a local VOP_ADVLOCK().
*/
int
nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first,
u_int64_t end, struct thread *td)
{
int error = 0;
struct flock fl;
u_int64_t tlen;
if (nfsrv_dolocallocks == 0)
goto out;
/* Check for VI_DOOMED here, so that VOP_ADVLOCK() isn't performed. */
if ((vp->v_iflag & VI_DOOMED) != 0) {
error = EPERM;
goto out;
}
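/*
 * Convert the NFS byte range into a struct flock for the local
 * VOP_ADVLOCK() call; an end of NFS64BITSSET means lock to EOF
 * (l_len == 0).
 */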
fl.l_whence = SEEK_SET;
fl.l_type = ftype;
fl.l_start = (off_t)first;
if (end == NFS64BITSSET) {
fl.l_len = 0;
} else {
tlen = end - first;
fl.l_len = (off_t)tlen;
}
/*
* For FreeBSD8, the l_pid and l_sysid must be set to the same
* values for all calls, so that all locks will be held by the
* nfsd server. (The nfsd server handles conflicts between the
* various clients.)
* An NFSv4 lockowner is a ClientID plus an array of up to 1024
* bytes, so it can't be put in l_sysid.
*/
if (nfsv4_sysid == 0)
nfsv4_sysid = nlm_acquire_next_sysid();
fl.l_pid = (pid_t)0;
fl.l_sysid = (int)nfsv4_sysid;
NFSVOPUNLOCK(vp, 0);
if (ftype == F_UNLCK)
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl,
(F_POSIX | F_REMOTE));
else
error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl,
(F_POSIX | F_REMOTE));
NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
out:
NFSEXITCODE(error);
return (error);
}
/*
* Check the nfsv4 root exports.
*/
int
nfsvno_v4rootexport(struct nfsrv_descript *nd)
{
struct ucred *credanon;
int exflags, error = 0, numsecflavor, *secflavors, i;
error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags,
&credanon, &numsecflavor, &secflavors);
if (error) {
error = NFSERR_PROGUNAVAIL;
goto out;
}
if (credanon != NULL)
crfree(credanon);
for (i = 0; i < numsecflavor; i++) {
if (secflavors[i] == AUTH_SYS)
nd->nd_flag |= ND_EXAUTHSYS;
else if (secflavors[i] == RPCSEC_GSS_KRB5)
nd->nd_flag |= ND_EXGSS;
else if (secflavors[i] == RPCSEC_GSS_KRB5I)
nd->nd_flag |= ND_EXGSSINTEGRITY;
else if (secflavors[i] == RPCSEC_GSS_KRB5P)
nd->nd_flag |= ND_EXGSSPRIVACY;
}
out:
NFSEXITCODE(error);
return (error);
}
/*
* NFS server pseudo system call for the nfsds
*/
/*
* MPSAFE
*/
static int
nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
{
struct file *fp;
struct nfsd_addsock_args sockarg;
struct nfsd_nfsd_args nfsdarg;
int error;
if (uap->flag & NFSSVC_NFSDADDSOCK) {
error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg));
if (error)
goto out;
/*
* Since we don't know what rights might be required,
* pretend that we need them all. It is better to be too
* careful than too reckless.
*/
if ((error = fget(td, sockarg.sock, CAP_SOCK_ALL, &fp)) != 0)
goto out;
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, td);
error = EPERM;
goto out;
}
error = nfsrvd_addsock(fp);
fdrop(fp, td);
} else if (uap->flag & NFSSVC_NFSDNFSD) {
if (uap->argp == NULL) {
error = EINVAL;
goto out;
}
error = copyin(uap->argp, (caddr_t)&nfsdarg,
sizeof (nfsdarg));
if (error)
goto out;
error = nfsrvd_nfsd(td, &nfsdarg);
} else {
error = nfssvc_srvcall(td, uap, td->td_ucred);
}
out:
NFSEXITCODE(error);
return (error);
}
static int
nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred)
{
struct nfsex_args export;
struct file *fp = NULL;
int stablefd, len;
struct nfsd_clid adminrevoke;
struct nfsd_dumplist dumplist;
struct nfsd_dumpclients *dumpclients;
struct nfsd_dumplocklist dumplocklist;
struct nfsd_dumplocks *dumplocks;
struct nameidata nd;
vnode_t vp;
int error = EINVAL;
struct proc *procp;
if (uap->flag & NFSSVC_PUBLICFH) {
NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data,
sizeof (fhandle_t));
error = copyin(uap->argp,
&nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t));
if (!error)
nfs_pubfhset = 1;
} else if (uap->flag & NFSSVC_V4ROOTEXPORT) {
error = copyin(uap->argp,(caddr_t)&export,
sizeof (struct nfsex_args));
if (!error)
error = nfsrv_v4rootexport(&export, cred, p);
} else if (uap->flag & NFSSVC_NOPUBLICFH) {
nfs_pubfhset = 0;
error = 0;
} else if (uap->flag & NFSSVC_STABLERESTART) {
error = copyin(uap->argp, (caddr_t)&stablefd,
sizeof (int));
if (!error)
error = fp_getfvp(p, stablefd, &fp, &vp);
if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE))
error = EBADF;
if (!error && newnfs_numnfsd != 0)
error = EPERM;
if (!error) {
nfsrv_stablefirst.nsf_fp = fp;
nfsrv_setupstable(p);
}
} else if (uap->flag & NFSSVC_ADMINREVOKE) {
error = copyin(uap->argp, (caddr_t)&adminrevoke,
sizeof (struct nfsd_clid));
if (!error)
error = nfsrv_adminrevoke(&adminrevoke, p);
} else if (uap->flag & NFSSVC_DUMPCLIENTS) {
error = copyin(uap->argp, (caddr_t)&dumplist,
sizeof (struct nfsd_dumplist));
if (!error && (dumplist.ndl_size < 1 ||
dumplist.ndl_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error) {
len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size;
dumpclients = (struct nfsd_dumpclients *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumpclients(dumpclients, dumplist.ndl_size);
error = copyout(dumpclients,
CAST_USER_ADDR_T(dumplist.ndl_list), len);
free((caddr_t)dumpclients, M_TEMP);
}
} else if (uap->flag & NFSSVC_DUMPLOCKS) {
error = copyin(uap->argp, (caddr_t)&dumplocklist,
sizeof (struct nfsd_dumplocklist));
if (!error && (dumplocklist.ndllck_size < 1 ||
dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST))
error = EPERM;
if (!error)
error = nfsrv_lookupfilename(&nd,
dumplocklist.ndllck_fname, p);
if (!error) {
len = sizeof (struct nfsd_dumplocks) *
dumplocklist.ndllck_size;
dumplocks = (struct nfsd_dumplocks *)malloc(len,
M_TEMP, M_WAITOK);
nfsrv_dumplocks(nd.ni_vp, dumplocks,
dumplocklist.ndllck_size, p);
vput(nd.ni_vp);
error = copyout(dumplocks,
CAST_USER_ADDR_T(dumplocklist.ndllck_list), len);
free((caddr_t)dumplocks, M_TEMP);
}
} else if (uap->flag & NFSSVC_BACKUPSTABLE) {
procp = p->td_proc;
PROC_LOCK(procp);
nfsd_master_pid = procp->p_pid;
bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1);
nfsd_master_start = procp->p_stats->p_start;
nfsd_master_proc = procp;
PROC_UNLOCK(procp);
}
NFSEXITCODE(error);
return (error);
}
/*
* Check exports.
* Returns 0 if ok, 1 otherwise.
*/
int
nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp)
{
int i;
/*
* This seems odd, but allow the case where the security flavor
* list is empty. This happens when NFSv4 is traversing non-exported
* file systems. Exported file systems should always have a non-empty
* security flavor list.
*/
if (exp->nes_numsecflavor == 0)
return (0);
for (i = 0; i < exp->nes_numsecflavor; i++) {
/*
* The tests for privacy and integrity must be first,
* since ND_GSS is set for everything but AUTH_SYS.
*/
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P &&
(nd->nd_flag & ND_GSSPRIVACY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I &&
(nd->nd_flag & ND_GSSINTEGRITY))
return (0);
if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 &&
(nd->nd_flag & ND_GSS))
return (0);
if (exp->nes_secflavors[i] == AUTH_SYS &&
(nd->nd_flag & ND_GSS) == 0)
return (0);
}
return (1);
}
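/*
 * Illustrative sketch, not part of this change: the per-flavor test applied
 * in the loop above, collapsed into a hypothetical helper.  Krb5p and Krb5i
 * are distinguished from plain Krb5 because ND_GSS is set for every GSS
 * flavor, and AUTH_SYS only matches when no GSS flag is set at all.
 */
static int
example_flavor_matches(int flavor, uint32_t ndflags)
{

	switch (flavor) {
	case RPCSEC_GSS_KRB5P:
		return ((ndflags & ND_GSSPRIVACY) != 0);
	case RPCSEC_GSS_KRB5I:
		return ((ndflags & ND_GSSINTEGRITY) != 0);
	case RPCSEC_GSS_KRB5:
		return ((ndflags & ND_GSS) != 0);
	case AUTH_SYS:
		return ((ndflags & ND_GSS) == 0);
	default:
		return (0);
	}
}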
/*
* Calculate a hash value for the fid in a file handle.
*/
uint32_t
nfsrv_hashfh(fhandle_t *fhp)
{
uint32_t hashval;
hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0);
return (hashval);
}
/*
* Signal the userland master nfsd to backup the stable restart file.
*/
void
nfsrv_backupstable(void)
{
struct proc *procp;
if (nfsd_master_proc != NULL) {
procp = pfind(nfsd_master_pid);
/* Try to make sure it is the correct process. */
if (procp == nfsd_master_proc &&
procp->p_stats->p_start.tv_sec ==
nfsd_master_start.tv_sec &&
procp->p_stats->p_start.tv_usec ==
nfsd_master_start.tv_usec &&
strcmp(procp->p_comm, nfsd_master_comm) == 0)
- psignal(procp, SIGUSR2);
+ kern_psignal(procp, SIGUSR2);
else
nfsd_master_proc = NULL;
if (procp != NULL)
PROC_UNLOCK(procp);
}
}
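/*
 * Illustrative sketch, not part of this change: the userland master nfsd
 * that registered itself through NFSSVC_BACKUPSTABLE is expected to catch
 * the SIGUSR2 sent above and copy the stable restart file aside.  The
 * handler below is a hypothetical outline of that side; the actual backup
 * work would be done from the daemon's main loop.
 */
#include <signal.h>

static volatile sig_atomic_t backup_requested;

static void
example_backupstable_handler(int sig)
{

	(void)sig;
	backup_requested = 1;	/* main loop copies the stable restart file */
}

/* In the master nfsd's startup code (hypothetical):
 *	signal(SIGUSR2, example_backupstable_handler);
 */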
extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
/*
* Called once to initialize data structures...
*/
static int
nfsd_modevent(module_t mod, int type, void *data)
{
int error = 0;
static int loaded = 0;
switch (type) {
case MOD_LOAD:
if (loaded)
goto out;
newnfs_portinit();
mtx_init(&nfs_cache_mutex, "nfs_cache_mutex", NULL, MTX_DEF);
mtx_init(&nfs_v4root_mutex, "nfs_v4root_mutex", NULL, MTX_DEF);
mtx_init(&nfsv4root_mnt.mnt_mtx, "struct mount mtx", NULL,
MTX_DEF);
lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0);
nfsrvd_initcache();
nfsd_init();
NFSD_LOCK();
nfsrvd_init(0);
NFSD_UNLOCK();
nfsd_mntinit();
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation;
vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation;
#endif
nfsd_call_servertimer = nfsrv_servertimer;
nfsd_call_nfsd = nfssvc_nfsd;
loaded = 1;
break;
case MOD_UNLOAD:
if (newnfs_numnfsd != 0) {
error = EBUSY;
break;
}
#ifdef VV_DISABLEDELEG
vn_deleg_ops.vndeleg_recall = NULL;
vn_deleg_ops.vndeleg_disable = NULL;
#endif
nfsd_call_servertimer = NULL;
nfsd_call_nfsd = NULL;
/* Clean out all NFSv4 state. */
nfsrv_throwawayallstate(curthread);
/* Clean the NFS server reply cache */
nfsrvd_cleancache();
/* Free up the krpc server pool. */
if (nfsrvd_pool != NULL)
svcpool_destroy(nfsrvd_pool);
/* and get rid of the locks */
mtx_destroy(&nfs_cache_mutex);
mtx_destroy(&nfs_v4root_mutex);
mtx_destroy(&nfsv4root_mnt.mnt_mtx);
lockdestroy(&nfsv4root_mnt.mnt_explock);
loaded = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
out:
NFSEXITCODE(error);
return (error);
}
static moduledata_t nfsd_mod = {
"nfsd",
nfsd_modevent,
NULL,
};
DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_VERSION(nfsd, 1);
MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslock, 1, 1, 1);
MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1);
MODULE_DEPEND(nfsd, krpc, 1, 1, 1);
MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1);
Index: head/sys/fs/procfs/procfs_ctl.c
===================================================================
--- head/sys/fs/procfs/procfs_ctl.c (revision 225616)
+++ head/sys/fs/procfs/procfs_ctl.c (revision 225617)
@@ -1,358 +1,358 @@
/*-
* Copyright (c) 1993 Jan-Simon Pendry
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_ctl.c 8.4 (Berkeley) 6/15/94
*
* From:
* $Id: procfs_ctl.c,v 1.51 2003/12/07 17:40:00 des Exp $
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/sx.h>
#include <sys/uio.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
#include <vm/vm.h>
/*
* True iff process (p) is in trace wait state
* relative to process (curp)
*/
#define TRACE_WAIT_P(curp, p) \
(P_SHOULDSTOP(p) && \
(p)->p_pptr == (curp) && \
((p)->p_flag & P_TRACED))
#define PROCFS_CTL_ATTACH 1
#define PROCFS_CTL_DETACH 2
#define PROCFS_CTL_STEP 3
#define PROCFS_CTL_RUN 4
#define PROCFS_CTL_WAIT 5
struct namemap {
const char *nm_name;
int nm_val;
};
static struct namemap ctlnames[] = {
/* special /proc commands */
{ "attach", PROCFS_CTL_ATTACH },
{ "detach", PROCFS_CTL_DETACH },
{ "step", PROCFS_CTL_STEP },
{ "run", PROCFS_CTL_RUN },
{ "wait", PROCFS_CTL_WAIT },
{ 0 },
};
static struct namemap signames[] = {
/* regular signal names */
{ "hup", SIGHUP }, { "int", SIGINT },
{ "quit", SIGQUIT }, { "ill", SIGILL },
{ "trap", SIGTRAP }, { "abrt", SIGABRT },
{ "iot", SIGIOT }, { "emt", SIGEMT },
{ "fpe", SIGFPE }, { "kill", SIGKILL },
{ "bus", SIGBUS }, { "segv", SIGSEGV },
{ "sys", SIGSYS }, { "pipe", SIGPIPE },
{ "alrm", SIGALRM }, { "term", SIGTERM },
{ "urg", SIGURG }, { "stop", SIGSTOP },
{ "tstp", SIGTSTP }, { "cont", SIGCONT },
{ "chld", SIGCHLD }, { "ttin", SIGTTIN },
{ "ttou", SIGTTOU }, { "io", SIGIO },
{ "xcpu", SIGXCPU }, { "xfsz", SIGXFSZ },
{ "vtalrm", SIGVTALRM }, { "prof", SIGPROF },
{ "winch", SIGWINCH }, { "info", SIGINFO },
{ "usr1", SIGUSR1 }, { "usr2", SIGUSR2 },
{ 0 },
};
static int procfs_control(struct thread *td, struct proc *p, int op);
static int
procfs_control(struct thread *td, struct proc *p, int op)
{
int error = 0;
struct thread *temp;
/*
* Attach - attaches the target process for debugging
* by the calling process.
*/
if (op == PROCFS_CTL_ATTACH) {
sx_xlock(&proctree_lock);
PROC_LOCK(p);
if ((error = p_candebug(td, p)) != 0)
goto out;
if (p->p_flag & P_TRACED) {
error = EBUSY;
goto out;
}
/* Can't trace yourself! */
if (p->p_pid == td->td_proc->p_pid) {
error = EINVAL;
goto out;
}
/*
* Go ahead and set the trace flag.
* Save the old parent (it's reset in
* _DETACH, and also in kern_exit.c:wait4()).
* Reparent the process so that the tracing
* proc gets to see all the action.
* Stop the target.
*/
p->p_flag |= P_TRACED;
faultin(p);
p->p_xstat = 0; /* XXX ? */
if (p->p_pptr != td->td_proc) {
p->p_oppid = p->p_pptr->p_pid;
proc_reparent(p, td->td_proc);
}
- psignal(p, SIGSTOP);
+ kern_psignal(p, SIGSTOP);
out:
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
return (error);
}
/*
* Authorization check: rely on normal debugging protection, except
* allow processes to disengage debugging on a process onto which
* they have previously attached, but no longer have permission to
* debug.
*/
PROC_LOCK(p);
if (op != PROCFS_CTL_DETACH &&
((error = p_candebug(td, p)))) {
PROC_UNLOCK(p);
return (error);
}
/*
* Target process must be stopped, owned by (td) and
* be set up for tracing (P_TRACED flag set).
* Allow DETACH to take place at any time for sanity.
* Allow WAIT any time, of course.
*/
switch (op) {
case PROCFS_CTL_DETACH:
case PROCFS_CTL_WAIT:
break;
default:
if (!TRACE_WAIT_P(td->td_proc, p)) {
PROC_UNLOCK(p);
return (EBUSY);
}
}
#ifdef FIX_SSTEP
/*
* do single-step fixup if needed
*/
FIX_SSTEP(FIRST_THREAD_IN_PROC(p));
#endif
/*
* Don't deliver any signal by default.
* To continue with a signal, just send
* the signal name to the ctl file
*/
p->p_xstat = 0;
switch (op) {
/*
* Detach. Cleans up the target process, reparent it if possible
* and set it running once more.
*/
case PROCFS_CTL_DETACH:
/* if not being traced, then this is a painless no-op */
if ((p->p_flag & P_TRACED) == 0) {
PROC_UNLOCK(p);
return (0);
}
/* not being traced any more */
p->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
/* remove pending SIGTRAP, else the process will die */
sigqueue_delete_proc(p, SIGTRAP);
FOREACH_THREAD_IN_PROC(p, temp)
temp->td_dbgflags &= ~TDB_SUSPEND;
PROC_UNLOCK(p);
/* give process back to original parent */
sx_xlock(&proctree_lock);
if (p->p_oppid != p->p_pptr->p_pid) {
struct proc *pp;
pp = pfind(p->p_oppid);
PROC_LOCK(p);
if (pp) {
PROC_UNLOCK(pp);
proc_reparent(p, pp);
}
} else
PROC_LOCK(p);
p->p_oppid = 0;
p->p_flag &= ~P_WAITED; /* XXX ? */
sx_xunlock(&proctree_lock);
wakeup(td->td_proc); /* XXX for CTL_WAIT below ? */
break;
/*
* Step. Let the target process execute a single instruction.
* What does it mean to single step a threaded program?
*/
case PROCFS_CTL_STEP:
error = proc_sstep(FIRST_THREAD_IN_PROC(p));
if (error) {
PROC_UNLOCK(p);
return (error);
}
break;
/*
* Run. Let the target process continue running until a breakpoint
* or some other trap.
*/
case PROCFS_CTL_RUN:
p->p_flag &= ~P_STOPPED_SIG; /* this uses SIGSTOP */
break;
/*
* Wait for the target process to stop.
* If the target is not being traced then just wait
* to enter
*/
case PROCFS_CTL_WAIT:
if (p->p_flag & P_TRACED) {
while (error == 0 &&
(P_SHOULDSTOP(p)) &&
(p->p_flag & P_TRACED) &&
(p->p_pptr == td->td_proc))
error = msleep(p, &p->p_mtx,
PWAIT|PCATCH, "procfsx", 0);
if (error == 0 && !TRACE_WAIT_P(td->td_proc, p))
error = EBUSY;
} else {
while (error == 0 && P_SHOULDSTOP(p))
error = msleep(p, &p->p_mtx,
PWAIT|PCATCH, "procfs", 0);
}
PROC_UNLOCK(p);
return (error);
default:
panic("procfs_control");
}
PROC_SLOCK(p);
thread_unsuspend(p); /* If it can run, let it do so. */
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (0);
}
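/*
 * Illustrative sketch, not part of this change: the operations above are
 * driven from userland by writing command names to the target's
 * /proc/<pid>/ctl file (procfs must be mounted).  Writing a signal name
 * such as "hup" instead of a command delivers that signal, as handled by
 * procfs_doprocctl() below.  Error handling is omitted and the helper is
 * hypothetical.
 */
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static void
example_procfs_trace(pid_t pid)
{
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/proc/%d/ctl", (int)pid);
	fd = open(path, O_WRONLY);
	if (fd == -1)
		return;
	(void)write(fd, "attach", 6);	/* stop and reparent the target */
	(void)write(fd, "wait", 4);	/* block until it is stopped */
	(void)write(fd, "detach", 6);	/* hand it back and let it run */
	(void)close(fd);
}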
static struct namemap *
findname(struct namemap *nm, char *buf, int buflen)
{
for (; nm->nm_name; nm++)
if (bcmp(buf, nm->nm_name, buflen+1) == 0)
return (nm);
return (0);
}
int
procfs_doprocctl(PFS_FILL_ARGS)
{
int error;
struct namemap *nm;
if (uio == NULL || uio->uio_rw != UIO_WRITE)
return (EOPNOTSUPP);
/*
* Map signal names into signal generation
* or debug control. Unknown commands and/or signals
* return EOPNOTSUPP.
*
* Sending a signal while the process is being debugged
* also has the side effect of letting the target continue
* to run. There is no way to single-step a signal delivery.
*/
error = EOPNOTSUPP;
sbuf_trim(sb);
sbuf_finish(sb);
nm = findname(ctlnames, sbuf_data(sb), sbuf_len(sb));
if (nm) {
printf("procfs: got a %s command\n", sbuf_data(sb));
error = procfs_control(td, p, nm->nm_val);
} else {
nm = findname(signames, sbuf_data(sb), sbuf_len(sb));
if (nm) {
printf("procfs: got a sig%s\n", sbuf_data(sb));
PROC_LOCK(p);
if (TRACE_WAIT_P(td->td_proc, p)) {
p->p_xstat = nm->nm_val;
#ifdef FIX_SSTEP
FIX_SSTEP(FIRST_THREAD_IN_PROC(p));
#endif
p->p_flag &= ~P_STOPPED_SIG;
PROC_SLOCK(p);
thread_unsuspend(p);
PROC_SUNLOCK(p);
} else
- psignal(p, nm->nm_val);
+ kern_psignal(p, nm->nm_val);
PROC_UNLOCK(p);
error = 0;
}
}
return (error);
}
Index: head/sys/fs/procfs/procfs_ioctl.c
===================================================================
--- head/sys/fs/procfs/procfs_ioctl.c (revision 225616)
+++ head/sys/fs/procfs/procfs_ioctl.c (revision 225617)
@@ -1,220 +1,220 @@
/*-
* Copyright (c) 2001 Dag-Erling Coïdan Smørgrav
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pioctl.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/systm.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
#ifdef COMPAT_FREEBSD32
struct procfs_status32 {
int state; /* Running, stopped, something else? */
int flags; /* Any flags */
unsigned int events; /* Events to stop on */
int why; /* What event, if any, proc stopped on */
unsigned int val; /* Any extra data */
};
#define PIOCWAIT32 _IOR('p', 4, struct procfs_status32)
#define PIOCSTATUS32 _IOR('p', 6, struct procfs_status32)
#endif
/*
* Process ioctls
*/
int
procfs_ioctl(PFS_IOCTL_ARGS)
{
struct procfs_status *ps;
#ifdef COMPAT_FREEBSD32
struct procfs_status32 *ps32;
#endif
int error, flags, sig;
#ifdef COMPAT_FREEBSD6
int ival;
#endif
KASSERT(p != NULL,
("%s() called without a process", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
error = 0;
switch (cmd) {
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 1, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 1):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCBIS:
p->p_stops |= *(unsigned int *)data;
break;
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 2, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 2):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCBIC:
p->p_stops &= ~*(unsigned int *)data;
break;
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 3, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 3):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCSFL:
flags = *(unsigned int *)data;
if (flags & PF_ISUGID) {
/*
* XXXRW: Is this specific check required here, since
* p_candebug() should already implement it, or are other
* checks missing?
*/
error = priv_check(td, PRIV_DEBUG_SUGID);
if (error)
break;
}
p->p_pfsflags = flags;
break;
case PIOCGFL:
*(unsigned int *)data = p->p_pfsflags;
break;
case PIOCWAIT:
while (p->p_step == 0 && (p->p_flag & P_WEXIT) == 0) {
/* sleep until p stops */
_PHOLD(p);
error = msleep(&p->p_stype, &p->p_mtx,
PWAIT|PCATCH, "pioctl", 0);
_PRELE(p);
if (error != 0)
break;
}
/* fall through to PIOCSTATUS */
case PIOCSTATUS:
ps = (struct procfs_status *)data;
ps->state = (p->p_step == 0);
ps->flags = 0; /* nope */
ps->events = p->p_stops;
ps->why = p->p_step ? p->p_stype : 0;
ps->val = p->p_step ? p->p_xstat : 0;
break;
#ifdef COMPAT_FREEBSD32
case PIOCWAIT32:
while (p->p_step == 0 && (p->p_flag & P_WEXIT) == 0) {
/* sleep until p stops */
_PHOLD(p);
error = msleep(&p->p_stype, &p->p_mtx,
PWAIT|PCATCH, "pioctl", 0);
_PRELE(p);
if (error != 0)
break;
}
/* fall through to PIOCSTATUS32 */
case PIOCSTATUS32:
ps32 = (struct procfs_status32 *)data;
ps32->state = (p->p_step == 0);
ps32->flags = 0; /* nope */
ps32->events = p->p_stops;
ps32->why = p->p_step ? p->p_stype : 0;
ps32->val = p->p_step ? p->p_xstat : 0;
break;
#endif
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
case _IOC(IOC_IN, 'p', 5, 0):
#endif
#ifdef COMPAT_FREEBSD6
case _IO('p', 5):
ival = IOCPARM_IVAL(data);
data = &ival;
#endif
case PIOCCONT:
if (p->p_step == 0)
break;
sig = *(unsigned int *)data;
if (sig != 0 && !_SIG_VALID(sig)) {
error = EINVAL;
break;
}
#if 0
p->p_step = 0;
if (P_SHOULDSTOP(p)) {
p->p_xstat = sig;
p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG);
PROC_SLOCK(p);
thread_unsuspend(p);
PROC_SUNLOCK(p);
} else if (sig)
- psignal(p, sig);
+ kern_psignal(p, sig);
#else
if (sig)
- psignal(p, sig);
+ kern_psignal(p, sig);
p->p_step = 0;
wakeup(&p->p_step);
#endif
break;
default:
error = (ENOTTY);
}
return (error);
}
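/*
 * Illustrative sketch, not part of this change: a userland consumer of the
 * PIOC* ioctls handled above, using the flags from <sys/pioctl.h>.  The
 * descriptor is assumed to reference the target's procfs node
 * (traditionally /proc/<pid>/mem); opening it is not shown and the helper
 * is hypothetical.
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/pioctl.h>

static int
example_trace_stop_events(int procfd, struct procfs_status *ps)
{
	unsigned int events, sig;

	events = S_EXEC | S_EXIT;	/* stop on exec and exit */
	if (ioctl(procfd, PIOCBIS, &events) == -1)
		return (-1);
	if (ioctl(procfd, PIOCWAIT, ps) == -1)	/* sleep until the target stops */
		return (-1);
	sig = 0;			/* resume without delivering a signal */
	return (ioctl(procfd, PIOCCONT, &sig));
}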
/*
* Clean up on last close
*/
int
procfs_close(PFS_CLOSE_ARGS)
{
if (p != NULL && (p->p_pfsflags & PF_LINGER) == 0) {
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_pfsflags = 0;
p->p_stops = 0;
p->p_step = 0;
wakeup(&p->p_step);
}
return (0);
}
Index: head/sys/i386/i386/machdep.c
===================================================================
--- head/sys/i386/i386/machdep.c (revision 225616)
+++ head/sys/i386/i386/machdep.c (revision 225617)
@@ -1,3664 +1,3664 @@
/*-
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_npx.h"
#include "opt_perfmon.h"
#include "opt_xbox.h"
#include "opt_kdtrace.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <isa/rtc.h>
#include <net/netisr.h>
#include <machine/bootinfo.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/vm86.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef DEV_ISA
#include <x86/isa/icu.h>
#endif
#ifdef XBOX
#include <machine/xbox.h>
int arch_i386_is_xbox = 0;
uint32_t arch_i386_xbox_memsize = 0;
#endif
#ifdef XEN
/* XEN includes */
#include <machine/xen/xen-os.h>
#include <xen/hypervisor.h>
#include <machine/xen/xen-os.h>
#include <machine/xen/xenvar.h>
#include <machine/xen/xenfunc.h>
#include <xen/xen_intr.h>
void Xhypervisor_callback(void);
void failsafe_callback(void);
extern trap_info_t trap_table[];
struct proc_ldt default_proc_ldt;
extern int init_first;
int running_xen = 1;
extern unsigned long physfree;
#endif /* XEN */
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
extern void init386(int first);
extern void dblfault_handler(void);
extern void printcpuinfo(void); /* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif
static void cpu_startup(void *);
static void fpstate_drop(struct thread *td);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
#ifdef CPU_ENABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_ENABLE_SSE */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
/* Intel ICH registers */
#define ICH_PMBASE 0x400
#define ICH_SMI_EN ICH_PMBASE + 0x30
int _udatasel, _ucodesel;
u_int basemem;
int cold = 1;
#ifdef COMPAT_43
static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
#ifdef COMPAT_FREEBSD4
static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
long Maxmem = 0;
long realmem = 0;
#ifdef PAE
FEATURE(pae, "Physical Address Extensions");
#endif
/*
* The number of PHYSMAP entries must be one less than the number of
* PHYSSEG entries because the PHYSMAP entry that spans the largest
* physical address that is accessible by ISA DMA is split into two
* PHYSSEG entries.
*/
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];
struct mtx icu_lock;
static void
cpu_startup(dummy)
void *dummy;
{
uintmax_t memsize;
char *sysenv;
/*
* On MacBooks, we have to prevent the legacy USB circuit from
* generating an SMI# because this can cause several problems,
* namely: incorrect CPU frequency detection and failure to
* start the APs.
* We do this by disabling a bit in the SMI_EN (SMI Control and
* Enable register) of the Intel ICH LPC Interface Bridge.
*/
sysenv = getenv("smbios.system.product");
if (sysenv != NULL) {
if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
strncmp(sysenv, "MacBook3,1", 10) == 0 ||
strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
strncmp(sysenv, "Macmini1,1", 10) == 0) {
if (bootverbose)
printf("Disabling LEGACY_USB_EN bit on "
"Intel ICH.\n");
outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
}
freeenv(sysenv);
}
/*
* Good {morning,afternoon,evening,night}.
*/
startrtclock();
printcpuinfo();
panicifcpuunsupported();
#ifdef PERFMON
perfmon_init();
#endif
realmem = Maxmem;
/*
* Display physical memory if SMBIOS reports reasonable amount.
*/
memsize = 0;
sysenv = getenv("smbios.memory.enabled");
if (sysenv != NULL) {
memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
freeenv(sysenv);
}
if (memsize < ptoa((uintmax_t)cnt.v_free_count))
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf(
"0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
ptoa((uintmax_t)cnt.v_free_count),
ptoa((uintmax_t)cnt.v_free_count) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
#ifndef XEN
cpu_setregs();
#endif
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by kcall
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
#ifdef COMPAT_43
static void
osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct osigframe sf, *fp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct osigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
fp = (struct osigframe *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_arg2 = (register_t)&fp->sf_siginfo;
sf.sf_siginfo.si_signo = sig;
sf.sf_siginfo.si_code = ksi->ksi_code;
sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
sf.sf_addr = 0;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_arg2 = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/* Save most if not all of trap frame. */
sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
sf.sf_siginfo.si_sc.sc_gs = rgs();
sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
/* Build the signal context to be used by osigreturn(). */
sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_siginfo.si_sc.sc_ps =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/* See sendsig() for comments. */
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, fp, sizeof(*fp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)fp;
regs->tf_eip = PS_STRINGS - szosigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
load_gs(_udatasel);
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
static void
freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe4 sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
bzero(sf.sf_uc.uc_mcontext.__spare__,
sizeof(sf.sf_uc.uc_mcontext.__spare__));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sfp = (struct sigframe4 *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si.si_signo = sig;
sf.sf_si.si_code = ksi->ksi_code;
sf.sf_si.si_addr = ksi->ksi_addr;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_FREEBSD4 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
char *sp;
struct trapframe *regs;
struct segment_descriptor *sdp;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
freebsd4_sendsig(catcher, ksi, mask);
return;
}
#endif
#ifdef COMPAT_43
if (SIGISMEMBER(psp->ps_osigset, sig)) {
osendsig(catcher, ksi, mask);
return;
}
#endif
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
get_fpcontext(td, &sf.sf_uc.uc_mcontext);
fpstate_drop(td);
/*
* Unconditionally fill the fsbase and gsbase into the mcontext.
*/
sdp = &td->td_pcb->pcb_fsd;
sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
bzero(sf.sf_uc.uc_mcontext.mc_spare1,
sizeof(sf.sf_uc.uc_mcontext.mc_spare1));
bzero(sf.sf_uc.uc_mcontext.mc_spare2,
sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sp = td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sp = (char *)regs->tf_esp - sizeof(struct sigframe);
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
#ifdef COMPAT_43
int
osigreturn(td, uap)
struct thread *td;
struct osigreturn_args /* {
struct osigcontext *sigcntxp;
} */ *uap;
{
struct osigcontext sc;
struct trapframe *regs;
struct osigcontext *scp;
int eflags, error;
ksiginfo_t ksi;
regs = td->td_frame;
error = copyin(uap->sigcntxp, &sc, sizeof(sc));
if (error != 0)
return (error);
scp = &sc;
eflags = scp->sc_ps;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
tf->tf_vm86_ds = scp->sc_ds;
tf->tf_vm86_es = scp->sc_es;
tf->tf_vm86_fs = scp->sc_fs;
tf->tf_vm86_gs = scp->sc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
if (!CS_SECURE(scp->sc_cs)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
regs->tf_ds = scp->sc_ds;
regs->tf_es = scp->sc_es;
regs->tf_fs = scp->sc_fs;
}
/* Restore remaining registers. */
regs->tf_eax = scp->sc_eax;
regs->tf_ebx = scp->sc_ebx;
regs->tf_ecx = scp->sc_ecx;
regs->tf_edx = scp->sc_edx;
regs->tf_esi = scp->sc_esi;
regs->tf_edi = scp->sc_edi;
regs->tf_cs = scp->sc_cs;
regs->tf_ss = scp->sc_ss;
regs->tf_isp = scp->sc_isp;
regs->tf_ebp = scp->sc_fp;
regs->tf_esp = scp->sc_sp;
regs->tf_eip = scp->sc_pc;
regs->tf_eflags = eflags;
#if defined(COMPAT_43)
if (scp->sc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
SIGPROCMASK_OLD);
return (EJUSTRETURN);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
/*
* MPSAFE
*/
int
freebsd4_sigreturn(td, uap)
struct thread *td;
struct freebsd4_sigreturn_args /* {
const ucontext4 *sigcntxp;
} */ *uap;
{
struct ucontext4 uc;
struct trapframe *regs;
struct ucontext4 *ucp;
int cs, eflags, error;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
#endif /* COMPAT_FREEBSD4 */
/*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
ucontext_t uc;
struct trapframe *regs;
ucontext_t *ucp;
int cs, eflags, error, ret;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
ret = set_fpcontext(td, &ucp->uc_mcontext);
if (ret != 0)
return (ret);
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Machine dependent boot() routine
*
* I haven't seen anything to put here yet
* Possibly some stuff might be grafted back here from boot()
*/
void
cpu_boot(int howto)
{
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* Not applicable */
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
uint64_t tsc1, tsc2;
uint64_t acnt, mcnt, perf;
register_t reg;
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
if ((cpu_feature & CPUID_TSC) == 0)
return (EOPNOTSUPP);
/*
* If TSC is P-state invariant and APERF/MPERF MSRs do not exist,
* DELAY(9) based logic fails.
*/
if (tsc_is_invariant && !tsc_perf_stat)
return (EOPNOTSUPP);
#ifdef SMP
if (smp_cpus > 1) {
/* Schedule ourselves on the indicated cpu. */
thread_lock(curthread);
sched_bind(curthread, cpu_id);
thread_unlock(curthread);
}
#endif
/* Calibrate by measuring a short delay. */
reg = intr_disable();
if (tsc_is_invariant) {
wrmsr(MSR_MPERF, 0);
wrmsr(MSR_APERF, 0);
tsc1 = rdtsc();
DELAY(1000);
mcnt = rdmsr(MSR_MPERF);
acnt = rdmsr(MSR_APERF);
tsc2 = rdtsc();
intr_restore(reg);
perf = 1000 * acnt / mcnt;
*rate = (tsc2 - tsc1) * perf;
} else {
tsc1 = rdtsc();
DELAY(1000);
tsc2 = rdtsc();
intr_restore(reg);
*rate = (tsc2 - tsc1) * 1000;
}
#ifdef SMP
if (smp_cpus > 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
#endif
return (0);
}
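/*
 * Illustrative sketch, not part of this change: the arithmetic used above.
 * Over a DELAY(1000) window the raw TSC delta is scaled to Hz, and on
 * invariant-TSC parts the APERF/MPERF ratio corrects for an effective
 * frequency that differs from the TSC rate.  The sample values in the
 * comments are made up for illustration only.
 */
static uint64_t
example_est_rate(uint64_t tsc_delta, uint64_t acnt, uint64_t mcnt)
{
	uint64_t perf;

	perf = 1000 * acnt / mcnt;	/* 1000 when acnt == mcnt */
	return (tsc_delta * perf);	/* e.g. 2200000 * 1000 = 2.2 GHz */
}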
#ifdef XEN
void
cpu_halt(void)
{
HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}
int scheduler_running;
static void
cpu_idle_hlt(int busy)
{
scheduler_running = 1;
enable_intr();
idle_block();
}
#else
/*
* Shutdown the CPU as much as possible
*/
void
cpu_halt(void)
{
for (;;)
__asm__ ("hlt");
}
#endif
void (*cpu_idle_hook)(void) = NULL; /* ACPI idle hook. */
static int cpu_ident_amdc1e = 0; /* AMD C1E supported. */
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
0, "Use MONITOR/MWAIT for short idle");
#define STATE_RUNNING 0x0
#define STATE_MWAIT 0x1
#define STATE_SLEEPING 0x2
static void
cpu_idle_acpi(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
disable_intr();
if (sched_runnable())
enable_intr();
else if (cpu_idle_hook)
cpu_idle_hook();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
#ifndef XEN
static void
cpu_idle_hlt(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
/*
* We must absolutely guarantee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
#endif
/*
* MWAIT cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
static void
cpu_idle_mwait(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
if (!sched_runnable()) {
cpu_monitor(state, 0, 0);
if (*state == STATE_MWAIT)
cpu_mwait(0, MWAIT_C1);
}
*state = STATE_RUNNING;
}
static void
cpu_idle_spin(int busy)
{
int *state;
int i;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
cpu_spinwait();
}
}
/*
* C1E renders the local APIC timer dead, so we disable it by
* reading the Interrupt Pending Message register and clearing
* both C1eOnCmpHalt (bit 28) and SmiOnCmpHalt (bit 27).
*
* Reference:
* "BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh Processors"
* #32559 revision 3.00+
*/
#define MSR_AMDK8_IPM 0xc0010055
#define AMDK8_SMIONCMPHALT (1ULL << 27)
#define AMDK8_C1EONCMPHALT (1ULL << 28)
#define AMDK8_CMPHALT (AMDK8_SMIONCMPHALT | AMDK8_C1EONCMPHALT)
static void
cpu_probe_amdc1e(void)
{
/*
* Detect the presence of the C1E capability, mostly on recent
* dual-core (and later) K8 family processors.
*/
if (cpu_vendor_id == CPU_VENDOR_AMD &&
(cpu_id & 0x00000f00) == 0x00000f00 &&
(cpu_id & 0x0fff0000) >= 0x00040000) {
cpu_ident_amdc1e = 1;
}
}
#ifdef XEN
void (*cpu_idle_fn)(int) = cpu_idle_hlt;
#else
void (*cpu_idle_fn)(int) = cpu_idle_acpi;
#endif
void
cpu_idle(int busy)
{
#ifndef XEN
uint64_t msr;
#endif
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
#if defined(MP_WATCHDOG) && !defined(XEN)
ap_watchdog(PCPU_GET(cpuid));
#endif
#ifndef XEN
/* If we are busy - try to use fast methods. */
if (busy) {
if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
cpu_idle_mwait(busy);
goto out;
}
}
#endif
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
cpu_idleclock();
}
#ifndef XEN
/* Apply AMD APIC timer C1E workaround. */
if (cpu_ident_amdc1e && cpu_disable_deep_sleep) {
msr = rdmsr(MSR_AMDK8_IPM);
if (msr & AMDK8_CMPHALT)
wrmsr(MSR_AMDK8_IPM, msr & ~AMDK8_CMPHALT);
}
#endif
/* Call main idle method. */
cpu_idle_fn(busy);
/* Switch timers back into active mode. */
if (!busy) {
cpu_activeclock();
critical_exit();
}
#ifndef XEN
out:
#endif
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
busy, curcpu);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *state;
pcpu = pcpu_find(cpu);
state = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
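/*
 * A CPU parked in hlt/acpi idle (STATE_SLEEPING) can only be woken by
 * an IPI, so report failure; a CPU in mwait is woken by the store below.
 */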
if (*state == STATE_SLEEPING)
return (0);
if (*state == STATE_MWAIT)
*state = STATE_RUNNING;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_hlt, "hlt" },
{ cpu_idle_acpi, "acpi" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
p += sprintf(p, "%s%s", p != avail ? ", " : "",
idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, "acpi") == 0 &&
cpu_idle_hook == NULL)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
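/*
 * machdep.idle selects the idle routine at run time, e.g.
 * "sysctl machdep.idle=hlt"; machdep.idle_available lists the methods
 * usable on this CPU.
 */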
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
uint64_t (*atomic_load_acq_64)(volatile uint64_t *) =
atomic_load_acq_64_i386;
void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) =
atomic_store_rel_64_i386;
static void
cpu_probe_cmpxchg8b(void)
{
if ((cpu_feature & CPUID_CX8) != 0 ||
cpu_vendor_id == CPU_VENDOR_RISE) {
atomic_load_acq_64 = atomic_load_acq_64_i586;
atomic_store_rel_64 = atomic_store_rel_64_i586;
}
}
/*
* Reset registers to default values on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *regs = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Reset pcb->pcb_gs and %gs before possibly invalidating it. */
pcb->pcb_gs = _udatasel;
load_gs(_udatasel);
mtx_lock_spin(&dt_lock);
if (td->td_proc->p_md.md_ldt)
user_ldt_free(td);
else
mtx_unlock_spin(&dt_lock);
bzero((char *)regs, sizeof(struct trapframe));
regs->tf_eip = imgp->entry_addr;
regs->tf_esp = stack;
regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
regs->tf_ss = _udatasel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_cs = _ucodesel;
/* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
regs->tf_ebx = imgp->ps_strings;
/*
* Reset the hardware debug registers if they were in use.
* They won't have any meaning for the newly exec'd process.
*/
if (pcb->pcb_flags & PCB_DBREGS) {
pcb->pcb_dr0 = 0;
pcb->pcb_dr1 = 0;
pcb->pcb_dr2 = 0;
pcb->pcb_dr3 = 0;
pcb->pcb_dr6 = 0;
pcb->pcb_dr7 = 0;
if (pcb == PCPU_GET(curpcb)) {
/*
* Clear the debug registers on the running
* CPU, otherwise they will end up affecting
* the next process we switch to.
*/
reset_dbregs();
}
pcb->pcb_flags &= ~PCB_DBREGS;
}
/*
* Initialize the math emulator (if any) for the current process.
* Actually, just clear the bit that says that the emulator has
* been initialized. Initialization is delayed until the process
* traps to the emulator (if it is done at all) mainly because
* emulators don't provide an entry point for initialization.
*/
td->td_pcb->pcb_flags &= ~FP_SOFTFP;
pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
/*
* Drop the FP state if we hold it, so that the process gets a
* clean FP state if it uses the FPU again.
*/
fpstate_drop(td);
/*
* XXX - Linux emulator
* Make sure edx is 0x0 on entry. Linux binaries depend
* on it.
*/
td->td_retval[1] = 0;
}
void
cpu_setregs(void)
{
unsigned int cr0;
cr0 = rcr0();
/*
* CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
*
* Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
* instructions. We must set the CR0_MP bit and use the CR0_TS
* bit to control the trap, because setting the CR0_EM bit does
* not cause WAIT instructions to trap. It's important to trap
* WAIT instructions - otherwise the "wait" variants of no-wait
* control instructions would degenerate to the "no-wait" variants
* after FP context switches but work correctly otherwise. It's
* particularly important to trap WAITs when there is no NPX -
* otherwise the "wait" variants would always degenerate.
*
* Try setting CR0_NE to get correct error reporting on 486DX's.
* Setting it should fail or do nothing on lesser processors.
*/
cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
load_cr0(cr0);
load_gs(_udatasel);
}
u_long bootdev; /* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
/*
* Initialize 386 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
int _default_ldt;
#ifdef XEN
union descriptor *gdt;
union descriptor *ldt;
#else
union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
union descriptor ldt[NLDT]; /* local descriptor table */
#endif
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
struct region_descriptor r_gdt, r_idt; /* table descriptors */
struct mtx dt_lock; /* lock for GDT and LDT */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];
extern vm_offset_t proc0kstack;
/*
* software prototypes -- in more palatable form.
*
* GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
* GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = SEL_KPL,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUFS_SEL 2 %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS_SEL 3 %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 6 Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{ .ssd_base = 0x400,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
#ifndef XEN
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{
.ssd_base = 0x0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GLDT_SEL 10 LDT Descriptor */
{ .ssd_base = (int) ldt,
.ssd_limit = sizeof(ldt)-1,
.ssd_type = SDT_SYSLDT,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 User LDT Descriptor per process */
{ .ssd_base = (int) ldt,
.ssd_limit = (512 * sizeof(union descriptor)-1),
.ssd_type = SDT_SYSLDT,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPANIC_SEL 12 Panic Tss Descriptor */
{ .ssd_base = (int) &dblfault_tss,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GNDIS_SEL 18 NDIS Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
#endif /* !XEN */
};
static struct soft_segment_descriptor ldt_segs[] = {
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
};
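/*
 * Install a handler in the IDT: record the handler offset and code
 * segment selector in the gate, set its type and privilege level, and
 * mark the descriptor present.
 */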
void
setidt(idx, func, typ, dpl, selec)
int idx;
inthand_t *func;
int typ;
int dpl;
int selec;
{
struct gate_descriptor *ip;
ip = idt + idx;
ip->gd_looffset = (int)func;
ip->gd_selector = selec;
ip->gd_stkcpy = 0;
ip->gd_xx = 0;
ip->gd_type = typ;
ip->gd_dpl = dpl;
ip->gd_p = 1;
ip->gd_hioffset = ((int)func)>>16 ;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm),
#ifdef KDTRACE_HOOKS
IDTVEC(dtrace_ret),
#endif
IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
DB_SHOW_COMMAND(idt, db_show_idt)
{
struct gate_descriptor *ip;
int idx;
uintptr_t func;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
func = (ip->gd_hioffset << 16 | ip->gd_looffset);
if (func != (uintptr_t)&IDTVEC(rsvd)) {
db_printf("%3d\t", idx);
db_printsym(func, DB_STGY_PROC);
db_printf("\n");
}
ip++;
}
}
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
uint64_t idtr, gdtr;
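/*
 * ridt()/rgdt() return the 6-byte pseudo-descriptor as a 64-bit value:
 * the 16-bit limit in the low word and the 32-bit linear base above it.
 */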
idtr = ridt();
db_printf("idtr\t0x%08x/%04x\n",
(u_int)(idtr >> 16), (u_int)idtr & 0xffff);
gdtr = rgdt();
db_printf("gdtr\t0x%08x/%04x\n",
(u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
db_printf("ldtr\t0x%04x\n", rldt());
db_printf("tr\t0x%04x\n", rtr());
db_printf("cr0\t0x%08x\n", rcr0());
db_printf("cr2\t0x%08x\n", rcr2());
db_printf("cr3\t0x%08x\n", rcr3());
db_printf("cr4\t0x%08x\n", rcr4());
}
#endif
void
sdtossd(sd, ssd)
struct segment_descriptor *sd;
struct soft_segment_descriptor *ssd;
{
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
ssd->ssd_type = sd->sd_type;
ssd->ssd_dpl = sd->sd_dpl;
ssd->ssd_p = sd->sd_p;
ssd->ssd_def32 = sd->sd_def32;
ssd->ssd_gran = sd->sd_gran;
}
#ifndef XEN
static int
add_smap_entry(struct bios_smap *smap, vm_paddr_t *physmap, int *physmap_idxp)
{
int i, insert_idx, physmap_idx;
physmap_idx = *physmap_idxp;
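/*
 * physmap[] is an array of [start, end) physical address pairs;
 * physmap_idx is the index of the start entry of the last pair in use.
 */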
if (boothowto & RB_VERBOSE)
printf("SMAP type=%02x base=%016llx len=%016llx\n",
smap->type, smap->base, smap->length);
if (smap->type != SMAP_TYPE_MEMORY)
return (1);
if (smap->length == 0)
return (1);
#ifndef PAE
if (smap->base > 0xffffffff) {
printf("%uK of memory above 4GB ignored\n",
(u_int)(smap->length / 1024));
return (1);
}
#endif
/*
* Find insertion point while checking for overlap. Start off by
* assuming the new entry will be added to the end.
*/
insert_idx = physmap_idx + 2;
for (i = 0; i <= physmap_idx; i += 2) {
if (smap->base < physmap[i + 1]) {
if (smap->base + smap->length <= physmap[i]) {
insert_idx = i;
break;
}
if (boothowto & RB_VERBOSE)
printf(
"Overlapping memory regions, ignoring second region\n");
return (1);
}
}
/* See if we can prepend to the next entry. */
if (insert_idx <= physmap_idx &&
smap->base + smap->length == physmap[insert_idx]) {
physmap[insert_idx] = smap->base;
return (1);
}
/* See if we can append to the previous entry. */
if (insert_idx > 0 && smap->base == physmap[insert_idx - 1]) {
physmap[insert_idx - 1] += smap->length;
return (1);
}
physmap_idx += 2;
*physmap_idxp = physmap_idx;
if (physmap_idx == PHYSMAP_SIZE) {
printf(
"Too many segments in the physical address map, giving up\n");
return (0);
}
/*
* Move the last 'N' entries down to make room for the new
* entry if needed.
*/
for (i = physmap_idx; i > insert_idx; i -= 2) {
physmap[i] = physmap[i - 2];
physmap[i + 1] = physmap[i - 1];
}
/* Insert the new entry. */
physmap[insert_idx] = smap->base;
physmap[insert_idx + 1] = smap->base + smap->length;
return (1);
}
static void
basemem_setup(void)
{
vm_paddr_t pa;
pt_entry_t *pte;
int i;
if (basemem > 640) {
printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
basemem);
basemem = 640;
}
/*
* XXX if biosbasemem is now < 640, there is a `hole'
* between the end of base memory and the start of
* ISA memory. The hole may be empty or it may
* contain BIOS code or data. Map it read/write so
* that the BIOS can write to it. (Memory from 0 to
* the physical end of the kernel is mapped read-only
* to begin with and then parts of it are remapped.
* The parts that aren't remapped form holes that
* remain read-only and are unused by the kernel.
* The base memory area is below the physical end of
* the kernel and right now forms a read-only hole.
* The part of it from PAGE_SIZE to
* (trunc_page(biosbasemem * 1024) - 1) will be
* remapped and used by the kernel later.)
*
* This code is similar to the code used in
* pmap_mapdev, but since no memory needs to be
* allocated we simply change the mapping.
*/
for (pa = trunc_page(basemem * 1024);
pa < ISA_HOLE_START; pa += PAGE_SIZE)
pmap_kenter(KERNBASE + pa, pa);
/*
* Map pages between basemem and ISA_HOLE_START, if any, r/w into
* the vm86 page table so that vm86 can scribble on them using
* the vm86 map too. XXX: why 2 ways for this and only 1 way for
* page 0, at least as initialized here?
*/
pte = (pt_entry_t *)vm86paddr;
for (i = basemem / 4; i < 160; i++)
pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}
#endif
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
* build the phys_avail array describing the actually-available memory.
*
* If we cannot accurately determine the physical memory map, then use
* the value from the 0xE801 call, and failing that, the RTC.
*
* Total memory size may be set by the kernel environment variable
* hw.physmem or the compile-time define MAXMEM.
*
* XXX first should be vm_paddr_t.
*/
static void
getmemsize(int first)
{
int has_smap, off, physmap_idx, pa_indx, da_indx;
u_long physmem_tunable, memtest;
vm_paddr_t physmap[PHYSMAP_SIZE];
pt_entry_t *pte;
quad_t dcons_addr, dcons_size;
#ifndef XEN
int hasbrokenint12, i;
u_int extmem;
struct vm86frame vmf;
struct vm86context vmc;
vm_paddr_t pa;
struct bios_smap *smap, *smapbase, *smapend;
u_int32_t smapsize;
caddr_t kmdp;
#endif
has_smap = 0;
#if defined(XEN)
Maxmem = xen_start_info->nr_pages - init_first;
physmem = Maxmem;
basemem = 0;
physmap[0] = init_first << PAGE_SHIFT;
physmap[1] = ptoa(Maxmem) - round_page(msgbufsize);
physmap_idx = 0;
#else
#ifdef XBOX
if (arch_i386_is_xbox) {
/*
* We queried the memory size before, so chop off 4MB for
* the framebuffer and inform the OS of this.
*/
physmap[0] = 0;
physmap[1] = (arch_i386_xbox_memsize * 1024 * 1024) - XBOX_FB_SIZE;
physmap_idx = 0;
goto physmap_done;
}
#endif
bzero(&vmf, sizeof(vmf));
bzero(physmap, sizeof(physmap));
basemem = 0;
/*
* Check if the loader supplied an SMAP memory map. If so,
* use that and do not make any VM86 calls.
*/
physmap_idx = 0;
smapbase = NULL;
kmdp = preload_search_by_type("elf kernel");
if (kmdp == NULL)
kmdp = preload_search_by_type("elf32 kernel");
if (kmdp != NULL)
smapbase = (struct bios_smap *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_SMAP);
if (smapbase != NULL) {
/*
* subr_module.c says:
* "Consumer may safely assume that size value precedes data."
* ie: an int32_t immediately precedes SMAP.
*/
smapsize = *((u_int32_t *)smapbase - 1);
smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
has_smap = 1;
for (smap = smapbase; smap < smapend; smap++)
if (!add_smap_entry(smap, physmap, &physmap_idx))
break;
goto have_smap;
}
/*
* Some newer BIOSes have a broken INT 12H implementation
* which causes a kernel panic immediately. In this case, we
* need to use the SMAP to determine the base memory size.
*/
hasbrokenint12 = 0;
TUNABLE_INT_FETCH("hw.hasbrokenint12", &hasbrokenint12);
if (hasbrokenint12 == 0) {
/* Use INT12 to determine base memory size. */
vm86_intcall(0x12, &vmf);
basemem = vmf.vmf_ax;
basemem_setup();
}
/*
* Fetch the memory map with INT 15:E820. Map page 1 R/W into
* the kernel page table so we can use it as a buffer. The
* kernel will unmap this page later.
*/
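/*
 * INT 15h/E820 convention: %eax = 0xE820, %edx = 'SMAP', %ecx = buffer
 * size, %es:%di -> buffer; the BIOS returns a continuation value in
 * %ebx, which is zero once the whole map has been returned.
 */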
pmap_kenter(KERNBASE + (1 << PAGE_SHIFT), 1 << PAGE_SHIFT);
vmc.npages = 0;
smap = (void *)vm86_addpage(&vmc, 1, KERNBASE + (1 << PAGE_SHIFT));
vm86_getptr(&vmc, (vm_offset_t)smap, &vmf.vmf_es, &vmf.vmf_di);
vmf.vmf_ebx = 0;
do {
vmf.vmf_eax = 0xE820;
vmf.vmf_edx = SMAP_SIG;
vmf.vmf_ecx = sizeof(struct bios_smap);
i = vm86_datacall(0x15, &vmf, &vmc);
if (i || vmf.vmf_eax != SMAP_SIG)
break;
has_smap = 1;
if (!add_smap_entry(smap, physmap, &physmap_idx))
break;
} while (vmf.vmf_ebx != 0);
have_smap:
/*
* If we didn't fetch the "base memory" size from INT12,
* figure it out from the SMAP (or just guess).
*/
if (basemem == 0) {
for (i = 0; i <= physmap_idx; i += 2) {
if (physmap[i] == 0x00000000) {
basemem = physmap[i + 1] / 1024;
break;
}
}
/* XXX: If we couldn't find basemem from SMAP, just guess. */
if (basemem == 0)
basemem = 640;
basemem_setup();
}
if (physmap[1] != 0)
goto physmap_done;
/*
* If we failed to find an SMAP, figure out the extended
* memory size. We will then build a simple memory map with
* two segments, one for "base memory" and the second for
* "extended memory". Note that "extended memory" starts at a
* physical address of 1MB and that both basemem and extmem
* are in units of 1KB.
*
* First, try to fetch the extended memory size via INT 15:E801.
*/
vmf.vmf_ax = 0xE801;
if (vm86_intcall(0x15, &vmf) == 0) {
extmem = vmf.vmf_cx + vmf.vmf_dx * 64;
} else {
/*
* If INT15:E801 fails, this is our last ditch effort
* to determine the extended memory size. Currently
* we prefer the RTC value over INT15:88.
*/
#if 0
vmf.vmf_ah = 0x88;
vm86_intcall(0x15, &vmf);
extmem = vmf.vmf_ax;
#else
extmem = rtcin(RTC_EXTLO) + (rtcin(RTC_EXTHI) << 8);
#endif
}
/*
* Special hack for chipsets that still remap the 384k hole when
* there's 16MB of memory - this really confuses people that
* are trying to use bus mastering ISA controllers with the
* "16MB limit"; they only have 16MB, but the remapping puts
* them beyond the limit.
*
* If extended memory is between 15-16MB (16-17MB phys address range),
* chop it to 15MB.
*/
if ((extmem > 15 * 1024) && (extmem < 16 * 1024))
extmem = 15 * 1024;
physmap[0] = 0;
physmap[1] = basemem * 1024;
physmap_idx = 2;
physmap[physmap_idx] = 0x100000;
physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
physmap_done:
#endif
/*
* Now, physmap contains a map of physical memory.
*/
#ifdef SMP
/* make hole for AP bootstrap code */
physmap[1] = mp_bootaddress(physmap[1]);
#endif
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage". We may adjust this
* based on ``hw.physmem'' and the results of the memory test.
*/
Maxmem = atop(physmap[physmap_idx + 1]);
#ifdef MAXMEM
Maxmem = MAXMEM / 4;
#endif
if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);
/*
* If we have an SMAP, don't allow MAXMEM or hw.physmem to extend
* the amount of memory in the system.
*/
if (has_smap && Maxmem > atop(physmap[physmap_idx + 1]))
Maxmem = atop(physmap[physmap_idx + 1]);
/*
* By default keep the memtest enabled. Use a general name so that
* one could eventually do more with the code than just disable it.
*/
memtest = 1;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);
/*
* If Maxmem has been increased beyond what the system has detected,
* extend the last memory segment to the new limit.
*/
if (atop(physmap[physmap_idx + 1]) < Maxmem)
physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(first);
/*
* Size up each available chunk of physical memory.
*/
physmap[0] = PAGE_SIZE; /* mask off page 0 */
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
pte = CMAP1;
/*
* Get dcons buffer address
*/
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;
#ifndef XEN
/*
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr = (int *)CADDR1;
full = FALSE;
/*
* block out kernel memory as not available.
*/
if (pa >= KERNLOAD && pa < first)
goto do_dump_avail;
/*
* block out dcons buffer
*/
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;
page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;
/*
* map the page into the kernel: valid, read/write, non-cacheable
*/
*pte = pa | PG_V | PG_RW | PG_N;
invltlb();
tmp = *(int *)ptr;
/*
* Test for alternating 1's and 0's
*/
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
* Test for alternating 0's and 1's
*/
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
* Test for all 1's
*/
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
* Test for all 0's
*/
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
* Restore original value.
*/
*(int *)ptr = tmp;
skip_memtest:
/*
* Adjust array of valid/good pages.
*/
if (page_bad == TRUE)
continue;
/*
* If this good page is a continuation of the
* previous set of good pages, then just increase
* the end pointer. Otherwise start a new chunk.
* Note that "end" points one higher than end,
* making the range >= start and < end.
* If we're also doing a speculative memory
* test and we are at or past the end, bump up Maxmem
* so that we keep going. The first bad page
* will terminate the loop.
*/
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ARRAY_END) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == DUMP_AVAIL_ARRAY_END) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
if (full)
break;
}
}
*pte = 0;
invltlb();
#else
phys_avail[0] = physfree;
phys_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
dump_avail[0] = 0;
dump_avail[1] = xen_start_info->nr_pages*PAGE_SIZE;
#endif
/*
* XXX
* The last chunk must contain at least one page plus the message
* buffer to avoid complicating other code (message buffer address
* calculation, etc.).
*/
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}
Maxmem = atop(phys_avail[pa_indx]);
/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);
/* Map the message buffer. */
for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
off);
PT_UPDATES_FLUSH();
}
#ifdef XEN
#define MTOPSIZE (1<<(14 + PAGE_SHIFT))
void
init386(first)
int first;
{
unsigned long gdtmachpfn;
int error, gsel_tss, metadata_missing, x, pa;
size_t kstack0_sz;
struct pcpu *pc;
struct callback_register event = {
.type = CALLBACKTYPE_event,
.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)Xhypervisor_callback },
};
struct callback_register failsafe = {
.type = CALLBACKTYPE_failsafe,
.address = {GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback },
};
thread0.td_kstack = proc0kstack;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
metadata_missing = 0;
if (xen_start_info->mod_start) {
preload_metadata = (caddr_t)xen_start_info->mod_start;
preload_bootstrap_relocate(KERNBASE);
} else {
metadata_missing = 1;
}
if (envmode == 1)
kern_envp = static_env;
else if ((caddr_t)xen_start_info->cmd_line)
kern_envp = xen_setbootenv((caddr_t)xen_start_info->cmd_line);
boothowto |= xen_boothowto(kern_envp);
/* Init basic tunables, hz etc */
init_param1();
/*
* XEN occupies a portion of the upper virtual address space.
* At its base it manages an array mapping machine page frames
* to physical page frames - hence we need to be able to
* access 4GB - (64MB - 4MB + 64k)
*/
gdt_segs[GPRIV_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUFS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUGS_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUCODE_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GUDATA_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
gdt_segs[GBIOSLOWMEM_SEL].ssd_limit = atop(HYPERVISOR_VIRT_START + MTOPSIZE);
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V | PG_RW);
bzero(gdt, PAGE_SIZE);
for (x = 0; x < NGDT; x++)
ssdtosd(&gdt_segs[x], &gdt[x].sd);
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
gdtmachpfn = vtomach(gdt) >> PAGE_SHIFT;
PT_SET_MA(gdt, xpmap_ptom(VTOP(gdt)) | PG_V);
PANIC_IF(HYPERVISOR_set_gdt(&gdtmachpfn, 512) != 0);
lgdt(&r_gdt);
gdtset = 1;
if ((error = HYPERVISOR_set_trap_table(trap_table)) != 0) {
panic("set_trap_table failed - error %d\n", error);
}
error = HYPERVISOR_callback_op(CALLBACKOP_register, &event);
if (error == 0)
error = HYPERVISOR_callback_op(CALLBACKOP_register, &failsafe);
#if CONFIG_XEN_COMPAT <= 0x030002
if (error == -ENOXENSYS)
HYPERVISOR_set_callbacks(GSEL(GCODE_SEL, SEL_KPL),
(unsigned long)Xhypervisor_callback,
GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
#endif
pcpu_init(pc, 0, sizeof(struct pcpu));
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
pmap_kenter(pa + KERNBASE, pa);
dpcpu_init((void *)(first + KERNBASE), 0);
first += DPCPU_SIZE;
physfree += DPCPU_SIZE;
init_first += DPCPU_SIZE / PAGE_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
/* make ldt memory segments */
PT_SET_MA(ldt, xpmap_ptom(VTOP(ldt)) | PG_V | PG_RW);
bzero(ldt, PAGE_SIZE);
ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
ssdtosd(&ldt_segs[x], &ldt[x].sd);
default_proc_ldt.ldt_base = (caddr_t)ldt;
default_proc_ldt.ldt_len = 6;
_default_ldt = (int)&default_proc_ldt;
PCPU_SET(currentldt, _default_ldt);
PT_SET_MA(ldt, *vtopte((unsigned long)ldt) & ~PG_RW);
xen_set_ldt((unsigned long) ldt, (sizeof ldt_segs / sizeof ldt_segs[0]));
#if defined(XEN_PRIVILEGED)
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
#endif
/*
* Initialize the console before we print anything out.
*/
cninit();
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
#ifdef DEV_ISA
elcr_probe();
atpic_startup();
#endif
#ifdef DDB
ksym_start = bootinfo.bi_symtab;
ksym_end = bootinfo.bi_esymtab;
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
finishidentcpu(); /* Final stage of CPU initialization */
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
/* make an initial tss so cpu can get interrupt stack on syscall! */
/* Note: -16 is so we can grow the trapframe if we came from vm86 */
PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
kstack0_sz - sizeof(struct pcb) - 16);
PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL),
PCPU_GET(common_tss.tss_esp0));
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
dblfault_tss.tss_eip = (int)dblfault_handler;
dblfault_tss.tss_eflags = PSL_KERNEL;
dblfault_tss.tss_ds = dblfault_tss.tss_es =
dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
vm86_initialize();
getmemsize(first);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
thread0.td_pcb->pcb_ext = 0;
thread0.td_frame = &proc0_tf;
thread0.td_pcb->pcb_fsd = PCPU_GET(fsgs_gdt)[0];
thread0.td_pcb->pcb_gsd = PCPU_GET(fsgs_gdt)[1];
cpu_probe_amdc1e();
cpu_probe_cmpxchg8b();
}
#else
void
init386(first)
int first;
{
struct gate_descriptor *gdp;
int gsel_tss, metadata_missing, x, pa;
size_t kstack0_sz;
struct pcpu *pc;
thread0.td_kstack = proc0kstack;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
metadata_missing = 0;
if (bootinfo.bi_modulep) {
preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
preload_bootstrap_relocate(KERNBASE);
} else {
metadata_missing = 1;
}
if (envmode == 1)
kern_envp = static_env;
else if (bootinfo.bi_envp)
kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
/* Init basic tunables, hz etc */
init_param1();
/*
* Make gdt memory segments. All segments cover the full 4GB
* of address space and permissions are enforced at page level.
*/
gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
for (x = 0; x < NGDT; x++)
ssdtosd(&gdt_segs[x], &gdt[x].sd);
r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
r_gdt.rd_base = (int) gdt;
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
lgdt(&r_gdt);
pcpu_init(pc, 0, sizeof(struct pcpu));
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
pmap_kenter(pa + KERNBASE, pa);
dpcpu_init((void *)(first + KERNBASE), 0);
first += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
/* make ldt memory segments */
ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
ssdtosd(&ldt_segs[x], &ldt[x].sd);
_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
lldt(_default_ldt);
PCPU_SET(currentldt, _default_ldt);
/* exceptions */
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
, GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#ifdef KDTRACE_HOOKS
setidt(IDT_DTRACE_RET, &IDTVEC(dtrace_ret), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
#endif
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (int) idt;
lidt(&r_idt);
#ifdef XBOX
/*
* The following code queries the PCI ID of 0:0:0. For the XBOX,
* this should be 0x10de / 0x02a5.
*
* This is exactly what Linux does.
*/
outl(0xcf8, 0x80000000);
if (inl(0xcfc) == 0x02a510de) {
arch_i386_is_xbox = 1;
pic16l_setled(XBOX_LED_GREEN);
/*
* We are an XBOX, but we may have either 64MB or 128MB of
* memory. The PCI host bridge should be programmed for this,
* so we just query it.
*/
outl(0xcf8, 0x80000084);
arch_i386_xbox_memsize = (inl(0xcfc) == 0x7FFFFFF) ? 128 : 64;
}
#endif /* XBOX */
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
/*
* Initialize the console before we print anything out.
*/
cninit();
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
#ifdef DEV_ISA
elcr_probe();
atpic_startup();
#endif
#ifdef DDB
ksym_start = bootinfo.bi_symtab;
ksym_end = bootinfo.bi_esymtab;
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
finishidentcpu(); /* Final stage of CPU initialization */
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
/* make an initial tss so cpu can get interrupt stack on syscall! */
/* Note: -16 is so we can grow the trapframe if we came from vm86 */
PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
kstack0_sz - sizeof(struct pcb) - 16);
PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
ltr(gsel_tss);
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
#ifdef PAE
dblfault_tss.tss_cr3 = (int)IdlePDPT;
#else
dblfault_tss.tss_cr3 = (int)IdlePTD;
#endif
dblfault_tss.tss_eip = (int)dblfault_handler;
dblfault_tss.tss_eflags = PSL_KERNEL;
dblfault_tss.tss_ds = dblfault_tss.tss_es =
dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
vm86_initialize();
getmemsize(first);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
/* make a call gate to reenter kernel with */
gdp = &ldt[LSYS5CALLS_SEL].gd;
x = (int) &IDTVEC(lcall_syscall);
gdp->gd_looffset = x;
gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
gdp->gd_stkcpy = 1;
gdp->gd_type = SDT_SYS386CGT;
gdp->gd_dpl = SEL_UPL;
gdp->gd_p = 1;
gdp->gd_hioffset = x >> 16;
/* XXX does this work? */
/* XXX yes! */
ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
#ifdef PAE
thread0.td_pcb->pcb_cr3 = (int)IdlePDPT;
#else
thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
#endif
thread0.td_pcb->pcb_ext = 0;
thread0.td_frame = &proc0_tf;
cpu_probe_amdc1e();
cpu_probe_cmpxchg8b();
}
#endif
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
pcpu->pc_acpi_id = 0xffffffff;
}
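/*
 * spinlock_enter()/spinlock_exit() keep a per-thread nesting count:
 * interrupts are disabled and the flags saved only on the outermost
 * enter, and restored only when the count drops back to zero.
 */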
void
spinlock_enter(void)
{
struct thread *td;
register_t flags;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
flags = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = flags;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t flags;
td = curthread;
critical_exit();
flags = td->td_md.md_saved_flags;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(flags);
}
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
static void
f00f_hack(void *unused)
{
struct gate_descriptor *new_idt;
vm_offset_t tmp;
if (!has_f00f_bug)
return;
GIANT_REQUIRED;
printf("Intel Pentium detected, installing workaround for F00F bug\n");
tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
if (tmp == 0)
panic("kmem_alloc returned 0");
/* Put the problematic entry (#6) at the end of the lower page. */
new_idt = (struct gate_descriptor*)
(tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
bcopy(idt, new_idt, sizeof(idt0));
r_idt.rd_base = (u_int)new_idt;
lidt(&r_idt);
idt = new_idt;
if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
VM_PROT_READ, FALSE) != KERN_SUCCESS)
panic("vm_map_protect failed");
}
#endif /* defined(I586_CPU) && !defined(NO_F00F_HACK) */
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_edi = tf->tf_edi;
pcb->pcb_esi = tf->tf_esi;
pcb->pcb_ebp = tf->tf_ebp;
pcb->pcb_ebx = tf->tf_ebx;
pcb->pcb_eip = tf->tf_eip;
pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{
td->td_frame->tf_eip = addr;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_eflags |= PSL_T;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_eflags &= ~PSL_T;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
pcb = td->td_pcb;
regs->r_gs = pcb->pcb_gs;
return (fill_frame_regs(tp, regs));
}
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
regs->r_fs = tp->tf_fs;
regs->r_es = tp->tf_es;
regs->r_ds = tp->tf_ds;
regs->r_edi = tp->tf_edi;
regs->r_esi = tp->tf_esi;
regs->r_ebp = tp->tf_ebp;
regs->r_ebx = tp->tf_ebx;
regs->r_edx = tp->tf_edx;
regs->r_ecx = tp->tf_ecx;
regs->r_eax = tp->tf_eax;
regs->r_eip = tp->tf_eip;
regs->r_cs = tp->tf_cs;
regs->r_eflags = tp->tf_eflags;
regs->r_esp = tp->tf_esp;
regs->r_ss = tp->tf_ss;
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
!CS_SECURE(regs->r_cs))
return (EINVAL);
pcb = td->td_pcb;
tp->tf_fs = regs->r_fs;
tp->tf_es = regs->r_es;
tp->tf_ds = regs->r_ds;
tp->tf_edi = regs->r_edi;
tp->tf_esi = regs->r_esi;
tp->tf_ebp = regs->r_ebp;
tp->tf_ebx = regs->r_ebx;
tp->tf_edx = regs->r_edx;
tp->tf_ecx = regs->r_ecx;
tp->tf_eax = regs->r_eax;
tp->tf_eip = regs->r_eip;
tp->tf_cs = regs->r_cs;
tp->tf_eflags = regs->r_eflags;
tp->tf_esp = regs->r_esp;
tp->tf_ss = regs->r_ss;
pcb->pcb_gs = regs->r_gs;
return (0);
}
#ifdef CPU_ENABLE_SSE
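/*
 * Convert between the FXSAVE (savexmm) layout and the legacy FNSAVE
 * (save87) layout so that fpreg consumers always see the classic
 * 80x87 environment and register stack.
 */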
static void
fill_fpregs_xmm(sv_xmm, sv_87)
struct savexmm *sv_xmm;
struct save87 *sv_87;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
bzero(sv_87, sizeof(*sv_87));
/* FPU control/status */
penv_87->en_cw = penv_xmm->en_cw;
penv_87->en_sw = penv_xmm->en_sw;
penv_87->en_tw = penv_xmm->en_tw;
penv_87->en_fip = penv_xmm->en_fip;
penv_87->en_fcs = penv_xmm->en_fcs;
penv_87->en_opcode = penv_xmm->en_opcode;
penv_87->en_foo = penv_xmm->en_foo;
penv_87->en_fos = penv_xmm->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
static void
set_fpregs_xmm(sv_87, sv_xmm)
struct save87 *sv_87;
struct savexmm *sv_xmm;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
/* FPU control/status */
penv_xmm->en_cw = penv_87->en_cw;
penv_xmm->en_sw = penv_87->en_sw;
penv_xmm->en_tw = penv_87->en_tw;
penv_xmm->en_fip = penv_87->en_fip;
penv_xmm->en_fcs = penv_87->en_fcs;
penv_xmm->en_opcode = penv_87->en_opcode;
penv_xmm->en_foo = penv_87->en_foo;
penv_xmm->en_fos = penv_87->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
KASSERT(td == curthread || TD_IS_SUSPENDED(td),
("not suspended thread %p", td));
#ifdef DEV_NPX
npxgetregs(td);
#else
bzero(fpregs, sizeof(*fpregs));
#endif
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
(struct save87 *)fpregs);
else
#endif /* CPU_ENABLE_SSE */
bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
sizeof(*fpregs));
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
set_fpregs_xmm((struct save87 *)fpregs,
&td->td_pcb->pcb_user_save.sv_xmm);
else
#endif /* CPU_ENABLE_SSE */
bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
sizeof(*fpregs));
#ifdef DEV_NPX
npxuserinited(td);
#endif
return (0);
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct trapframe *tp;
struct segment_descriptor *sdp;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->tf_esp);
PROC_UNLOCK(curthread->td_proc);
mcp->mc_gs = td->td_pcb->pcb_gs;
mcp->mc_fs = tp->tf_fs;
mcp->mc_es = tp->tf_es;
mcp->mc_ds = tp->tf_ds;
mcp->mc_edi = tp->tf_edi;
mcp->mc_esi = tp->tf_esi;
mcp->mc_ebp = tp->tf_ebp;
mcp->mc_isp = tp->tf_isp;
mcp->mc_eflags = tp->tf_eflags;
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_eax = 0;
mcp->mc_edx = 0;
mcp->mc_eflags &= ~PSL_C;
} else {
mcp->mc_eax = tp->tf_eax;
mcp->mc_edx = tp->tf_edx;
}
mcp->mc_ebx = tp->tf_ebx;
mcp->mc_ecx = tp->tf_ecx;
mcp->mc_eip = tp->tf_eip;
mcp->mc_cs = tp->tf_cs;
mcp->mc_esp = tp->tf_esp;
mcp->mc_ss = tp->tf_ss;
mcp->mc_len = sizeof(*mcp);
get_fpcontext(td, mcp);
sdp = &td->td_pcb->pcb_fsd;
mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
bzero(mcp->mc_spare1, sizeof(mcp->mc_spare1));
bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tp;
int eflags, ret;
tp = td->td_frame;
if (mcp->mc_len != sizeof(*mcp))
return (EINVAL);
eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
(tp->tf_eflags & ~PSL_USERCHANGE);
if ((ret = set_fpcontext(td, mcp)) == 0) {
tp->tf_fs = mcp->mc_fs;
tp->tf_es = mcp->mc_es;
tp->tf_ds = mcp->mc_ds;
tp->tf_edi = mcp->mc_edi;
tp->tf_esi = mcp->mc_esi;
tp->tf_ebp = mcp->mc_ebp;
tp->tf_ebx = mcp->mc_ebx;
tp->tf_edx = mcp->mc_edx;
tp->tf_ecx = mcp->mc_ecx;
tp->tf_eax = mcp->mc_eax;
tp->tf_eip = mcp->mc_eip;
tp->tf_eflags = eflags;
tp->tf_esp = mcp->mc_esp;
tp->tf_ss = mcp->mc_ss;
td->td_pcb->pcb_gs = mcp->mc_gs;
ret = 0;
}
return (ret);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifndef DEV_NPX
mcp->mc_fpformat = _MC_FPFMT_NODEV;
mcp->mc_ownedfp = _MC_FPOWNED_NONE;
bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
#else
mcp->mc_ownedfp = npxgetregs(td);
bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
sizeof(mcp->mc_fpstate));
mcp->mc_fpformat = npxformat();
#endif
}
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
return (0);
else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
mcp->mc_fpformat != _MC_FPFMT_XMM)
return (EINVAL);
else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
/* We don't care what state is left in the FPU or PCB. */
fpstate_drop(td);
else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
#ifdef DEV_NPX
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
en_mxcsr &= cpu_mxcsr_mask;
#endif
npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
#endif
} else
return (EINVAL);
return (0);
}
static void
fpstate_drop(struct thread *td)
{
KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
critical_enter();
#ifdef DEV_NPX
if (PCPU_GET(fpcurthread) == td)
npxdrop();
#endif
/*
* XXX force a full drop of the npx. The above only drops it if we
* owned it. npxgetregs() has the same bug in the !cpu_fxsr case.
*
* XXX I don't much like npxgetregs()'s semantics of doing a full
* drop. Dropping only to the pcb matches fnsave's behaviour.
* We only need to drop to !PCB_INITDONE in sendsig(). But
* sendsig() is the only caller of npxgetregs()... perhaps we just
* have too many layers.
*/
curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
PCB_NPXUSERINITDONE);
critical_exit();
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
if (td == NULL) {
dbregs->dr[0] = rdr0();
dbregs->dr[1] = rdr1();
dbregs->dr[2] = rdr2();
dbregs->dr[3] = rdr3();
dbregs->dr[4] = rdr4();
dbregs->dr[5] = rdr5();
dbregs->dr[6] = rdr6();
dbregs->dr[7] = rdr7();
} else {
pcb = td->td_pcb;
dbregs->dr[0] = pcb->pcb_dr0;
dbregs->dr[1] = pcb->pcb_dr1;
dbregs->dr[2] = pcb->pcb_dr2;
dbregs->dr[3] = pcb->pcb_dr3;
dbregs->dr[4] = 0;
dbregs->dr[5] = 0;
dbregs->dr[6] = pcb->pcb_dr6;
dbregs->dr[7] = pcb->pcb_dr7;
}
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
int i;
if (td == NULL) {
load_dr0(dbregs->dr[0]);
load_dr1(dbregs->dr[1]);
load_dr2(dbregs->dr[2]);
load_dr3(dbregs->dr[3]);
load_dr4(dbregs->dr[4]);
load_dr5(dbregs->dr[5]);
load_dr6(dbregs->dr[6]);
load_dr7(dbregs->dr[7]);
} else {
/*
* Don't let an illegal value for dr7 get set. Specifically,
* check for undefined settings. Setting these bit patterns
* results in undefined behaviour and can lead to an unexpected
* TRCTRAP.
*/
for (i = 0; i < 4; i++) {
if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
return (EINVAL);
if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
return (EINVAL);
}
pcb = td->td_pcb;
/*
* Don't let a process set a breakpoint that is not within the
* process's address space. If a process could do this, it
* could halt the system by setting a breakpoint in the kernel
* (if ddb was enabled). Thus, we need to check to make sure
* that no breakpoints are being enabled for addresses outside
* the process's address space.
*
* XXX - what about when the watched area of the user's
* address space is written into from within the kernel
* ... wouldn't that still cause a breakpoint to be generated
* from within kernel mode?
*/
if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
/* dr0 is enabled */
if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
/* dr1 is enabled */
if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
/* dr2 is enabled */
if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
/* dr3 is enabled */
if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
pcb->pcb_dr0 = dbregs->dr[0];
pcb->pcb_dr1 = dbregs->dr[1];
pcb->pcb_dr2 = dbregs->dr[2];
pcb->pcb_dr3 = dbregs->dr[3];
pcb->pcb_dr6 = dbregs->dr[6];
pcb->pcb_dr7 = dbregs->dr[7];
pcb->pcb_flags |= PCB_DBREGS;
}
return (0);
}
/*
* Return > 0 if a hardware breakpoint has been hit, and the
* breakpoint was in user space. Return 0, otherwise.
*/
int
user_dbreg_trap(void)
{
u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
u_int32_t bp; /* breakpoint bits extracted from dr6 */
int nbp; /* number of breakpoints that triggered */
caddr_t addr[4]; /* breakpoint addresses */
int i;
dr7 = rdr7();
if ((dr7 & 0x000000ff) == 0) {
/*
* none of the local/global breakpoint enable bits (L0-L3, G0-G3)
* in the dr7 register are set, so the trap cannot have been
* caused by the hardware debug registers
*/
return 0;
}
nbp = 0;
dr6 = rdr6();
bp = dr6 & 0x0000000f;
if (!bp) {
/*
* None of the breakpoint status bits are set, meaning this
* trap was not caused by any of the debug registers
*/
return 0;
}
/*
* at least one of the breakpoints was hit; check to see
* which ones and if any of them are user space addresses
*/
if (bp & 0x01) {
addr[nbp++] = (caddr_t)rdr0();
}
if (bp & 0x02) {
addr[nbp++] = (caddr_t)rdr1();
}
if (bp & 0x04) {
addr[nbp++] = (caddr_t)rdr2();
}
if (bp & 0x08) {
addr[nbp++] = (caddr_t)rdr3();
}
for (i = 0; i < nbp; i++) {
if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
/*
* addr[i] is in user space
*/
return nbp;
}
}
/*
* None of the breakpoints are in user space.
*/
return 0;
}
#ifdef KDB
/*
* Provide inb() and outb() as functions. They are normally only available as
* inline functions, thus cannot be called from the debugger.
*/
/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);
u_char
inb_(u_short port)
{
return inb(port);
}
void
outb_(u_short port, u_char data)
{
outb(port, data);
}
#endif /* KDB */
Index: head/sys/i386/i386/trap.c
===================================================================
--- head/sys/i386/i386/trap.c (revision 225616)
+++ head/sys/i386/i386/trap.c (revision 225617)
@@ -1,1106 +1,1106 @@
/*-
* Copyright (C) 1994, David Greenman
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* 386 Trap and System call handling
*/
#include "opt_clock.h"
#include "opt_cpu.h"
#include "opt_hwpmc_hooks.h"
#include "opt_isa.h"
#include "opt_kdb.h"
#include "opt_kdtrace.h"
#include "opt_npx.h"
#include "opt_trap.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/ptrace.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/uio.h>
#include <sys/vmmeter.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#include <security/audit/audit.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <machine/cpu.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/tss.h>
#include <machine/vm86.h>
#ifdef POWERFAIL_NMI
#include <sys/syslog.h>
#include <machine/clock.h>
#endif
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
/*
* This is a hook which is initialised by the dtrace module
* to handle traps which might occur during DTrace probe
* execution.
*/
dtrace_trap_func_t dtrace_trap_func;
dtrace_doubletrap_func_t dtrace_doubletrap_func;
/*
* This is a hook which is initialised by the systrace module
* when it is loaded. This keeps the DTrace syscall provider
* implementation opaque.
*/
systrace_probe_func_t systrace_probe_func;
/*
* These hooks are necessary for the pid, usdt and fasttrap providers.
*/
dtrace_fasttrap_probe_ptr_t dtrace_fasttrap_probe_ptr;
dtrace_pid_probe_ptr_t dtrace_pid_probe_ptr;
dtrace_return_probe_ptr_t dtrace_return_probe_ptr;
#endif
extern void trap(struct trapframe *frame);
extern void syscall(struct trapframe *frame);
static int trap_pfault(struct trapframe *, int, vm_offset_t);
static void trap_fatal(struct trapframe *, vm_offset_t);
void dblfault_handler(void);
extern inthand_t IDTVEC(lcall_syscall);
#define MAX_TRAP_MSG 30
static char *trap_msg[] = {
"", /* 0 unused */
"privileged instruction fault", /* 1 T_PRIVINFLT */
"", /* 2 unused */
"breakpoint instruction fault", /* 3 T_BPTFLT */
"", /* 4 unused */
"", /* 5 unused */
"arithmetic trap", /* 6 T_ARITHTRAP */
"", /* 7 unused */
"", /* 8 unused */
"general protection fault", /* 9 T_PROTFLT */
"trace trap", /* 10 T_TRCTRAP */
"", /* 11 unused */
"page fault", /* 12 T_PAGEFLT */
"", /* 13 unused */
"alignment fault", /* 14 T_ALIGNFLT */
"", /* 15 unused */
"", /* 16 unused */
"", /* 17 unused */
"integer divide fault", /* 18 T_DIVIDE */
"non-maskable interrupt trap", /* 19 T_NMI */
"overflow trap", /* 20 T_OFLOW */
"FPU bounds check fault", /* 21 T_BOUND */
"FPU device not available", /* 22 T_DNA */
"double fault", /* 23 T_DOUBLEFLT */
"FPU operand fetch fault", /* 24 T_FPOPFLT */
"invalid TSS fault", /* 25 T_TSSFLT */
"segment not present fault", /* 26 T_SEGNPFLT */
"stack fault", /* 27 T_STKFLT */
"machine check trap", /* 28 T_MCHK */
"SIMD floating-point exception", /* 29 T_XMMFLT */
"reserved (unknown) fault", /* 30 T_RESERVED */
};
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif
#ifdef KDB
static int kdb_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, kdb_on_nmi, CTLFLAG_RW,
&kdb_on_nmi, 0, "Go to KDB on NMI");
TUNABLE_INT("machdep.kdb_on_nmi", &kdb_on_nmi);
#endif
static int panic_on_nmi = 1;
SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
&panic_on_nmi, 0, "Panic on NMI");
TUNABLE_INT("machdep.panic_on_nmi", &panic_on_nmi);
static int prot_fault_translation = 0;
SYSCTL_INT(_machdep, OID_AUTO, prot_fault_translation, CTLFLAG_RW,
&prot_fault_translation, 0, "Select signal to deliver on protection fault");
/*
* Exception, fault, and trap interface to the FreeBSD kernel.
* This common code is called from assembly language IDT gate entry
* routines that prepare a suitable stack frame, and restore this
* frame after the exception has been processed.
*/
void
trap(struct trapframe *frame)
{
struct thread *td = curthread;
struct proc *p = td->td_proc;
int i = 0, ucode = 0, code;
u_int type;
register_t addr = 0;
vm_offset_t eva;
ksiginfo_t ksi;
#ifdef POWERFAIL_NMI
static int lastalert = 0;
#endif
PCPU_INC(cnt.v_trap);
type = frame->tf_trapno;
#ifdef SMP
/* Handler for NMI IPIs used for stopping CPUs. */
if (type == T_NMI) {
if (ipi_nmi_handler() == 0)
goto out;
}
#endif /* SMP */
#ifdef KDB
if (kdb_active) {
kdb_reenter();
goto out;
}
#endif
if (type == T_RESERVED) {
trap_fatal(frame, 0);
goto out;
}
#ifdef HWPMC_HOOKS
/*
* CPU PMCs interrupt using an NMI so we check for that first.
* If the HWPMC module is active, 'pmc_hook' will point to
* the function to be called. A return value of '1' from the
* hook means that the NMI was handled by it and that we can
* return immediately.
*/
if (type == T_NMI && pmc_intr &&
(*pmc_intr)(PCPU_GET(cpuid), frame))
goto out;
#endif
if (type == T_MCHK) {
if (!mca_intr())
trap_fatal(frame, 0);
goto out;
}
#ifdef KDTRACE_HOOKS
/*
* A trap can occur while DTrace executes a probe. Before
* executing the probe, DTrace blocks re-scheduling and sets
* a flag in its per-cpu flags to indicate that it doesn't
* want to fault. On returning from the probe, the no-fault
* flag is cleared and finally re-scheduling is enabled.
*
* If the DTrace kernel module has registered a trap handler,
* call it and if it returns non-zero, assume that it has
* handled the trap and modified the trap frame so that this
* function can return normally.
*/
if ((type == T_PROTFLT || type == T_PAGEFLT) &&
dtrace_trap_func != NULL)
if ((*dtrace_trap_func)(frame, type))
goto out;
if (type == T_DTRACE_PROBE || type == T_DTRACE_RET ||
type == T_BPTFLT) {
struct reg regs;
fill_frame_regs(frame, &regs);
if (type == T_DTRACE_PROBE &&
dtrace_fasttrap_probe_ptr != NULL &&
dtrace_fasttrap_probe_ptr(&regs) == 0)
goto out;
if (type == T_BPTFLT &&
dtrace_pid_probe_ptr != NULL &&
dtrace_pid_probe_ptr(&regs) == 0)
goto out;
if (type == T_DTRACE_RET &&
dtrace_return_probe_ptr != NULL &&
dtrace_return_probe_ptr(&regs) == 0)
goto out;
}
#endif
if ((frame->tf_eflags & PSL_I) == 0) {
/*
* Buggy application or kernel code has disabled
* interrupts and then trapped. Enabling interrupts
* now is wrong, but it is better than running with
* interrupts disabled until they are accidentally
* enabled later.
*/
if (ISPL(frame->tf_cs) == SEL_UPL || (frame->tf_eflags & PSL_VM))
uprintf(
"pid %ld (%s): trap %d with interrupts disabled\n",
(long)curproc->p_pid, curthread->td_name, type);
else if (type != T_BPTFLT && type != T_TRCTRAP &&
frame->tf_eip != (int)cpu_switch_load_gs) {
/*
* XXX not quite right, since this may be for a
* multiple fault in user mode.
*/
printf("kernel trap %d with interrupts disabled\n",
type);
/*
* Page faults need interrupts disabled until later,
* and we shouldn't enable interrupts while holding
* a spin lock or if servicing an NMI.
*/
if (type != T_NMI && type != T_PAGEFLT &&
td->td_md.md_spinlock_count == 0)
enable_intr();
}
}
eva = 0;
code = frame->tf_err;
if (type == T_PAGEFLT) {
/*
* For some Cyrix CPUs, %cr2 is clobbered by
* interrupts. This problem is worked around by using
* an interrupt gate for the pagefault handler. We
* are finally ready to read %cr2 and then must
* reenable interrupts.
*
* If we get a page fault while in a critical section, then
* it is most likely a fatal kernel page fault. The kernel
* is already going to panic trying to get a sleep lock to
* do the VM lookup, so just consider it a fatal trap so the
* kernel can print out a useful trap message and even get
* to the debugger.
*
* If we get a page fault while holding a non-sleepable
* lock, then it is most likely a fatal kernel page fault.
* If WITNESS is enabled, then it's going to whine about
* bogus LORs with various VM locks, so just skip to the
* fatal trap handling directly.
*/
eva = rcr2();
if (td->td_critnest != 0 ||
WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
"Kernel page fault") != 0)
trap_fatal(frame, eva);
else
enable_intr();
}
if ((ISPL(frame->tf_cs) == SEL_UPL) ||
((frame->tf_eflags & PSL_VM) &&
!(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL))) {
/* user trap */
td->td_pticks = 0;
td->td_frame = frame;
addr = frame->tf_eip;
if (td->td_ucred != p->p_ucred)
cred_update_thread(td);
switch (type) {
case T_PRIVINFLT: /* privileged instruction fault */
i = SIGILL;
ucode = ILL_PRVOPC;
break;
case T_BPTFLT: /* bpt instruction fault */
case T_TRCTRAP: /* trace trap */
enable_intr();
frame->tf_eflags &= ~PSL_T;
i = SIGTRAP;
ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
break;
case T_ARITHTRAP: /* arithmetic trap */
#ifdef DEV_NPX
ucode = npxtrap();
if (ucode == -1)
goto userout;
#else
ucode = 0;
#endif
i = SIGFPE;
break;
/*
* The following two traps can happen in
* vm86 mode, and, if so, we want to handle
* them specially.
*/
case T_PROTFLT: /* general protection fault */
case T_STKFLT: /* stack fault */
if (frame->tf_eflags & PSL_VM) {
i = vm86_emulate((struct vm86frame *)frame);
if (i == 0)
goto user;
break;
}
i = SIGBUS;
ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
break;
case T_SEGNPFLT: /* segment not present fault */
i = SIGBUS;
ucode = BUS_ADRERR;
break;
case T_TSSFLT: /* invalid TSS fault */
i = SIGBUS;
ucode = BUS_OBJERR;
break;
case T_DOUBLEFLT: /* double fault */
default:
i = SIGBUS;
ucode = BUS_OBJERR;
break;
case T_PAGEFLT: /* page fault */
i = trap_pfault(frame, TRUE, eva);
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
if (i == -2) {
/*
* The f00f hack workaround has triggered, so
* treat the fault as an illegal instruction
* (T_PRIVINFLT) instead of a page fault.
*/
type = frame->tf_trapno = T_PRIVINFLT;
/* Proceed as in that case. */
ucode = ILL_PRVOPC;
i = SIGILL;
break;
}
#endif
if (i == -1)
goto userout;
if (i == 0)
goto user;
if (i == SIGSEGV)
ucode = SEGV_MAPERR;
else {
if (prot_fault_translation == 0) {
/*
* Autodetect.
* This check also covers the images
* without the ABI-tag ELF note.
*/
if (SV_CURPROC_ABI() == SV_ABI_FREEBSD
&& p->p_osrel >= P_OSREL_SIGSEGV) {
i = SIGSEGV;
ucode = SEGV_ACCERR;
} else {
i = SIGBUS;
ucode = BUS_PAGE_FAULT;
}
} else if (prot_fault_translation == 1) {
/*
* Always compat mode.
*/
i = SIGBUS;
ucode = BUS_PAGE_FAULT;
} else {
/*
* Always SIGSEGV mode.
*/
i = SIGSEGV;
ucode = SEGV_ACCERR;
}
}
addr = eva;
break;
case T_DIVIDE: /* integer divide fault */
ucode = FPE_INTDIV;
i = SIGFPE;
break;
#ifdef DEV_ISA
case T_NMI:
#ifdef POWERFAIL_NMI
#ifndef TIMER_FREQ
# define TIMER_FREQ 1193182
#endif
if (time_second - lastalert > 10) {
log(LOG_WARNING, "NMI: power fail\n");
sysbeep(880, hz);
lastalert = time_second;
}
goto userout;
#else /* !POWERFAIL_NMI */
/* machine/parity/power fail/"kitchen sink" faults */
if (isa_nmi(code) == 0) {
#ifdef KDB
/*
* NMI can be hooked up to a pushbutton
* for debugging.
*/
if (kdb_on_nmi) {
printf ("NMI ... going to debugger\n");
kdb_trap(type, 0, frame);
}
#endif /* KDB */
goto userout;
} else if (panic_on_nmi)
panic("NMI indicates hardware failure");
break;
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
case T_OFLOW: /* integer overflow fault */
ucode = FPE_INTOVF;
i = SIGFPE;
break;
case T_BOUND: /* bounds check fault */
ucode = FPE_FLTSUB;
i = SIGFPE;
break;
case T_DNA:
#ifdef DEV_NPX
KASSERT(PCB_USER_FPU(td->td_pcb),
("kernel FPU ctx has leaked"));
/* transparent fault (due to context switch "late") */
if (npxdna())
goto userout;
#endif
uprintf("pid %d killed due to lack of floating point\n",
p->p_pid);
i = SIGKILL;
ucode = 0;
break;
case T_FPOPFLT: /* FPU operand fetch fault */
ucode = ILL_COPROC;
i = SIGILL;
break;
case T_XMMFLT: /* SIMD floating-point exception */
ucode = 0; /* XXX */
i = SIGFPE;
break;
}
} else {
/* kernel trap */
KASSERT(cold || td->td_ucred != NULL,
("kernel trap doesn't have ucred"));
switch (type) {
case T_PAGEFLT: /* page fault */
(void) trap_pfault(frame, FALSE, eva);
goto out;
case T_DNA:
#ifdef DEV_NPX
KASSERT(!PCB_USER_FPU(td->td_pcb),
("Unregistered use of FPU in kernel"));
if (npxdna())
goto out;
#endif
break;
case T_ARITHTRAP: /* arithmetic trap */
case T_XMMFLT: /* SIMD floating-point exception */
case T_FPOPFLT: /* FPU operand fetch fault */
/*
* XXXKIB for now disable any FPU traps in kernel;
* handler registration seems to be overkill
*/
trap_fatal(frame, 0);
goto out;
/*
* The following two traps can happen in
* vm86 mode, and, if so, we want to handle
* them specially.
*/
case T_PROTFLT: /* general protection fault */
case T_STKFLT: /* stack fault */
if (frame->tf_eflags & PSL_VM) {
i = vm86_emulate((struct vm86frame *)frame);
if (i != 0)
/*
* returns to original process
*/
vm86_trap((struct vm86frame *)frame);
goto out;
}
if (type == T_STKFLT)
break;
/* FALL THROUGH */
case T_SEGNPFLT: /* segment not present fault */
if (PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)
break;
/*
* Invalid %fs's and %gs's can be created using
* procfs or PT_SETREGS or by invalidating the
* underlying LDT entry. This causes a fault
* in kernel mode when the kernel attempts to
* switch contexts. Lose the bad context
* (XXX) so that we can continue, and generate
* a signal.
*/
if (frame->tf_eip == (int)cpu_switch_load_gs) {
PCPU_GET(curpcb)->pcb_gs = 0;
#if 0
PROC_LOCK(p);
- psignal(p, SIGBUS);
+ kern_psignal(p, SIGBUS);
PROC_UNLOCK(p);
#endif
goto out;
}
if (td->td_intr_nesting_level != 0)
break;
/*
* Invalid segment selectors and out of bounds
* %eip's and %esp's can be set up in user mode.
* This causes a fault in kernel mode when the
* kernel tries to return to user mode. We want
* to get this fault so that we can fix the
* problem here and not have to check all the
* selectors and pointers when the user changes
* them.
*/
if (frame->tf_eip == (int)doreti_iret) {
frame->tf_eip = (int)doreti_iret_fault;
goto out;
}
if (frame->tf_eip == (int)doreti_popl_ds) {
frame->tf_eip = (int)doreti_popl_ds_fault;
goto out;
}
if (frame->tf_eip == (int)doreti_popl_es) {
frame->tf_eip = (int)doreti_popl_es_fault;
goto out;
}
if (frame->tf_eip == (int)doreti_popl_fs) {
frame->tf_eip = (int)doreti_popl_fs_fault;
goto out;
}
if (PCPU_GET(curpcb)->pcb_onfault != NULL) {
frame->tf_eip =
(int)PCPU_GET(curpcb)->pcb_onfault;
goto out;
}
break;
case T_TSSFLT:
/*
* PSL_NT can be set in user mode and isn't cleared
* automatically when the kernel is entered. This
* causes a TSS fault when the kernel attempts to
* `iret' because the TSS link is uninitialized. We
* want to get this fault so that we can fix the
* problem here and not every time the kernel is
* entered.
*/
if (frame->tf_eflags & PSL_NT) {
frame->tf_eflags &= ~PSL_NT;
goto out;
}
break;
case T_TRCTRAP: /* trace trap */
if (frame->tf_eip == (int)IDTVEC(lcall_syscall)) {
/*
* We've just entered system mode via the
* syscall lcall. Continue single stepping
* silently until the syscall handler has
* saved the flags.
*/
goto out;
}
if (frame->tf_eip == (int)IDTVEC(lcall_syscall) + 1) {
/*
* The syscall handler has now saved the
* flags. Stop single stepping it.
*/
frame->tf_eflags &= ~PSL_T;
goto out;
}
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which
* can happen under several conditions such as
* if a user sets a watchpoint on a buffer and
* then passes that buffer to a system call.
* We still want to get TRCTRAPS for addresses
* in kernel space because that is useful when
* debugging the kernel.
*/
if (user_dbreg_trap() &&
!(PCPU_GET(curpcb)->pcb_flags & PCB_VM86CALL)) {
/*
* Reset breakpoint bits because the
* processor doesn't clear them itself.
*/
load_dr6(rdr6() & 0xfffffff0);
goto out;
}
/*
* FALLTHROUGH (TRCTRAP kernel mode, kernel address)
*/
case T_BPTFLT:
/*
* If KDB is enabled, let it handle the debugger trap.
* Otherwise, debugger traps "can't happen".
*/
#ifdef KDB
if (kdb_trap(type, 0, frame))
goto out;
#endif
break;
#ifdef DEV_ISA
case T_NMI:
#ifdef POWERFAIL_NMI
if (time_second - lastalert > 10) {
log(LOG_WARNING, "NMI: power fail\n");
sysbeep(880, hz);
lastalert = time_second;
}
goto out;
#else /* !POWERFAIL_NMI */
/* machine/parity/power fail/"kitchen sink" faults */
if (isa_nmi(code) == 0) {
#ifdef KDB
/*
* NMI can be hooked up to a pushbutton
* for debugging.
*/
if (kdb_on_nmi) {
printf ("NMI ... going to debugger\n");
kdb_trap(type, 0, frame);
}
#endif /* KDB */
goto out;
} else if (panic_on_nmi == 0)
goto out;
/* FALLTHROUGH */
#endif /* POWERFAIL_NMI */
#endif /* DEV_ISA */
}
trap_fatal(frame, eva);
goto out;
}
/* Translate fault for emulators (e.g. Linux) */
if (*p->p_sysent->sv_transtrap)
i = (*p->p_sysent->sv_transtrap)(i, type);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = i;
ksi.ksi_code = ucode;
ksi.ksi_addr = (void *)addr;
ksi.ksi_trapno = type;
trapsignal(td, &ksi);
#ifdef DEBUG
if (type <= MAX_TRAP_MSG) {
uprintf("fatal process exception: %s",
trap_msg[type]);
if ((type == T_PAGEFLT) || (type == T_PROTFLT))
uprintf(", fault VA = 0x%lx", (u_long)eva);
uprintf("\n");
}
#endif
user:
userret(td, frame);
mtx_assert(&Giant, MA_NOTOWNED);
KASSERT(PCB_USER_FPU(td->td_pcb),
("Return from trap with kernel FPU ctx leaked"));
userout:
out:
return;
}
static int
trap_pfault(frame, usermode, eva)
struct trapframe *frame;
int usermode;
vm_offset_t eva;
{
vm_offset_t va;
struct vmspace *vm = NULL;
vm_map_t map;
int rv = 0;
vm_prot_t ftype;
struct thread *td = curthread;
struct proc *p = td->td_proc;
va = trunc_page(eva);
if (va >= KERNBASE) {
/*
* Don't allow user-mode faults in kernel address space.
* An exception: if the faulting address is the invalid
* instruction entry in the IDT, then the Intel Pentium
* F00F bug workaround was triggered, and we need to
* treat it as an illegal instruction, and not a page
* fault.
*/
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
if ((eva == (unsigned int)&idt[6]) && has_f00f_bug)
return -2;
#endif
if (usermode)
goto nogo;
map = kernel_map;
} else {
/*
* This is a fault on non-kernel virtual memory.
* vm is initialized above to NULL. If curproc is NULL
* or curproc->p_vmspace is NULL the fault is fatal.
*/
if (p != NULL)
vm = p->p_vmspace;
if (vm == NULL)
goto nogo;
map = &vm->vm_map;
}
/*
* PGEX_I is defined only if the execute disable bit capability is
* supported and enabled.
*/
if (frame->tf_err & PGEX_W)
ftype = VM_PROT_WRITE;
#ifdef PAE
else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
ftype = VM_PROT_EXECUTE;
#endif
else
ftype = VM_PROT_READ;
if (map != kernel_map) {
/*
* Keep swapout from messing with us during this
* critical time.
*/
PROC_LOCK(p);
++p->p_lock;
PROC_UNLOCK(p);
/* Fault in the user page: */
rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
PROC_LOCK(p);
--p->p_lock;
PROC_UNLOCK(p);
} else {
/*
* Don't have to worry about process locking or stacks in the
* kernel.
*/
rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
}
if (rv == KERN_SUCCESS)
return (0);
nogo:
if (!usermode) {
if (td->td_intr_nesting_level == 0 &&
PCPU_GET(curpcb)->pcb_onfault != NULL) {
frame->tf_eip = (int)PCPU_GET(curpcb)->pcb_onfault;
return (0);
}
trap_fatal(frame, eva);
return (-1);
}
return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
}
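/*
 * Sketch of the pcb_onfault protocol consumed in the nogo: path above.
 * The real users (copyin(), copyout(), fubyte(), ...) live in assembly in
 * support.s; this C-level paraphrase is an assumption written only to show
 * the idea: arm a recovery address before touching user memory, and let a
 * faulting access resume there with an error instead of panicking.
 */
#if 0
static int
copyin_like(const void *uaddr, void *kaddr, size_t len)
{
	struct pcb *pcb = PCPU_GET(curpcb);

	pcb->pcb_onfault = (caddr_t)&&fault;	/* trap_pfault() will point tf_eip here */
	bcopy(uaddr, kaddr, len);		/* may fault on a bad user address */
	pcb->pcb_onfault = NULL;
	return (0);
fault:						/* reached only via the fixup above */
	pcb->pcb_onfault = NULL;
	return (EFAULT);
}
#endif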
static void
trap_fatal(frame, eva)
struct trapframe *frame;
vm_offset_t eva;
{
int code, ss, esp;
u_int type;
struct soft_segment_descriptor softseg;
char *msg;
code = frame->tf_err;
type = frame->tf_trapno;
sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
if (type <= MAX_TRAP_MSG)
msg = trap_msg[type];
else
msg = "UNKNOWN";
printf("\n\nFatal trap %d: %s while in %s mode\n", type, msg,
frame->tf_eflags & PSL_VM ? "vm86" :
ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
#ifdef SMP
/* two separate prints in case of a trap on an unmapped page */
printf("cpuid = %d; ", PCPU_GET(cpuid));
printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
if (type == T_PAGEFLT) {
printf("fault virtual address = 0x%x\n", eva);
printf("fault code = %s %s, %s\n",
code & PGEX_U ? "user" : "supervisor",
code & PGEX_W ? "write" : "read",
code & PGEX_P ? "protection violation" : "page not present");
}
printf("instruction pointer = 0x%x:0x%x\n",
frame->tf_cs & 0xffff, frame->tf_eip);
if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) {
ss = frame->tf_ss & 0xffff;
esp = frame->tf_esp;
} else {
ss = GSEL(GDATA_SEL, SEL_KPL);
esp = (int)&frame->tf_esp;
}
printf("stack pointer = 0x%x:0x%x\n", ss, esp);
printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
softseg.ssd_gran);
printf("processor eflags = ");
if (frame->tf_eflags & PSL_T)
printf("trace trap, ");
if (frame->tf_eflags & PSL_I)
printf("interrupt enabled, ");
if (frame->tf_eflags & PSL_NT)
printf("nested task, ");
if (frame->tf_eflags & PSL_RF)
printf("resume, ");
if (frame->tf_eflags & PSL_VM)
printf("vm86, ");
printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
printf("current process = ");
if (curproc) {
printf("%lu (%s)\n", (u_long)curproc->p_pid, curthread->td_name);
} else {
printf("Idle\n");
}
#ifdef KDB
if (debugger_on_panic || kdb_active) {
frame->tf_err = eva; /* smuggle fault address to ddb */
if (kdb_trap(type, 0, frame)) {
frame->tf_err = code; /* restore error code */
return;
}
frame->tf_err = code; /* restore error code */
}
#endif
printf("trap number = %d\n", type);
if (type <= MAX_TRAP_MSG)
panic("%s", trap_msg[type]);
else
panic("unknown/reserved trap");
}
/*
* Double fault handler. Called when a fault occurs while writing
* a frame for a trap/exception onto the stack. This usually occurs
* when the stack overflows (such is the case with infinite recursion,
* for example).
*
* XXX Note that the current PTD gets replaced by IdlePTD when the
* task switch occurs. This means that the stack that was active at
* the time of the double fault is not available at <kstack> unless
* the machine was idle when the double fault occurred. The downside
* of this is that "trace <ebp>" in ddb won't work.
*/
void
dblfault_handler()
{
#ifdef KDTRACE_HOOKS
if (dtrace_doubletrap_func != NULL)
(*dtrace_doubletrap_func)();
#endif
printf("\nFatal double fault:\n");
printf("eip = 0x%x\n", PCPU_GET(common_tss.tss_eip));
printf("esp = 0x%x\n", PCPU_GET(common_tss.tss_esp));
printf("ebp = 0x%x\n", PCPU_GET(common_tss.tss_ebp));
#ifdef SMP
/* two separate prints in case of a trap on an unmapped page */
printf("cpuid = %d; ", PCPU_GET(cpuid));
printf("apic id = %02x\n", PCPU_GET(apic_id));
#endif
panic("double fault");
}
int
cpu_fetch_syscall_args(struct thread *td, struct syscall_args *sa)
{
struct proc *p;
struct trapframe *frame;
caddr_t params;
int error;
p = td->td_proc;
frame = td->td_frame;
params = (caddr_t)frame->tf_esp + sizeof(int);
sa->code = frame->tf_eax;
/*
* Need to check if this is a 32 bit or 64 bit syscall.
*/
if (sa->code == SYS_syscall) {
/*
* Code is first argument, followed by actual args.
*/
sa->code = fuword(params);
params += sizeof(int);
} else if (sa->code == SYS___syscall) {
/*
* Like syscall, but code is a quad, so as to maintain
* quad alignment for the rest of the arguments.
*/
sa->code = fuword(params);
params += sizeof(quad_t);
}
if (p->p_sysent->sv_mask)
sa->code &= p->p_sysent->sv_mask;
if (sa->code >= p->p_sysent->sv_size)
sa->callp = &p->p_sysent->sv_table[0];
else
sa->callp = &p->p_sysent->sv_table[sa->code];
sa->narg = sa->callp->sy_narg;
if (params != NULL && sa->narg != 0)
error = copyin(params, (caddr_t)sa->args,
(u_int)(sa->narg * sizeof(int)));
else
error = 0;
if (error == 0) {
td->td_retval[0] = 0;
td->td_retval[1] = frame->tf_edx;
}
return (error);
}
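/*
 * A minimal userland sketch of the convention cpu_fetch_syscall_args()
 * decodes: the syscall number travels in %eax and the arguments sit on the
 * user stack just above the return address (which is why params starts at
 * tf_esp + sizeof(int)).  Written as an assumption for illustration only;
 * real programs use the libc stubs, and error reporting via the carry flag
 * is omitted here.
 */
#if 0	/* userland example only */
#include <sys/syscall.h>

static int
raw_getpid(void)
{
	int pid;

	__asm__ __volatile__(
	    "int $0x80"		/* enter syscall(); getpid() takes no stack arguments */
	    : "=a" (pid)
	    : "0" (SYS_getpid)
	    : "memory", "cc");
	return (pid);
}
#endif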
#include "../../kern/subr_syscall.c"
/*
* syscall - system call request C handler
*
* A system call is essentially treated as a trap.
*/
void
syscall(struct trapframe *frame)
{
struct thread *td;
struct syscall_args sa;
register_t orig_tf_eflags;
int error;
ksiginfo_t ksi;
#ifdef DIAGNOSTIC
if (ISPL(frame->tf_cs) != SEL_UPL) {
panic("syscall");
/* NOT REACHED */
}
#endif
orig_tf_eflags = frame->tf_eflags;
td = curthread;
td->td_frame = frame;
error = syscallenter(td, &sa);
/*
* Traced syscall.
*/
if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
frame->tf_eflags &= ~PSL_T;
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGTRAP;
ksi.ksi_code = TRAP_TRACE;
ksi.ksi_addr = (void *)frame->tf_eip;
trapsignal(td, &ksi);
}
KASSERT(PCB_USER_FPU(td->td_pcb),
("System call %s returning with kernel FPU ctx leaked",
syscallname(td->td_proc, sa.code)));
KASSERT(td->td_pcb->pcb_save == &td->td_pcb->pcb_user_save,
("System call %s returning with mangled pcb_save",
syscallname(td->td_proc, sa.code)));
syscallret(td, error, &sa);
}
Index: head/sys/i386/ibcs2/ibcs2_ioctl.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_ioctl.c (revision 225617)
@@ -1,687 +1,687 @@
/* $NetBSD: ibcs2_ioctl.c,v 1.6 1995/03/14 15:12:28 scottb Exp $ */
/*-
* Copyright (c) 1994, 1995 Scott Bartram
* All rights reserved.
*
* based on compat/sunos/sun_ioctl.c
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/consio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kbio.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_socksys.h>
#include <i386/ibcs2/ibcs2_stropts.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_termios.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_ioctl.h>
static void stios2btios(struct ibcs2_termios *, struct termios *);
static void btios2stios(struct termios *, struct ibcs2_termios *);
static void stios2stio(struct ibcs2_termios *, struct ibcs2_termio *);
static void stio2stios(struct ibcs2_termio *, struct ibcs2_termios *);
/*
* iBCS2 ioctl calls.
*/
struct speedtab {
int sp_speed; /* Speed. */
int sp_code; /* Code. */
};
static struct speedtab sptab[] = {
{ 0, 0 },
{ 50, 1 },
{ 75, 2 },
{ 110, 3 },
{ 134, 4 },
{ 135, 4 },
{ 150, 5 },
{ 200, 6 },
{ 300, 7 },
{ 600, 8 },
{ 1200, 9 },
{ 1800, 10 },
{ 2400, 11 },
{ 4800, 12 },
{ 9600, 13 },
{ 19200, 14 },
{ 38400, 15 },
{ -1, -1 }
};
static u_long s2btab[] = {
0,
50,
75,
110,
134,
150,
200,
300,
600,
1200,
1800,
2400,
4800,
9600,
19200,
38400,
};
static int
ttspeedtab(int speed, struct speedtab *table)
{
for ( ; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return (-1);
}
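/*
 * Example of the two tables working together (illustrative only):
 * ttspeedtab(9600, sptab) returns 13, and s2btab[13] maps back to 9600;
 * a speed with no table entry yields -1.
 */
#if 0
static void
example_speed_mapping(void)
{
	int code = ttspeedtab(9600, sptab);		/* -> 13 */
	u_long speed = (code >= 0) ? s2btab[code] : 0;	/* -> 9600 */
}
#endif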
static void
stios2btios(st, bt)
struct ibcs2_termios *st;
struct termios *bt;
{
register u_long l, r;
l = st->c_iflag; r = 0;
if (l & IBCS2_IGNBRK) r |= IGNBRK;
if (l & IBCS2_BRKINT) r |= BRKINT;
if (l & IBCS2_IGNPAR) r |= IGNPAR;
if (l & IBCS2_PARMRK) r |= PARMRK;
if (l & IBCS2_INPCK) r |= INPCK;
if (l & IBCS2_ISTRIP) r |= ISTRIP;
if (l & IBCS2_INLCR) r |= INLCR;
if (l & IBCS2_IGNCR) r |= IGNCR;
if (l & IBCS2_ICRNL) r |= ICRNL;
if (l & IBCS2_IXON) r |= IXON;
if (l & IBCS2_IXANY) r |= IXANY;
if (l & IBCS2_IXOFF) r |= IXOFF;
if (l & IBCS2_IMAXBEL) r |= IMAXBEL;
bt->c_iflag = r;
l = st->c_oflag; r = 0;
if (l & IBCS2_OPOST) r |= OPOST;
if (l & IBCS2_ONLCR) r |= ONLCR;
if (l & IBCS2_TAB3) r |= TAB3;
bt->c_oflag = r;
l = st->c_cflag; r = 0;
switch (l & IBCS2_CSIZE) {
case IBCS2_CS5: r |= CS5; break;
case IBCS2_CS6: r |= CS6; break;
case IBCS2_CS7: r |= CS7; break;
case IBCS2_CS8: r |= CS8; break;
}
if (l & IBCS2_CSTOPB) r |= CSTOPB;
if (l & IBCS2_CREAD) r |= CREAD;
if (l & IBCS2_PARENB) r |= PARENB;
if (l & IBCS2_PARODD) r |= PARODD;
if (l & IBCS2_HUPCL) r |= HUPCL;
if (l & IBCS2_CLOCAL) r |= CLOCAL;
bt->c_cflag = r;
bt->c_ispeed = bt->c_ospeed = s2btab[l & 0x0000000f];
l = st->c_lflag; r = 0;
if (l & IBCS2_ISIG) r |= ISIG;
if (l & IBCS2_ICANON) r |= ICANON;
if (l & IBCS2_ECHO) r |= ECHO;
if (l & IBCS2_ECHOE) r |= ECHOE;
if (l & IBCS2_ECHOK) r |= ECHOK;
if (l & IBCS2_ECHONL) r |= ECHONL;
if (l & IBCS2_NOFLSH) r |= NOFLSH;
if (l & IBCS2_TOSTOP) r |= TOSTOP;
bt->c_lflag = r;
bt->c_cc[VINTR] =
st->c_cc[IBCS2_VINTR] ? st->c_cc[IBCS2_VINTR] : _POSIX_VDISABLE;
bt->c_cc[VQUIT] =
st->c_cc[IBCS2_VQUIT] ? st->c_cc[IBCS2_VQUIT] : _POSIX_VDISABLE;
bt->c_cc[VERASE] =
st->c_cc[IBCS2_VERASE] ? st->c_cc[IBCS2_VERASE] : _POSIX_VDISABLE;
bt->c_cc[VKILL] =
st->c_cc[IBCS2_VKILL] ? st->c_cc[IBCS2_VKILL] : _POSIX_VDISABLE;
if (bt->c_lflag & ICANON) {
bt->c_cc[VEOF] =
st->c_cc[IBCS2_VEOF] ? st->c_cc[IBCS2_VEOF] : _POSIX_VDISABLE;
bt->c_cc[VEOL] =
st->c_cc[IBCS2_VEOL] ? st->c_cc[IBCS2_VEOL] : _POSIX_VDISABLE;
} else {
bt->c_cc[VMIN] = st->c_cc[IBCS2_VMIN];
bt->c_cc[VTIME] = st->c_cc[IBCS2_VTIME];
}
bt->c_cc[VEOL2] =
st->c_cc[IBCS2_VEOL2] ? st->c_cc[IBCS2_VEOL2] : _POSIX_VDISABLE;
#if 0
bt->c_cc[VSWTCH] =
st->c_cc[IBCS2_VSWTCH] ? st->c_cc[IBCS2_VSWTCH] : _POSIX_VDISABLE;
#endif
bt->c_cc[VSTART] =
st->c_cc[IBCS2_VSTART] ? st->c_cc[IBCS2_VSTART] : _POSIX_VDISABLE;
bt->c_cc[VSTOP] =
st->c_cc[IBCS2_VSTOP] ? st->c_cc[IBCS2_VSTOP] : _POSIX_VDISABLE;
bt->c_cc[VSUSP] =
st->c_cc[IBCS2_VSUSP] ? st->c_cc[IBCS2_VSUSP] : _POSIX_VDISABLE;
bt->c_cc[VDSUSP] = _POSIX_VDISABLE;
bt->c_cc[VREPRINT] = _POSIX_VDISABLE;
bt->c_cc[VDISCARD] = _POSIX_VDISABLE;
bt->c_cc[VWERASE] = _POSIX_VDISABLE;
bt->c_cc[VLNEXT] = _POSIX_VDISABLE;
bt->c_cc[VSTATUS] = _POSIX_VDISABLE;
}
static void
btios2stios(bt, st)
struct termios *bt;
struct ibcs2_termios *st;
{
register u_long l, r;
l = bt->c_iflag; r = 0;
if (l & IGNBRK) r |= IBCS2_IGNBRK;
if (l & BRKINT) r |= IBCS2_BRKINT;
if (l & IGNPAR) r |= IBCS2_IGNPAR;
if (l & PARMRK) r |= IBCS2_PARMRK;
if (l & INPCK) r |= IBCS2_INPCK;
if (l & ISTRIP) r |= IBCS2_ISTRIP;
if (l & INLCR) r |= IBCS2_INLCR;
if (l & IGNCR) r |= IBCS2_IGNCR;
if (l & ICRNL) r |= IBCS2_ICRNL;
if (l & IXON) r |= IBCS2_IXON;
if (l & IXANY) r |= IBCS2_IXANY;
if (l & IXOFF) r |= IBCS2_IXOFF;
if (l & IMAXBEL) r |= IBCS2_IMAXBEL;
st->c_iflag = r;
l = bt->c_oflag; r = 0;
if (l & OPOST) r |= IBCS2_OPOST;
if (l & ONLCR) r |= IBCS2_ONLCR;
if (l & TAB3) r |= IBCS2_TAB3;
st->c_oflag = r;
l = bt->c_cflag; r = 0;
switch (l & CSIZE) {
case CS5: r |= IBCS2_CS5; break;
case CS6: r |= IBCS2_CS6; break;
case CS7: r |= IBCS2_CS7; break;
case CS8: r |= IBCS2_CS8; break;
}
if (l & CSTOPB) r |= IBCS2_CSTOPB;
if (l & CREAD) r |= IBCS2_CREAD;
if (l & PARENB) r |= IBCS2_PARENB;
if (l & PARODD) r |= IBCS2_PARODD;
if (l & HUPCL) r |= IBCS2_HUPCL;
if (l & CLOCAL) r |= IBCS2_CLOCAL;
st->c_cflag = r;
l = bt->c_lflag; r = 0;
if (l & ISIG) r |= IBCS2_ISIG;
if (l & ICANON) r |= IBCS2_ICANON;
if (l & ECHO) r |= IBCS2_ECHO;
if (l & ECHOE) r |= IBCS2_ECHOE;
if (l & ECHOK) r |= IBCS2_ECHOK;
if (l & ECHONL) r |= IBCS2_ECHONL;
if (l & NOFLSH) r |= IBCS2_NOFLSH;
if (l & TOSTOP) r |= IBCS2_TOSTOP;
st->c_lflag = r;
l = ttspeedtab(bt->c_ospeed, sptab);
if ((int)l >= 0)
st->c_cflag |= l;
st->c_cc[IBCS2_VINTR] =
bt->c_cc[VINTR] != _POSIX_VDISABLE ? bt->c_cc[VINTR] : 0;
st->c_cc[IBCS2_VQUIT] =
bt->c_cc[VQUIT] != _POSIX_VDISABLE ? bt->c_cc[VQUIT] : 0;
st->c_cc[IBCS2_VERASE] =
bt->c_cc[VERASE] != _POSIX_VDISABLE ? bt->c_cc[VERASE] : 0;
st->c_cc[IBCS2_VKILL] =
bt->c_cc[VKILL] != _POSIX_VDISABLE ? bt->c_cc[VKILL] : 0;
if (bt->c_lflag & ICANON) {
st->c_cc[IBCS2_VEOF] =
bt->c_cc[VEOF] != _POSIX_VDISABLE ? bt->c_cc[VEOF] : 0;
st->c_cc[IBCS2_VEOL] =
bt->c_cc[VEOL] != _POSIX_VDISABLE ? bt->c_cc[VEOL] : 0;
} else {
st->c_cc[IBCS2_VMIN] = bt->c_cc[VMIN];
st->c_cc[IBCS2_VTIME] = bt->c_cc[VTIME];
}
st->c_cc[IBCS2_VEOL2] =
bt->c_cc[VEOL2] != _POSIX_VDISABLE ? bt->c_cc[VEOL2] : 0;
st->c_cc[IBCS2_VSWTCH] =
0;
st->c_cc[IBCS2_VSUSP] =
bt->c_cc[VSUSP] != _POSIX_VDISABLE ? bt->c_cc[VSUSP] : 0;
st->c_cc[IBCS2_VSTART] =
bt->c_cc[VSTART] != _POSIX_VDISABLE ? bt->c_cc[VSTART] : 0;
st->c_cc[IBCS2_VSTOP] =
bt->c_cc[VSTOP] != _POSIX_VDISABLE ? bt->c_cc[VSTOP] : 0;
st->c_line = 0;
}
static void
stios2stio(ts, t)
struct ibcs2_termios *ts;
struct ibcs2_termio *t;
{
t->c_iflag = ts->c_iflag;
t->c_oflag = ts->c_oflag;
t->c_cflag = ts->c_cflag;
t->c_lflag = ts->c_lflag;
t->c_line = ts->c_line;
bcopy(ts->c_cc, t->c_cc, IBCS2_NCC);
}
static void
stio2stios(t, ts)
struct ibcs2_termio *t;
struct ibcs2_termios *ts;
{
ts->c_iflag = t->c_iflag;
ts->c_oflag = t->c_oflag;
ts->c_cflag = t->c_cflag;
ts->c_lflag = t->c_lflag;
ts->c_line = t->c_line;
bcopy(t->c_cc, ts->c_cc, IBCS2_NCC);
}
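/*
 * Taken together, the four converters above implement the get/modify/set
 * pattern used by the IBCS2_TCSETA* cases below.  A condensed sketch of
 * that path follows (illustrative only; error handling is omitted and the
 * function name is made up):
 */
#if 0
static int
example_tcseta_path(struct file *fp, struct ibcs2_termio *st, struct thread *td)
{
	struct termios bts;
	struct ibcs2_termios sts;

	fo_ioctl(fp, TIOCGETA, (caddr_t)&bts, td->td_ucred, td);
	btios2stios(&bts, &sts);	/* native termios -> iBCS2 termios */
	stio2stios(st, &sts);		/* overlay the caller's smaller termio */
	stios2btios(&sts, &bts);	/* and convert back to native */
	return (fo_ioctl(fp, TIOCSETA, (caddr_t)&bts, td->td_ucred, td));
}
#endif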
int
ibcs2_ioctl(td, uap)
struct thread *td;
struct ibcs2_ioctl_args *uap;
{
struct proc *p = td->td_proc;
struct file *fp;
int error;
if ((error = fget(td, uap->fd, CAP_IOCTL, &fp)) != 0) {
DPRINTF(("ibcs2_ioctl(%d): bad fd %d ", p->p_pid,
uap->fd));
return EBADF;
}
if ((fp->f_flag & (FREAD|FWRITE)) == 0) {
fdrop(fp, td);
DPRINTF(("ibcs2_ioctl(%d): bad fp flag ", p->p_pid));
return EBADF;
}
switch (uap->cmd) {
case IBCS2_TCGETA:
case IBCS2_XCGETA:
case IBCS2_OXCGETA:
{
struct termios bts;
struct ibcs2_termios sts;
struct ibcs2_termio st;
if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts,
td->td_ucred, td)) != 0)
break;
btios2stios (&bts, &sts);
if (uap->cmd == IBCS2_TCGETA) {
stios2stio (&sts, &st);
error = copyout((caddr_t)&st, uap->data,
sizeof (st));
#ifdef DEBUG_IBCS2
if (error)
DPRINTF(("ibcs2_ioctl(%d): copyout failed ",
p->p_pid));
#endif
break;
} else {
error = copyout((caddr_t)&sts, uap->data,
sizeof (sts));
break;
}
/*NOTREACHED*/
}
case IBCS2_TCSETA:
case IBCS2_TCSETAW:
case IBCS2_TCSETAF:
{
struct termios bts;
struct ibcs2_termios sts;
struct ibcs2_termio st;
if ((error = copyin(uap->data, (caddr_t)&st,
sizeof(st))) != 0) {
DPRINTF(("ibcs2_ioctl(%d): TCSET copyin failed ",
p->p_pid));
break;
}
/* get full BSD termios so we don't lose information */
if ((error = fo_ioctl(fp, TIOCGETA, (caddr_t)&bts,
td->td_ucred, td)) != 0) {
DPRINTF(("ibcs2_ioctl(%d): TCSET ctl failed fd %d ",
p->p_pid, uap->fd));
break;
}
/*
* convert to iBCS2 termios, copy in information from
* termio, and convert back, then set new values.
*/
btios2stios(&bts, &sts);
stio2stios(&st, &sts);
stios2btios(&sts, &bts);
error = fo_ioctl(fp, uap->cmd - IBCS2_TCSETA + TIOCSETA,
(caddr_t)&bts, td->td_ucred, td);
break;
}
case IBCS2_XCSETA:
case IBCS2_XCSETAW:
case IBCS2_XCSETAF:
{
struct termios bts;
struct ibcs2_termios sts;
if ((error = copyin(uap->data, (caddr_t)&sts,
sizeof (sts))) != 0)
break;
stios2btios (&sts, &bts);
error = fo_ioctl(fp, uap->cmd - IBCS2_XCSETA + TIOCSETA,
(caddr_t)&bts, td->td_ucred, td);
break;
}
case IBCS2_OXCSETA:
case IBCS2_OXCSETAW:
case IBCS2_OXCSETAF:
{
struct termios bts;
struct ibcs2_termios sts;
if ((error = copyin(uap->data, (caddr_t)&sts,
sizeof (sts))) != 0)
break;
stios2btios (&sts, &bts);
error = fo_ioctl(fp, uap->cmd - IBCS2_OXCSETA + TIOCSETA,
(caddr_t)&bts, td->td_ucred, td);
break;
}
case IBCS2_TCSBRK:
DPRINTF(("ibcs2_ioctl(%d): TCSBRK ", p->p_pid));
error = ENOSYS;
break;
case IBCS2_TCXONC:
{
switch ((int)uap->data) {
case 0:
case 1:
DPRINTF(("ibcs2_ioctl(%d): TCXONC ", p->p_pid));
error = ENOSYS;
break;
case 2:
error = fo_ioctl(fp, TIOCSTOP, (caddr_t)0,
td->td_ucred, td);
break;
case 3:
error = fo_ioctl(fp, TIOCSTART, (caddr_t)1,
td->td_ucred, td);
break;
default:
error = EINVAL;
break;
}
break;
}
case IBCS2_TCFLSH:
{
int arg;
switch ((int)uap->data) {
case 0:
arg = FREAD;
break;
case 1:
arg = FWRITE;
break;
case 2:
arg = FREAD | FWRITE;
break;
default:
fdrop(fp, td);
return EINVAL;
}
error = fo_ioctl(fp, TIOCFLUSH, (caddr_t)&arg, td->td_ucred,
td);
break;
}
case IBCS2_TIOCGWINSZ:
uap->cmd = TIOCGWINSZ;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_TIOCSWINSZ:
uap->cmd = TIOCSWINSZ;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_TIOCGPGRP:
{
pid_t pg_id;
PROC_LOCK(p);
pg_id = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
error = copyout((caddr_t)&pg_id, uap->data,
sizeof(pg_id));
break;
}
case IBCS2_TIOCSPGRP: /* XXX - is uap->data a pointer to pgid? */
{
struct setpgid_args sa;
sa.pid = 0;
sa.pgid = (int)uap->data;
- error = setpgid(td, &sa);
+ error = sys_setpgid(td, &sa);
break;
}
case IBCS2_TCGETSC: /* SCO console - get scancode flags */
error = EINTR; /* ENOSYS; */
break;
case IBCS2_TCSETSC: /* SCO console - set scancode flags */
error = 0; /* ENOSYS; */
break;
case IBCS2_JWINSIZE: /* Unix to Jerq I/O control */
{
struct ibcs2_jwinsize {
char bytex, bytey;
short bitx, bity;
} ibcs2_jwinsize;
PROC_LOCK(p);
SESS_LOCK(p->p_session);
ibcs2_jwinsize.bytex = 80;
/* p->p_session->s_ttyp->t_winsize.ws_col; XXX */
ibcs2_jwinsize.bytey = 25;
/* p->p_session->s_ttyp->t_winsize.ws_row; XXX */
ibcs2_jwinsize.bitx =
p->p_session->s_ttyp->t_winsize.ws_xpixel;
ibcs2_jwinsize.bity =
p->p_session->s_ttyp->t_winsize.ws_ypixel;
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
error = copyout((caddr_t)&ibcs2_jwinsize, uap->data,
sizeof(ibcs2_jwinsize));
break;
}
/* keyboard and display ioctl's -- type 'K' */
case IBCS2_KDGKBMODE: /* get keyboard translation mode */
uap->cmd = KDGKBMODE;
/* printf("ioctl KDGKBMODE = %x\n", uap->cmd);*/
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSKBMODE: /* set keyboard translation mode */
uap->cmd = KDSKBMODE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDMKTONE: /* sound tone */
uap->cmd = KDMKTONE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGETMODE: /* get text/graphics mode */
uap->cmd = KDGETMODE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSETMODE: /* set text/graphics mode */
uap->cmd = KDSETMODE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSBORDER: /* set ega color border */
uap->cmd = KDSBORDER;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGKBSTATE:
uap->cmd = KDGKBSTATE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSETRAD:
uap->cmd = KDSETRAD;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDENABIO: /* enable direct I/O to ports */
uap->cmd = KDENABIO;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDDISABIO: /* disable direct I/O to ports */
uap->cmd = KDDISABIO;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KIOCSOUND: /* start sound generation */
uap->cmd = KIOCSOUND;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGKBTYPE: /* get keyboard type */
uap->cmd = KDGKBTYPE;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDGETLED: /* get keyboard LED status */
uap->cmd = KDGETLED;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_KDSETLED: /* set keyboard LED status */
uap->cmd = KDSETLED;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
/* Xenix keyboard and display ioctl's from sys/kd.h -- type 'k' */
case IBCS2_GETFKEY: /* Get function key */
uap->cmd = GETFKEY;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_SETFKEY: /* Set function key */
uap->cmd = SETFKEY;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_GIO_SCRNMAP: /* Get screen output map table */
uap->cmd = GIO_SCRNMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_PIO_SCRNMAP: /* Set screen output map table */
uap->cmd = PIO_SCRNMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_GIO_KEYMAP: /* Get keyboard map table */
uap->cmd = OGIO_KEYMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
case IBCS2_PIO_KEYMAP: /* Set keyboard map table */
uap->cmd = OPIO_KEYMAP;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
/* socksys */
case IBCS2_SIOCSOCKSYS:
error = ibcs2_socksys(td, (struct ibcs2_socksys_args *)uap);
break;
case IBCS2_FIONREAD:
case IBCS2_I_NREAD: /* STREAMS */
uap->cmd = FIONREAD;
- error = ioctl(td, (struct ioctl_args *)uap);
+ error = sys_ioctl(td, (struct ioctl_args *)uap);
break;
default:
DPRINTF(("ibcs2_ioctl(%d): unknown cmd 0x%lx ",
p->p_pid, uap->cmd));
error = ENOSYS;
break;
}
fdrop(fp, td);
return error;
}
Index: head/sys/i386/ibcs2/ibcs2_ipc.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_ipc.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_ipc.c (revision 225617)
@@ -1,560 +1,560 @@
/*-
* Copyright (c) 1995 Scott Bartram
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/msg.h>
#include <sys/sem.h>
#include <sys/shm.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_ipc.h>
#define IBCS2_IPC_RMID 0
#define IBCS2_IPC_SET 1
#define IBCS2_IPC_STAT 2
#define IBCS2_SETVAL 8
static void cvt_msqid2imsqid(struct msqid_ds *, struct ibcs2_msqid_ds *);
static void cvt_imsqid2msqid(struct ibcs2_msqid_ds *, struct msqid_ds *);
#ifdef unused
static void cvt_sem2isem(struct sem *, struct ibcs2_sem *);
static void cvt_isem2sem(struct ibcs2_sem *, struct sem *);
#endif
static void cvt_semid2isemid(struct semid_ds *, struct ibcs2_semid_ds *);
static void cvt_isemid2semid(struct ibcs2_semid_ds *, struct semid_ds *);
static void cvt_shmid2ishmid(struct shmid_ds *, struct ibcs2_shmid_ds *);
static void cvt_ishmid2shmid(struct ibcs2_shmid_ds *, struct shmid_ds *);
static void cvt_perm2iperm(struct ipc_perm *, struct ibcs2_ipc_perm *);
static void cvt_iperm2perm(struct ibcs2_ipc_perm *, struct ipc_perm *);
/*
* iBCS2 msgsys call
*/
static void
cvt_msqid2imsqid(bp, ibp)
struct msqid_ds *bp;
struct ibcs2_msqid_ds *ibp;
{
cvt_perm2iperm(&bp->msg_perm, &ibp->msg_perm);
ibp->msg_first = bp->msg_first;
ibp->msg_last = bp->msg_last;
ibp->msg_cbytes = (u_short)bp->msg_cbytes;
ibp->msg_qnum = (u_short)bp->msg_qnum;
ibp->msg_qbytes = (u_short)bp->msg_qbytes;
ibp->msg_lspid = (u_short)bp->msg_lspid;
ibp->msg_lrpid = (u_short)bp->msg_lrpid;
ibp->msg_stime = bp->msg_stime;
ibp->msg_rtime = bp->msg_rtime;
ibp->msg_ctime = bp->msg_ctime;
return;
}
static void
cvt_imsqid2msqid(ibp, bp)
struct ibcs2_msqid_ds *ibp;
struct msqid_ds *bp;
{
cvt_iperm2perm(&ibp->msg_perm, &bp->msg_perm);
bp->msg_first = ibp->msg_first;
bp->msg_last = ibp->msg_last;
bp->msg_cbytes = ibp->msg_cbytes;
bp->msg_qnum = ibp->msg_qnum;
bp->msg_qbytes = ibp->msg_qbytes;
bp->msg_lspid = ibp->msg_lspid;
bp->msg_lrpid = ibp->msg_lrpid;
bp->msg_stime = ibp->msg_stime;
bp->msg_rtime = ibp->msg_rtime;
bp->msg_ctime = ibp->msg_ctime;
return;
}
struct ibcs2_msgget_args {
int what;
ibcs2_key_t key;
int msgflg;
};
static int
ibcs2_msgget(struct thread *td, void *v)
{
struct ibcs2_msgget_args *uap = v;
struct msgget_args ap;
ap.key = uap->key;
ap.msgflg = uap->msgflg;
- return msgget(td, &ap);
+ return sys_msgget(td, &ap);
}
struct ibcs2_msgctl_args {
int what;
int msqid;
int cmd;
struct ibcs2_msqid_ds *buf;
};
static int
ibcs2_msgctl(struct thread *td, void *v)
{
struct ibcs2_msgctl_args *uap = v;
struct ibcs2_msqid_ds is;
struct msqid_ds bs;
int error;
switch (uap->cmd) {
case IBCS2_IPC_STAT:
error = kern_msgctl(td, uap->msqid, IPC_STAT, &bs);
if (!error) {
cvt_msqid2imsqid(&bs, &is);
error = copyout(&is, uap->buf, sizeof(is));
}
return (error);
case IBCS2_IPC_SET:
error = copyin(uap->buf, &is, sizeof(is));
if (error)
return (error);
cvt_imsqid2msqid(&is, &bs);
return (kern_msgctl(td, uap->msqid, IPC_SET, &bs));
case IBCS2_IPC_RMID:
return (kern_msgctl(td, uap->msqid, IPC_RMID, NULL));
}
return (EINVAL);
}
struct ibcs2_msgrcv_args {
int what;
int msqid;
void *msgp;
size_t msgsz;
long msgtyp;
int msgflg;
};
static int
ibcs2_msgrcv(struct thread *td, void *v)
{
struct ibcs2_msgrcv_args *uap = v;
struct msgrcv_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgtyp = uap->msgtyp;
ap.msgflg = uap->msgflg;
- return (msgrcv(td, &ap));
+ return (sys_msgrcv(td, &ap));
}
struct ibcs2_msgsnd_args {
int what;
int msqid;
void *msgp;
size_t msgsz;
int msgflg;
};
static int
ibcs2_msgsnd(struct thread *td, void *v)
{
struct ibcs2_msgsnd_args *uap = v;
struct msgsnd_args ap;
ap.msqid = uap->msqid;
ap.msgp = uap->msgp;
ap.msgsz = uap->msgsz;
ap.msgflg = uap->msgflg;
- return (msgsnd(td, &ap));
+ return (sys_msgsnd(td, &ap));
}
int
ibcs2_msgsys(td, uap)
struct thread *td;
struct ibcs2_msgsys_args *uap;
{
switch (uap->which) {
case 0:
return (ibcs2_msgget(td, uap));
case 1:
return (ibcs2_msgctl(td, uap));
case 2:
return (ibcs2_msgrcv(td, uap));
case 3:
return (ibcs2_msgsnd(td, uap));
default:
return (EINVAL);
}
}
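/*
 * Layout note (illustrative): handing the same uap to each helper above
 * works because the per-operation args structs defined in this file
 * (e.g. ibcs2_msgget_args) start with "int what;" and list their
 * arguments in the same order as the generic msgsys argument block, so on
 * i386 the fields overlay one-to-one with the selector in the first slot.
 */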
/*
* iBCS2 semsys call
*/
#ifdef unused
static void
cvt_sem2isem(bp, ibp)
struct sem *bp;
struct ibcs2_sem *ibp;
{
ibp->semval = bp->semval;
ibp->sempid = bp->sempid;
ibp->semncnt = bp->semncnt;
ibp->semzcnt = bp->semzcnt;
return;
}
static void
cvt_isem2sem(ibp, bp)
struct ibcs2_sem *ibp;
struct sem *bp;
{
bp->semval = ibp->semval;
bp->sempid = ibp->sempid;
bp->semncnt = ibp->semncnt;
bp->semzcnt = ibp->semzcnt;
return;
}
#endif
static void
cvt_iperm2perm(ipp, pp)
struct ibcs2_ipc_perm *ipp;
struct ipc_perm *pp;
{
pp->uid = ipp->uid;
pp->gid = ipp->gid;
pp->cuid = ipp->cuid;
pp->cgid = ipp->cgid;
pp->mode = ipp->mode;
pp->seq = ipp->seq;
pp->key = ipp->key;
}
static void
cvt_perm2iperm(pp, ipp)
struct ipc_perm *pp;
struct ibcs2_ipc_perm *ipp;
{
ipp->uid = pp->uid;
ipp->gid = pp->gid;
ipp->cuid = pp->cuid;
ipp->cgid = pp->cgid;
ipp->mode = pp->mode;
ipp->seq = pp->seq;
ipp->key = pp->key;
}
static void
cvt_semid2isemid(bp, ibp)
struct semid_ds *bp;
struct ibcs2_semid_ds *ibp;
{
cvt_perm2iperm(&bp->sem_perm, &ibp->sem_perm);
ibp->sem_base = (struct ibcs2_sem *)bp->sem_base;
ibp->sem_nsems = bp->sem_nsems;
ibp->sem_otime = bp->sem_otime;
ibp->sem_ctime = bp->sem_ctime;
return;
}
static void
cvt_isemid2semid(ibp, bp)
struct ibcs2_semid_ds *ibp;
struct semid_ds *bp;
{
cvt_iperm2perm(&ibp->sem_perm, &bp->sem_perm);
bp->sem_base = (struct sem *)ibp->sem_base;
bp->sem_nsems = ibp->sem_nsems;
bp->sem_otime = ibp->sem_otime;
bp->sem_ctime = ibp->sem_ctime;
return;
}
struct ibcs2_semctl_args {
int what;
int semid;
int semnum;
int cmd;
union semun arg;
};
static int
ibcs2_semctl(struct thread *td, void *v)
{
struct ibcs2_semctl_args *uap = v;
struct ibcs2_semid_ds is;
struct semid_ds bs;
union semun semun;
register_t rval;
int error;
switch(uap->cmd) {
case IBCS2_IPC_STAT:
semun.buf = &bs;
error = kern_semctl(td, uap->semid, uap->semnum, IPC_STAT,
&semun, &rval);
if (error)
return (error);
cvt_semid2isemid(&bs, &is);
error = copyout(&is, uap->arg.buf, sizeof(is));
if (error == 0)
td->td_retval[0] = rval;
return (error);
case IBCS2_IPC_SET:
error = copyin(uap->arg.buf, &is, sizeof(is));
if (error)
return (error);
cvt_isemid2semid(&is, &bs);
semun.buf = &bs;
return (kern_semctl(td, uap->semid, uap->semnum, IPC_SET,
&semun, td->td_retval));
}
return (kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &uap->arg,
td->td_retval));
}
struct ibcs2_semget_args {
int what;
ibcs2_key_t key;
int nsems;
int semflg;
};
static int
ibcs2_semget(struct thread *td, void *v)
{
struct ibcs2_semget_args *uap = v;
struct semget_args ap;
ap.key = uap->key;
ap.nsems = uap->nsems;
ap.semflg = uap->semflg;
- return (semget(td, &ap));
+ return (sys_semget(td, &ap));
}
struct ibcs2_semop_args {
int what;
int semid;
struct sembuf *sops;
size_t nsops;
};
static int
ibcs2_semop(struct thread *td, void *v)
{
struct ibcs2_semop_args *uap = v;
struct semop_args ap;
ap.semid = uap->semid;
ap.sops = uap->sops;
ap.nsops = uap->nsops;
- return (semop(td, &ap));
+ return (sys_semop(td, &ap));
}
int
ibcs2_semsys(td, uap)
struct thread *td;
struct ibcs2_semsys_args *uap;
{
switch (uap->which) {
case 0:
return (ibcs2_semctl(td, uap));
case 1:
return (ibcs2_semget(td, uap));
case 2:
return (ibcs2_semop(td, uap));
}
return (EINVAL);
}
/*
* iBCS2 shmsys call
*/
static void
cvt_shmid2ishmid(bp, ibp)
struct shmid_ds *bp;
struct ibcs2_shmid_ds *ibp;
{
cvt_perm2iperm(&bp->shm_perm, &ibp->shm_perm);
ibp->shm_segsz = bp->shm_segsz;
ibp->shm_lpid = bp->shm_lpid;
ibp->shm_cpid = bp->shm_cpid;
if (bp->shm_nattch > SHRT_MAX)
ibp->shm_nattch = SHRT_MAX;
else
ibp->shm_nattch = bp->shm_nattch;
ibp->shm_cnattch = 0; /* ignored anyway */
ibp->shm_atime = bp->shm_atime;
ibp->shm_dtime = bp->shm_dtime;
ibp->shm_ctime = bp->shm_ctime;
return;
}
static void
cvt_ishmid2shmid(ibp, bp)
struct ibcs2_shmid_ds *ibp;
struct shmid_ds *bp;
{
cvt_iperm2perm(&ibp->shm_perm, &bp->shm_perm);
bp->shm_segsz = ibp->shm_segsz;
bp->shm_lpid = ibp->shm_lpid;
bp->shm_cpid = ibp->shm_cpid;
bp->shm_nattch = ibp->shm_nattch;
bp->shm_atime = ibp->shm_atime;
bp->shm_dtime = ibp->shm_dtime;
bp->shm_ctime = ibp->shm_ctime;
return;
}
struct ibcs2_shmat_args {
int what;
int shmid;
const void *shmaddr;
int shmflg;
};
static int
ibcs2_shmat(struct thread *td, void *v)
{
struct ibcs2_shmat_args *uap = v;
struct shmat_args ap;
ap.shmid = uap->shmid;
ap.shmaddr = uap->shmaddr;
ap.shmflg = uap->shmflg;
- return (shmat(td, &ap));
+ return (sys_shmat(td, &ap));
}
struct ibcs2_shmctl_args {
int what;
int shmid;
int cmd;
struct ibcs2_shmid_ds *buf;
};
static int
ibcs2_shmctl(struct thread *td, void *v)
{
struct ibcs2_shmctl_args *uap = v;
struct ibcs2_shmid_ds is;
struct shmid_ds bs;
int error;
switch(uap->cmd) {
case IBCS2_IPC_STAT:
error = kern_shmctl(td, uap->shmid, IPC_STAT, &bs, NULL);
if (error)
return (error);
cvt_shmid2ishmid(&bs, &is);
return (copyout(&is, uap->buf, sizeof(is)));
case IBCS2_IPC_SET:
error = copyin(uap->buf, &is, sizeof(is));
if (error)
return (error);
cvt_ishmid2shmid(&is, &bs);
return (kern_shmctl(td, uap->shmid, IPC_SET, &bs, NULL));
case IPC_INFO:
case SHM_INFO:
case SHM_STAT:
/* XXX: */
return (EINVAL);
}
return (kern_shmctl(td, uap->shmid, uap->cmd, NULL, NULL));
}
struct ibcs2_shmdt_args {
int what;
const void *shmaddr;
};
static int
ibcs2_shmdt(struct thread *td, void *v)
{
struct ibcs2_shmdt_args *uap = v;
struct shmdt_args ap;
ap.shmaddr = uap->shmaddr;
- return (shmdt(td, &ap));
+ return (sys_shmdt(td, &ap));
}
struct ibcs2_shmget_args {
int what;
ibcs2_key_t key;
size_t size;
int shmflg;
};
static int
ibcs2_shmget(struct thread *td, void *v)
{
struct ibcs2_shmget_args *uap = v;
struct shmget_args ap;
ap.key = uap->key;
ap.size = uap->size;
ap.shmflg = uap->shmflg;
- return (shmget(td, &ap));
+ return (sys_shmget(td, &ap));
}
int
ibcs2_shmsys(td, uap)
struct thread *td;
struct ibcs2_shmsys_args *uap;
{
switch (uap->which) {
case 0:
return (ibcs2_shmat(td, uap));
case 1:
return (ibcs2_shmctl(td, uap));
case 2:
return (ibcs2_shmdt(td, uap));
case 3:
return (ibcs2_shmget(td, uap));
}
return (EINVAL);
}
MODULE_DEPEND(ibcs2, sysvmsg, 1, 1, 1);
MODULE_DEPEND(ibcs2, sysvsem, 1, 1, 1);
MODULE_DEPEND(ibcs2, sysvshm, 1, 1, 1);
Index: head/sys/i386/ibcs2/ibcs2_misc.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_misc.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_misc.c (revision 225617)
@@ -1,1267 +1,1267 @@
/*-
* Copyright (c) 1995 Steven Wallace
* Copyright (c) 1994, 1995 Scott Bartram
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Header: sun_misc.c,v 1.16 93/04/07 02:46:27 torek Exp
*
* @(#)sun_misc.c 8.1 (Berkeley) 6/18/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* IBCS2 compatibility module.
*
* IBCS2 system calls that are implemented differently in BSD are
* handled here.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/file.h> /* Must come after sys/malloc.h */
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/time.h>
#include <sys/times.h>
#include <sys/vnode.h>
#include <sys/wait.h>
#include <machine/cpu.h>
#include <i386/ibcs2/ibcs2_dirent.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_unistd.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_utime.h>
#include <i386/ibcs2/ibcs2_xenix.h>
#include <security/mac/mac_framework.h>
int
ibcs2_ulimit(td, uap)
struct thread *td;
struct ibcs2_ulimit_args *uap;
{
struct rlimit rl;
struct proc *p;
int error;
#define IBCS2_GETFSIZE 1
#define IBCS2_SETFSIZE 2
#define IBCS2_GETPSIZE 3
#define IBCS2_GETDTABLESIZE 4
p = td->td_proc;
switch (uap->cmd) {
case IBCS2_GETFSIZE:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
PROC_UNLOCK(p);
if (td->td_retval[0] == -1)
td->td_retval[0] = 0x7fffffff;
return 0;
case IBCS2_SETFSIZE:
PROC_LOCK(p);
rl.rlim_max = lim_max(p, RLIMIT_FSIZE);
PROC_UNLOCK(p);
rl.rlim_cur = uap->newlimit;
error = kern_setrlimit(td, RLIMIT_FSIZE, &rl);
if (!error) {
PROC_LOCK(p);
td->td_retval[0] = lim_cur(p, RLIMIT_FSIZE);
PROC_UNLOCK(p);
} else {
DPRINTF(("failed "));
}
return error;
case IBCS2_GETPSIZE:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(p, RLIMIT_RSS); /* XXX */
PROC_UNLOCK(p);
return 0;
case IBCS2_GETDTABLESIZE:
uap->cmd = IBCS2_SC_OPEN_MAX;
return ibcs2_sysconf(td, (struct ibcs2_sysconf_args *)uap);
default:
return ENOSYS;
}
}
#define IBCS2_WSTOPPED 0177
#define IBCS2_STOPCODE(sig) ((sig) << 8 | IBCS2_WSTOPPED)
int
ibcs2_wait(td, uap)
struct thread *td;
struct ibcs2_wait_args *uap;
{
int error, options, status;
int *statusp;
pid_t pid;
struct trapframe *tf = td->td_frame;
if ((tf->tf_eflags & (PSL_Z|PSL_PF|PSL_N|PSL_V))
== (PSL_Z|PSL_PF|PSL_N|PSL_V)) {
/* waitpid */
pid = uap->a1;
statusp = (int *)uap->a2;
options = uap->a3;
} else {
/* wait */
pid = WAIT_ANY;
statusp = (int *)uap->a1;
options = 0;
}
error = kern_wait(td, pid, &status, options, NULL);
if (error)
return error;
if (statusp) {
/*
* Convert status/signal result.
*/
if (WIFSTOPPED(status)) {
if (WSTOPSIG(status) <= 0 ||
WSTOPSIG(status) > IBCS2_SIGTBLSZ)
return (EINVAL);
status =
IBCS2_STOPCODE(bsd_to_ibcs2_sig[_SIG_IDX(WSTOPSIG(status))]);
} else if (WIFSIGNALED(status)) {
if (WTERMSIG(status) <= 0 ||
WTERMSIG(status) > IBCS2_SIGTBLSZ)
return (EINVAL);
status = bsd_to_ibcs2_sig[_SIG_IDX(WTERMSIG(status))];
}
/* else exit status -- identical */
/* record result/status */
td->td_retval[1] = status;
return copyout(&status, statusp, sizeof(status));
}
return 0;
}
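/*
 * Worked example of the status conversion above (using the signal tables
 * in ibcs2_signal.c, assuming the usual SVR3 numbering): a child stopped
 * by SIGTSTP reports IBCS2_STOPCODE(IBCS2_SIGTSTP) = (24 << 8) | 0177, a
 * child terminated by SIGTERM reports plain IBCS2_SIGTERM (15), and a
 * normal exit status is passed through unchanged.
 */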
int
ibcs2_execv(td, uap)
struct thread *td;
struct ibcs2_execv_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp, NULL);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
ibcs2_execve(td, uap)
struct thread *td;
struct ibcs2_execve_args *uap;
{
struct image_args eargs;
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = exec_copyin_args(&eargs, path, UIO_SYSSPACE, uap->argp,
uap->envp);
free(path, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
return (error);
}
int
ibcs2_umount(td, uap)
struct thread *td;
struct ibcs2_umount_args *uap;
{
struct unmount_args um;
um.path = uap->name;
um.flags = 0;
- return unmount(td, &um);
+ return sys_unmount(td, &um);
}
int
ibcs2_mount(td, uap)
struct thread *td;
struct ibcs2_mount_args *uap;
{
#ifdef notyet
int oflags = uap->flags, nflags, error;
char fsname[MFSNAMELEN];
if (oflags & (IBCS2_MS_NOSUB | IBCS2_MS_SYS5))
return (EINVAL);
if ((oflags & IBCS2_MS_NEWTYPE) == 0)
return (EINVAL);
nflags = 0;
if (oflags & IBCS2_MS_RDONLY)
nflags |= MNT_RDONLY;
if (oflags & IBCS2_MS_NOSUID)
nflags |= MNT_NOSUID;
if (oflags & IBCS2_MS_REMOUNT)
nflags |= MNT_UPDATE;
uap->flags = nflags;
if (error = copyinstr((caddr_t)uap->type, fsname, sizeof fsname,
(u_int *)0))
return (error);
if (strcmp(fsname, "4.2") == 0) {
uap->type = (caddr_t)STACK_ALLOC();
if (error = copyout("ufs", uap->type, sizeof("ufs")))
return (error);
} else if (strcmp(fsname, "nfs") == 0) {
struct ibcs2_nfs_args sna;
struct sockaddr_in sain;
struct nfs_args na;
struct sockaddr sa;
if (error = copyin(uap->data, &sna, sizeof sna))
return (error);
if (error = copyin(sna.addr, &sain, sizeof sain))
return (error);
bcopy(&sain, &sa, sizeof sa);
sa.sa_len = sizeof(sain);
uap->data = (caddr_t)STACK_ALLOC();
na.addr = (struct sockaddr *)((int)uap->data + sizeof na);
na.sotype = SOCK_DGRAM;
na.proto = IPPROTO_UDP;
na.fh = (nfsv2fh_t *)sna.fh;
na.flags = sna.flags;
na.wsize = sna.wsize;
na.rsize = sna.rsize;
na.timeo = sna.timeo;
na.retrans = sna.retrans;
na.hostname = sna.hostname;
if (error = copyout(&sa, na.addr, sizeof sa))
return (error);
if (error = copyout(&na, uap->data, sizeof na))
return (error);
}
return (mount(td, uap));
#else
return EINVAL;
#endif
}
/*
* Read iBCS2-style directory entries. We suck them into kernel space so
* that they can be massaged before being copied out to user code. Like
* SunOS, we squish out `empty' entries.
*
* This is quite ugly, but what do you expect from compatibility code?
*/
int
ibcs2_getdents(td, uap)
struct thread *td;
register struct ibcs2_getdents_args *uap;
{
register struct vnode *vp;
register caddr_t inp, buf; /* BSD-format */
register int len, reclen; /* BSD-format */
register caddr_t outp; /* iBCS2-format */
register int resid; /* iBCS2-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
struct ibcs2_dirent idb;
off_t off; /* true file offset */
int buflen, error, eofflag, vfslocked;
u_long *cookies = NULL, *cookiep;
int ncookies;
#define BSD_DIRENT(cp) ((struct dirent *)(cp))
#define IBCS2_RECLEN(reclen) (reclen + sizeof(u_short))
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) { /* XXX vnode readdir op should do this */
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
off = fp->f_offset;
#define DIRBLKSIZ 512 /* XXX we used to use ufs's DIRBLKSIZ */
buflen = max(DIRBLKSIZ, uap->nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
/*
* First we read into the malloc'ed buffer, then
* we massage it into user space, one record at a time.
*/
if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0)
goto out;
inp = buf;
outp = uap->buf;
resid = uap->nbytes;
if ((len = buflen - auio.uio_resid) <= 0)
goto eof;
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
len -= BSD_DIRENT(inp)->d_reclen;
inp += BSD_DIRENT(inp)->d_reclen;
cookiep++;
ncookies--;
}
}
for (; len > 0; len -= reclen) {
if (cookiep && ncookies == 0)
break;
reclen = BSD_DIRENT(inp)->d_reclen;
if (reclen & 3) {
printf("ibcs2_getdents: reclen=%d\n", reclen);
error = EFAULT;
goto out;
}
if (BSD_DIRENT(inp)->d_fileno == 0) {
inp += reclen; /* it is a hole; squish it out */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
continue;
}
if (reclen > len || resid < IBCS2_RECLEN(reclen)) {
/* entry too big for buffer, so just stop */
outp++;
break;
}
/*
* Massage in place to make an iBCS2-shaped dirent (otherwise
* we have to worry about touching user memory outside of
* the copyout() call).
*/
idb.d_ino = (ibcs2_ino_t)BSD_DIRENT(inp)->d_fileno;
idb.d_off = (ibcs2_off_t)off;
idb.d_reclen = (u_short)IBCS2_RECLEN(reclen);
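/*
 * The 10 bytes copied out below appear to be the fixed ibcs2_dirent
 * header -- d_ino (4), d_off (4) and d_reclen (2) on i386 -- with the
 * NUL-terminated name copied separately at offset 10.
 */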
if ((error = copyout((caddr_t)&idb, outp, 10)) != 0 ||
(error = copyout(BSD_DIRENT(inp)->d_name, outp + 10,
BSD_DIRENT(inp)->d_namlen + 1)) != 0)
goto out;
/* advance past this real entry */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
inp += reclen;
/* advance output past iBCS2-shaped entry */
outp += IBCS2_RECLEN(reclen);
resid -= IBCS2_RECLEN(reclen);
}
/* if we squished out the whole block, try again */
if (outp == uap->buf)
goto again;
fp->f_offset = off; /* update the vnode offset */
eof:
td->td_retval[0] = uap->nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookies)
free(cookies, M_TEMP);
free(buf, M_TEMP);
return (error);
}
int
ibcs2_read(td, uap)
struct thread *td;
struct ibcs2_read_args *uap;
{
register struct vnode *vp;
register caddr_t inp, buf; /* BSD-format */
register int len, reclen; /* BSD-format */
register caddr_t outp; /* iBCS2-format */
register int resid; /* iBCS2-format */
struct file *fp;
struct uio auio;
struct iovec aiov;
struct ibcs2_direct {
ibcs2_ino_t ino;
char name[14];
} idb;
off_t off; /* true file offset */
int buflen, error, eofflag, size, vfslocked;
u_long *cookies = NULL, *cookiep;
int ncookies;
if ((error = getvnode(td->td_proc->p_fd, uap->fd,
CAP_READ | CAP_SEEK, &fp)) != 0) {
if (error == EINVAL)
- return read(td, (struct read_args *)uap);
+ return sys_read(td, (struct read_args *)uap);
else
return error;
}
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
- return read(td, (struct read_args *)uap);
+ return sys_read(td, (struct read_args *)uap);
}
off = fp->f_offset;
DPRINTF(("ibcs2_read: read directory\n"));
buflen = max(DIRBLKSIZ, uap->nbytes);
buflen = min(buflen, MAXBSIZE);
buf = malloc(buflen, M_TEMP, M_WAITOK);
vn_lock(vp, LK_SHARED | LK_RETRY);
again:
aiov.iov_base = buf;
aiov.iov_len = buflen;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_resid = buflen;
auio.uio_offset = off;
if (cookies) {
free(cookies, M_TEMP);
cookies = NULL;
}
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error)
goto out;
#endif
/*
* First we read into the malloc'ed buffer, then
* we massage it into user space, one record at a time.
*/
if ((error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, &ncookies, &cookies)) != 0) {
DPRINTF(("VOP_READDIR failed: %d\n", error));
goto out;
}
inp = buf;
outp = uap->buf;
resid = uap->nbytes;
if ((len = buflen - auio.uio_resid) <= 0)
goto eof;
cookiep = cookies;
if (cookies) {
/*
* When using cookies, the vfs has the option of reading from
* a different offset than that supplied (UFS truncates the
* offset to a block boundary to make sure that it never reads
* partway through a directory entry, even if the directory
* has been compacted).
*/
while (len > 0 && ncookies > 0 && *cookiep <= off) {
len -= BSD_DIRENT(inp)->d_reclen;
inp += BSD_DIRENT(inp)->d_reclen;
cookiep++;
ncookies--;
}
}
for (; len > 0 && resid > 0; len -= reclen) {
if (cookiep && ncookies == 0)
break;
reclen = BSD_DIRENT(inp)->d_reclen;
if (reclen & 3) {
printf("ibcs2_read: reclen=%d\n", reclen);
error = EFAULT;
goto out;
}
if (BSD_DIRENT(inp)->d_fileno == 0) {
inp += reclen; /* it is a hole; squish it out */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
continue;
}
if (reclen > len || resid < sizeof(struct ibcs2_direct)) {
/* entry too big for buffer, so just stop */
outp++;
break;
}
/*
* Massage in place to make an iBCS2-shaped dirent (otherwise
* we have to worry about touching user memory outside of
* the copyout() call).
*
* TODO: if length(filename) > 14, then break filename into
* multiple entries and set inode = 0xffff except last
*/
idb.ino = (BSD_DIRENT(inp)->d_fileno > 0xfffe) ? 0xfffe :
BSD_DIRENT(inp)->d_fileno;
(void)copystr(BSD_DIRENT(inp)->d_name, idb.name, 14, &size);
bzero(idb.name + size, 14 - size);
if ((error = copyout(&idb, outp, sizeof(struct ibcs2_direct))) != 0)
goto out;
/* advance past this real entry */
if (cookiep) {
off = *cookiep++;
ncookies--;
} else
off += reclen;
inp += reclen;
/* advance output past iBCS2-shaped entry */
outp += sizeof(struct ibcs2_direct);
resid -= sizeof(struct ibcs2_direct);
}
/* if we squished out the whole block, try again */
if (outp == uap->buf)
goto again;
fp->f_offset = off; /* update the vnode offset */
eof:
td->td_retval[0] = uap->nbytes - resid;
out:
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
if (cookies)
free(cookies, M_TEMP);
free(buf, M_TEMP);
return (error);
}
int
ibcs2_mknod(td, uap)
struct thread *td;
struct ibcs2_mknod_args *uap;
{
char *path;
int error;
CHECKALTCREAT(td, uap->path, &path);
if (S_ISFIFO(uap->mode))
error = kern_mkfifo(td, path, UIO_SYSSPACE, uap->mode);
else
error = kern_mknod(td, path, UIO_SYSSPACE, uap->mode, uap->dev);
free(path, M_TEMP);
return (error);
}
int
ibcs2_getgroups(td, uap)
struct thread *td;
struct ibcs2_getgroups_args *uap;
{
ibcs2_gid_t *iset;
gid_t *gp;
u_int i, ngrp;
int error;
if (uap->gidsetsize < td->td_ucred->cr_ngroups) {
if (uap->gidsetsize == 0)
ngrp = 0;
else
return (EINVAL);
} else
ngrp = td->td_ucred->cr_ngroups;
gp = malloc(ngrp * sizeof(*gp), M_TEMP, M_WAITOK);
error = kern_getgroups(td, &ngrp, gp);
if (error)
goto out;
if (uap->gidsetsize > 0) {
iset = malloc(ngrp * sizeof(*iset), M_TEMP, M_WAITOK);
for (i = 0; i < ngrp; i++)
iset[i] = (ibcs2_gid_t)gp[i];
error = copyout(iset, uap->gidset, ngrp * sizeof(ibcs2_gid_t));
free(iset, M_TEMP);
}
if (error == 0)
td->td_retval[0] = ngrp;
out:
free(gp, M_TEMP);
return (error);
}
int
ibcs2_setgroups(td, uap)
struct thread *td;
struct ibcs2_setgroups_args *uap;
{
ibcs2_gid_t *iset;
gid_t *gp;
int error, i;
if (uap->gidsetsize < 0 || uap->gidsetsize > ngroups_max + 1)
return (EINVAL);
if (uap->gidsetsize && uap->gidset == NULL)
return (EINVAL);
gp = malloc(uap->gidsetsize * sizeof(*gp), M_TEMP, M_WAITOK);
if (uap->gidsetsize) {
iset = malloc(uap->gidsetsize * sizeof(*iset), M_TEMP, M_WAITOK);
error = copyin(uap->gidset, iset, sizeof(ibcs2_gid_t) *
uap->gidsetsize);
if (error) {
free(iset, M_TEMP);
goto out;
}
for (i = 0; i < uap->gidsetsize; i++)
gp[i] = (gid_t)iset[i];
}
error = kern_setgroups(td, uap->gidsetsize, gp);
out:
free(gp, M_TEMP);
return (error);
}
int
ibcs2_setuid(td, uap)
struct thread *td;
struct ibcs2_setuid_args *uap;
{
struct setuid_args sa;
sa.uid = (uid_t)uap->uid;
- return setuid(td, &sa);
+ return sys_setuid(td, &sa);
}
int
ibcs2_setgid(td, uap)
struct thread *td;
struct ibcs2_setgid_args *uap;
{
struct setgid_args sa;
sa.gid = (gid_t)uap->gid;
- return setgid(td, &sa);
+ return sys_setgid(td, &sa);
}
int
ibcs2_time(td, uap)
struct thread *td;
struct ibcs2_time_args *uap;
{
struct timeval tv;
microtime(&tv);
td->td_retval[0] = tv.tv_sec;
if (uap->tp)
return copyout((caddr_t)&tv.tv_sec, (caddr_t)uap->tp,
sizeof(ibcs2_time_t));
else
return 0;
}
int
ibcs2_pathconf(td, uap)
struct thread *td;
struct ibcs2_pathconf_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
uap->name++; /* iBCS2 _PC_* defines are offset by one */
error = kern_pathconf(td, path, UIO_SYSSPACE, uap->name, FOLLOW);
free(path, M_TEMP);
return (error);
}
int
ibcs2_fpathconf(td, uap)
struct thread *td;
struct ibcs2_fpathconf_args *uap;
{
uap->name++; /* iBCS2 _PC_* defines are offset by one */
- return fpathconf(td, (struct fpathconf_args *)uap);
+ return sys_fpathconf(td, (struct fpathconf_args *)uap);
}
int
ibcs2_sysconf(td, uap)
struct thread *td;
struct ibcs2_sysconf_args *uap;
{
int mib[2], value, len, error;
struct proc *p;
p = td->td_proc;
switch(uap->name) {
case IBCS2_SC_ARG_MAX:
mib[1] = KERN_ARGMAX;
break;
case IBCS2_SC_CHILD_MAX:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NPROC);
PROC_UNLOCK(p);
return 0;
case IBCS2_SC_CLK_TCK:
td->td_retval[0] = hz;
return 0;
case IBCS2_SC_NGROUPS_MAX:
mib[1] = KERN_NGROUPS;
break;
case IBCS2_SC_OPEN_MAX:
PROC_LOCK(p);
td->td_retval[0] = lim_cur(td->td_proc, RLIMIT_NOFILE);
PROC_UNLOCK(p);
return 0;
case IBCS2_SC_JOB_CONTROL:
mib[1] = KERN_JOB_CONTROL;
break;
case IBCS2_SC_SAVED_IDS:
mib[1] = KERN_SAVED_IDS;
break;
case IBCS2_SC_VERSION:
mib[1] = KERN_POSIX1;
break;
case IBCS2_SC_PASS_MAX:
td->td_retval[0] = 128; /* XXX - should we create PASS_MAX ? */
return 0;
case IBCS2_SC_XOPEN_VERSION:
td->td_retval[0] = 2; /* XXX: What should that be? */
return 0;
default:
return EINVAL;
}
mib[0] = CTL_KERN;
len = sizeof(value);
error = kernel_sysctl(td, mib, 2, &value, &len, NULL, 0, NULL, 0);
if (error)
return error;
td->td_retval[0] = value;
return 0;
}
int
ibcs2_alarm(td, uap)
struct thread *td;
struct ibcs2_alarm_args *uap;
{
struct itimerval itv, oitv;
int error;
timevalclear(&itv.it_interval);
itv.it_value.tv_sec = uap->sec;
itv.it_value.tv_usec = 0;
error = kern_setitimer(td, ITIMER_REAL, &itv, &oitv);
if (error)
return (error);
if (oitv.it_value.tv_usec != 0)
oitv.it_value.tv_sec++;
td->td_retval[0] = oitv.it_value.tv_sec;
return (0);
}
int
ibcs2_times(td, uap)
struct thread *td;
struct ibcs2_times_args *uap;
{
struct rusage ru;
struct timeval t;
struct tms tms;
int error;
#define CONVTCK(r) (r.tv_sec * hz + r.tv_usec / (1000000 / hz))
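/* Example: with hz = 100, 1 s 500000 us yields 1*100 + 500000/10000 = 150 ticks. */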
error = kern_getrusage(td, RUSAGE_SELF, &ru);
if (error)
return (error);
tms.tms_utime = CONVTCK(ru.ru_utime);
tms.tms_stime = CONVTCK(ru.ru_stime);
error = kern_getrusage(td, RUSAGE_CHILDREN, &ru);
if (error)
return (error);
tms.tms_cutime = CONVTCK(ru.ru_utime);
tms.tms_cstime = CONVTCK(ru.ru_stime);
microtime(&t);
td->td_retval[0] = CONVTCK(t);
return (copyout(&tms, uap->tp, sizeof(struct tms)));
}
int
ibcs2_stime(td, uap)
struct thread *td;
struct ibcs2_stime_args *uap;
{
struct timeval tv;
long secs;
int error;
error = copyin(uap->timep, &secs, sizeof(long));
if (error)
return (error);
tv.tv_sec = secs;
tv.tv_usec = 0;
error = kern_settimeofday(td, &tv, NULL);
if (error)
error = EPERM;
return (error);
}
int
ibcs2_utime(td, uap)
struct thread *td;
struct ibcs2_utime_args *uap;
{
struct ibcs2_utimbuf ubuf;
struct timeval tbuf[2], *tp;
char *path;
int error;
if (uap->buf) {
error = copyin(uap->buf, &ubuf, sizeof(ubuf));
if (error)
return (error);
tbuf[0].tv_sec = ubuf.actime;
tbuf[0].tv_usec = 0;
tbuf[1].tv_sec = ubuf.modtime;
tbuf[1].tv_usec = 0;
tp = tbuf;
} else
tp = NULL;
CHECKALTEXIST(td, uap->path, &path);
error = kern_utimes(td, path, UIO_SYSSPACE, tp, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_nice(td, uap)
struct thread *td;
struct ibcs2_nice_args *uap;
{
int error;
struct setpriority_args sa;
sa.which = PRIO_PROCESS;
sa.who = 0;
sa.prio = td->td_proc->p_nice + uap->incr;
- if ((error = setpriority(td, &sa)) != 0)
+ if ((error = sys_setpriority(td, &sa)) != 0)
return EPERM;
td->td_retval[0] = td->td_proc->p_nice;
return 0;
}
/*
* iBCS2 getpgrp, setpgrp, setsid, and setpgid
*/
int
ibcs2_pgrpsys(td, uap)
struct thread *td;
struct ibcs2_pgrpsys_args *uap;
{
struct proc *p = td->td_proc;
switch (uap->type) {
case 0: /* getpgrp */
PROC_LOCK(p);
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
case 1: /* setpgrp */
{
struct setpgid_args sa;
sa.pid = 0;
sa.pgid = 0;
- setpgid(td, &sa);
+ sys_setpgid(td, &sa);
PROC_LOCK(p);
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return 0;
}
case 2: /* setpgid */
{
struct setpgid_args sa;
sa.pid = uap->pid;
sa.pgid = uap->pgid;
- return setpgid(td, &sa);
+ return sys_setpgid(td, &sa);
}
case 3: /* setsid */
- return setsid(td, NULL);
+ return sys_setsid(td, NULL);
default:
return EINVAL;
}
}
/*
* XXX - need to check for nested calls
*/
int
ibcs2_plock(td, uap)
struct thread *td;
struct ibcs2_plock_args *uap;
{
int error;
#define IBCS2_UNLOCK 0
#define IBCS2_PROCLOCK 1
#define IBCS2_TEXTLOCK 2
#define IBCS2_DATALOCK 4
switch(uap->cmd) {
case IBCS2_UNLOCK:
error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
/* XXX - TODO */
return (0);
case IBCS2_PROCLOCK:
case IBCS2_TEXTLOCK:
case IBCS2_DATALOCK:
error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
/* XXX - TODO */
return 0;
}
return EINVAL;
}
int
ibcs2_uadmin(td, uap)
struct thread *td;
struct ibcs2_uadmin_args *uap;
{
#define SCO_A_REBOOT 1
#define SCO_A_SHUTDOWN 2
#define SCO_A_REMOUNT 4
#define SCO_A_CLOCK 8
#define SCO_A_SETCONFIG 128
#define SCO_A_GETDEV 130
#define SCO_AD_HALT 0
#define SCO_AD_BOOT 1
#define SCO_AD_IBOOT 2
#define SCO_AD_PWRDOWN 3
#define SCO_AD_PWRNAP 4
#define SCO_AD_PANICBOOT 1
#define SCO_AD_GETBMAJ 0
#define SCO_AD_GETCMAJ 1
switch(uap->cmd) {
case SCO_A_REBOOT:
case SCO_A_SHUTDOWN:
switch(uap->func) {
struct reboot_args r;
case SCO_AD_HALT:
case SCO_AD_PWRDOWN:
case SCO_AD_PWRNAP:
r.opt = RB_HALT;
- return (reboot(td, &r));
+ return (sys_reboot(td, &r));
case SCO_AD_BOOT:
case SCO_AD_IBOOT:
r.opt = RB_AUTOBOOT;
- return (reboot(td, &r));
+ return (sys_reboot(td, &r));
}
return EINVAL;
case SCO_A_REMOUNT:
case SCO_A_CLOCK:
case SCO_A_SETCONFIG:
return 0;
case SCO_A_GETDEV:
return EINVAL; /* XXX - TODO */
}
return EINVAL;
}
int
ibcs2_sysfs(td, uap)
struct thread *td;
struct ibcs2_sysfs_args *uap;
{
#define IBCS2_GETFSIND 1
#define IBCS2_GETFSTYP 2
#define IBCS2_GETNFSTYP 3
switch(uap->cmd) {
case IBCS2_GETFSIND:
case IBCS2_GETFSTYP:
case IBCS2_GETNFSTYP:
break;
}
return EINVAL; /* XXX - TODO */
}
int
ibcs2_unlink(td, uap)
struct thread *td;
struct ibcs2_unlink_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_unlink(td, path, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_chdir(td, uap)
struct thread *td;
struct ibcs2_chdir_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_chdir(td, path, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_chmod(td, uap)
struct thread *td;
struct ibcs2_chmod_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_chmod(td, path, UIO_SYSSPACE, uap->mode);
free(path, M_TEMP);
return (error);
}
int
ibcs2_chown(td, uap)
struct thread *td;
struct ibcs2_chown_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_chown(td, path, UIO_SYSSPACE, uap->uid, uap->gid);
free(path, M_TEMP);
return (error);
}
int
ibcs2_rmdir(td, uap)
struct thread *td;
struct ibcs2_rmdir_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_rmdir(td, path, UIO_SYSSPACE);
free(path, M_TEMP);
return (error);
}
int
ibcs2_mkdir(td, uap)
struct thread *td;
struct ibcs2_mkdir_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_mkdir(td, path, UIO_SYSSPACE, uap->mode);
free(path, M_TEMP);
return (error);
}
int
ibcs2_symlink(td, uap)
struct thread *td;
struct ibcs2_symlink_args *uap;
{
char *path, *link;
int error;
CHECKALTEXIST(td, uap->path, &path);
/*
* Have to expand CHECKALTCREAT() so that 'path' can be freed on
* errors.
*/
error = ibcs2_emul_find(td, uap->link, UIO_USERSPACE, &link, 1);
if (link == NULL) {
free(path, M_TEMP);
return (error);
}
error = kern_symlink(td, path, link, UIO_SYSSPACE);
free(path, M_TEMP);
free(link, M_TEMP);
return (error);
}
int
ibcs2_rename(td, uap)
struct thread *td;
struct ibcs2_rename_args *uap;
{
char *from, *to;
int error;
CHECKALTEXIST(td, uap->from, &from);
/*
* Have to expand CHECKALTCREAT() so that 'from' can be freed on
* errors.
*/
error = ibcs2_emul_find(td, uap->to, UIO_USERSPACE, &to, 1);
if (to == NULL) {
free(from, M_TEMP);
return (error);
}
error = kern_rename(td, from, to, UIO_SYSSPACE);
free(from, M_TEMP);
free(to, M_TEMP);
return (error);
}
int
ibcs2_readlink(td, uap)
struct thread *td;
struct ibcs2_readlink_args *uap;
{
char *path;
int error;
CHECKALTEXIST(td, uap->path, &path);
error = kern_readlink(td, path, UIO_SYSSPACE, uap->buf, UIO_USERSPACE,
uap->count);
free(path, M_TEMP);
return (error);
}
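/*
 * Note on the pattern used throughout this file (an observation, not part
 * of this change): CHECKALTEXIST()/CHECKALTCREAT() translate the user
 * path through the emulation root and return from the calling function on
 * failure, so the usual shape is
 *
 *	CHECKALTEXIST(td, uap->path, &path);
 *	error = kern_xxx(td, path, UIO_SYSSPACE, ...);	// kern_xxx is a placeholder
 *	free(path, M_TEMP);
 *	return (error);
 *
 * ibcs2_symlink() and ibcs2_rename() open-code the second lookup via
 * ibcs2_emul_find() precisely because the macro's early return would
 * otherwise leak the first translated path.
 */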
Index: head/sys/i386/ibcs2/ibcs2_other.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_other.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_other.c (revision 225617)
@@ -1,118 +1,118 @@
/*-
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* IBCS2 compatibility module.
*/
#include "opt_spx_hack.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/un.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_proto.h>
#define IBCS2_SECURE_GETLUID 1
#define IBCS2_SECURE_SETLUID 2
int
ibcs2_secure(struct thread *td, struct ibcs2_secure_args *uap)
{
switch (uap->cmd) {
case IBCS2_SECURE_GETLUID: /* get login uid */
td->td_retval[0] = td->td_ucred->cr_uid;
return 0;
case IBCS2_SECURE_SETLUID: /* set login uid */
return EPERM;
default:
printf("IBCS2: 'secure' cmd=%d not implemented\n", uap->cmd);
}
return EINVAL;
}
int
ibcs2_lseek(struct thread *td, register struct ibcs2_lseek_args *uap)
{
struct lseek_args largs;
int error;
largs.fd = uap->fd;
largs.offset = uap->offset;
largs.whence = uap->whence;
- error = lseek(td, &largs);
+ error = sys_lseek(td, &largs);
return (error);
}
#ifdef SPX_HACK
#include <sys/socket.h>
#include <sys/un.h>
int
spx_open(struct thread *td)
{
struct socket_args sock;
struct sockaddr_un sun;
int fd, error;
/* obtain a socket. */
DPRINTF(("SPX: open socket\n"));
sock.domain = AF_UNIX;
sock.type = SOCK_STREAM;
sock.protocol = 0;
- error = socket(td, &sock);
+ error = sys_socket(td, &sock);
if (error)
return error;
fd = td->td_retval[0];
/* connect the socket to standard X socket */
DPRINTF(("SPX: connect to /tmp/X11-unix/X0\n"));
sun.sun_family = AF_UNIX;
strcpy(sun.sun_path, "/tmp/.X11-unix/X0");
sun.sun_len = sizeof(struct sockaddr_un) - sizeof(sun.sun_path) +
strlen(sun.sun_path) + 1;
error = kern_connect(td, fd, (struct sockaddr *)&sun);
if (error) {
kern_close(td, fd);
return error;
}
td->td_retval[0] = fd;
return 0;
}
#endif /* SPX_HACK */
Index: head/sys/i386/ibcs2/ibcs2_signal.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_signal.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_signal.c (revision 225617)
@@ -1,441 +1,441 @@
/*-
* Copyright (c) 1995 Scott Bartram
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_xenix.h>
#include <i386/ibcs2/ibcs2_util.h>
#define sigemptyset(s) SIGEMPTYSET(*(s))
#define sigismember(s, n) SIGISMEMBER(*(s), n)
#define sigaddset(s, n) SIGADDSET(*(s), n)
#define ibcs2_sigmask(n) (1 << ((n) - 1))
#define ibcs2_sigemptyset(s) bzero((s), sizeof(*(s)))
#define ibcs2_sigismember(s, n) (*(s) & ibcs2_sigmask(n))
#define ibcs2_sigaddset(s, n) (*(s) |= ibcs2_sigmask(n))
static void ibcs2_to_bsd_sigset(const ibcs2_sigset_t *, sigset_t *);
static void bsd_to_ibcs2_sigset(const sigset_t *, ibcs2_sigset_t *);
static void ibcs2_to_bsd_sigaction(struct ibcs2_sigaction *,
struct sigaction *);
static void bsd_to_ibcs2_sigaction(struct sigaction *,
struct ibcs2_sigaction *);
int bsd_to_ibcs2_sig[IBCS2_SIGTBLSZ] = {
IBCS2_SIGHUP, /* 1 */
IBCS2_SIGINT, /* 2 */
IBCS2_SIGQUIT, /* 3 */
IBCS2_SIGILL, /* 4 */
IBCS2_SIGTRAP, /* 5 */
IBCS2_SIGABRT, /* 6 */
IBCS2_SIGEMT, /* 7 */
IBCS2_SIGFPE, /* 8 */
IBCS2_SIGKILL, /* 9 */
IBCS2_SIGBUS, /* 10 */
IBCS2_SIGSEGV, /* 11 */
IBCS2_SIGSYS, /* 12 */
IBCS2_SIGPIPE, /* 13 */
IBCS2_SIGALRM, /* 14 */
IBCS2_SIGTERM, /* 15 */
0, /* 16 - SIGURG */
IBCS2_SIGSTOP, /* 17 */
IBCS2_SIGTSTP, /* 18 */
IBCS2_SIGCONT, /* 19 */
IBCS2_SIGCLD, /* 20 */
IBCS2_SIGTTIN, /* 21 */
IBCS2_SIGTTOU, /* 22 */
IBCS2_SIGPOLL, /* 23 */
0, /* 24 - SIGXCPU */
0, /* 25 - SIGXFSZ */
IBCS2_SIGVTALRM, /* 26 */
IBCS2_SIGPROF, /* 27 */
IBCS2_SIGWINCH, /* 28 */
0, /* 29 */
IBCS2_SIGUSR1, /* 30 */
IBCS2_SIGUSR2, /* 31 */
0 /* 32 */
};
static int ibcs2_to_bsd_sig[IBCS2_SIGTBLSZ] = {
SIGHUP, /* 1 */
SIGINT, /* 2 */
SIGQUIT, /* 3 */
SIGILL, /* 4 */
SIGTRAP, /* 5 */
SIGABRT, /* 6 */
SIGEMT, /* 7 */
SIGFPE, /* 8 */
SIGKILL, /* 9 */
SIGBUS, /* 10 */
SIGSEGV, /* 11 */
SIGSYS, /* 12 */
SIGPIPE, /* 13 */
SIGALRM, /* 14 */
SIGTERM, /* 15 */
SIGUSR1, /* 16 */
SIGUSR2, /* 17 */
SIGCHLD, /* 18 */
0, /* 19 - SIGPWR */
SIGWINCH, /* 20 */
0, /* 21 */
SIGIO, /* 22 */
SIGSTOP, /* 23 */
SIGTSTP, /* 24 */
SIGCONT, /* 25 */
SIGTTIN, /* 26 */
SIGTTOU, /* 27 */
SIGVTALRM, /* 28 */
SIGPROF, /* 29 */
0, /* 30 */
0, /* 31 */
0 /* 32 */
};
void
ibcs2_to_bsd_sigset(iss, bss)
const ibcs2_sigset_t *iss;
sigset_t *bss;
{
int i, newsig;
sigemptyset(bss);
for (i = 1; i <= IBCS2_SIGTBLSZ; i++) {
if (ibcs2_sigismember(iss, i)) {
newsig = ibcs2_to_bsd_sig[_SIG_IDX(i)];
if (newsig)
sigaddset(bss, newsig);
}
}
}
static void
bsd_to_ibcs2_sigset(bss, iss)
const sigset_t *bss;
ibcs2_sigset_t *iss;
{
int i, newsig;
ibcs2_sigemptyset(iss);
for (i = 1; i <= IBCS2_SIGTBLSZ; i++) {
if (sigismember(bss, i)) {
newsig = bsd_to_ibcs2_sig[_SIG_IDX(i)];
if (newsig)
ibcs2_sigaddset(iss, newsig);
}
}
}
static void
ibcs2_to_bsd_sigaction(isa, bsa)
struct ibcs2_sigaction *isa;
struct sigaction *bsa;
{
bsa->sa_handler = isa->isa_handler;
ibcs2_to_bsd_sigset(&isa->isa_mask, &bsa->sa_mask);
bsa->sa_flags = 0; /* ??? SA_NODEFER */
if ((isa->isa_flags & IBCS2_SA_NOCLDSTOP) != 0)
bsa->sa_flags |= SA_NOCLDSTOP;
}
static void
bsd_to_ibcs2_sigaction(bsa, isa)
struct sigaction *bsa;
struct ibcs2_sigaction *isa;
{
isa->isa_handler = bsa->sa_handler;
bsd_to_ibcs2_sigset(&bsa->sa_mask, &isa->isa_mask);
isa->isa_flags = 0;
if ((bsa->sa_flags & SA_NOCLDSTOP) != 0)
isa->isa_flags |= IBCS2_SA_NOCLDSTOP;
}
int
ibcs2_sigaction(td, uap)
register struct thread *td;
struct ibcs2_sigaction_args *uap;
{
struct ibcs2_sigaction isa;
struct sigaction nbsa, obsa;
struct sigaction *nbsap;
int error;
if (uap->act != NULL) {
if ((error = copyin(uap->act, &isa, sizeof(isa))) != 0)
return (error);
ibcs2_to_bsd_sigaction(&isa, &nbsa);
nbsap = &nbsa;
} else
nbsap = NULL;
if (uap->sig <= 0 || uap->sig > IBCS2_NSIG)
return (EINVAL);
error = kern_sigaction(td, ibcs2_to_bsd_sig[_SIG_IDX(uap->sig)], &nbsa,
&obsa, 0);
if (error == 0 && uap->oact != NULL) {
bsd_to_ibcs2_sigaction(&obsa, &isa);
error = copyout(&isa, uap->oact, sizeof(isa));
}
return (error);
}
int
ibcs2_sigsys(td, uap)
register struct thread *td;
struct ibcs2_sigsys_args *uap;
{
struct proc *p = td->td_proc;
struct sigaction sa;
int signum = IBCS2_SIGNO(uap->sig);
int error;
if (signum <= 0 || signum > IBCS2_NSIG) {
if (IBCS2_SIGCALL(uap->sig) == IBCS2_SIGNAL_MASK ||
IBCS2_SIGCALL(uap->sig) == IBCS2_SIGSET_MASK)
td->td_retval[0] = (int)IBCS2_SIG_ERR;
return EINVAL;
}
signum = ibcs2_to_bsd_sig[_SIG_IDX(signum)];
switch (IBCS2_SIGCALL(uap->sig)) {
case IBCS2_SIGSET_MASK:
/*
* Check for SIG_HOLD action.
* Otherwise, perform signal() except with different sa_flags.
*/
if (uap->fp != IBCS2_SIG_HOLD) {
/* add sig to mask before executing signal handler */
sa.sa_flags = 0;
goto ibcs2_sigset;
}
/* else FALLTHROUGH to sighold */
case IBCS2_SIGHOLD_MASK:
{
sigset_t mask;
SIGEMPTYSET(mask);
SIGADDSET(mask, signum);
return (kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
0));
}
case IBCS2_SIGNAL_MASK:
{
struct sigaction osa;
/* do not automatically block signal */
sa.sa_flags = SA_NODEFER;
#ifdef SA_RESETHAND
if((signum != IBCS2_SIGILL) &&
(signum != IBCS2_SIGTRAP) &&
(signum != IBCS2_SIGPWR))
/* set to SIG_DFL before executing handler */
sa.sa_flags |= SA_RESETHAND;
#endif
ibcs2_sigset:
sa.sa_handler = uap->fp;
sigemptyset(&sa.sa_mask);
#if 0
if (signum != SIGALRM)
sa.sa_flags |= SA_RESTART;
#endif
error = kern_sigaction(td, signum, &sa, &osa, 0);
if (error != 0) {
DPRINTF(("signal: sigaction failed: %d\n",
error));
td->td_retval[0] = (int)IBCS2_SIG_ERR;
return (error);
}
td->td_retval[0] = (int)osa.sa_handler;
/* special sigset() check */
if(IBCS2_SIGCALL(uap->sig) == IBCS2_SIGSET_MASK) {
PROC_LOCK(p);
/* check to make sure signal is not blocked */
if(sigismember(&td->td_sigmask, signum)) {
/* return SIG_HOLD and unblock signal*/
td->td_retval[0] = (int)IBCS2_SIG_HOLD;
SIGDELSET(td->td_sigmask, signum);
signotify(td);
}
PROC_UNLOCK(p);
}
return 0;
}
case IBCS2_SIGRELSE_MASK:
{
sigset_t mask;
SIGEMPTYSET(mask);
SIGADDSET(mask, signum);
return (kern_sigprocmask(td, SIG_UNBLOCK, &mask, NULL,
0));
}
case IBCS2_SIGIGNORE_MASK:
{
sa.sa_handler = SIG_IGN;
sigemptyset(&sa.sa_mask);
sa.sa_flags = 0;
error = kern_sigaction(td, signum, &sa, NULL, 0);
if (error != 0)
DPRINTF(("sigignore: sigaction failed\n"));
return (error);
}
case IBCS2_SIGPAUSE_MASK:
{
sigset_t mask;
PROC_LOCK(p);
mask = td->td_sigmask;
PROC_UNLOCK(p);
SIGDELSET(mask, signum);
return kern_sigsuspend(td, mask);
}
default:
return ENOSYS;
}
}
int
ibcs2_sigprocmask(td, uap)
register struct thread *td;
struct ibcs2_sigprocmask_args *uap;
{
ibcs2_sigset_t iss;
sigset_t oss, nss;
sigset_t *nssp;
int error, how;
switch (uap->how) {
case IBCS2_SIG_BLOCK:
how = SIG_BLOCK;
break;
case IBCS2_SIG_UNBLOCK:
how = SIG_UNBLOCK;
break;
case IBCS2_SIG_SETMASK:
how = SIG_SETMASK;
break;
default:
return (EINVAL);
}
if (uap->set != NULL) {
if ((error = copyin(uap->set, &iss, sizeof(iss))) != 0)
return error;
ibcs2_to_bsd_sigset(&iss, &nss);
nssp = &nss;
} else
nssp = NULL;
error = kern_sigprocmask(td, how, nssp, &oss, 0);
if (error == 0 && uap->oset != NULL) {
bsd_to_ibcs2_sigset(&oss, &iss);
error = copyout(&iss, uap->oset, sizeof(iss));
}
return (error);
}
int
ibcs2_sigpending(td, uap)
register struct thread *td;
struct ibcs2_sigpending_args *uap;
{
struct proc *p = td->td_proc;
sigset_t bss;
ibcs2_sigset_t iss;
PROC_LOCK(p);
bss = td->td_siglist;
SIGSETOR(bss, p->p_siglist);
SIGSETAND(bss, td->td_sigmask);
PROC_UNLOCK(p);
bsd_to_ibcs2_sigset(&bss, &iss);
return copyout(&iss, uap->mask, sizeof(iss));
}
int
ibcs2_sigsuspend(td, uap)
register struct thread *td;
struct ibcs2_sigsuspend_args *uap;
{
ibcs2_sigset_t sss;
sigset_t bss;
int error;
if ((error = copyin(uap->mask, &sss, sizeof(sss))) != 0)
return error;
ibcs2_to_bsd_sigset(&sss, &bss);
return kern_sigsuspend(td, bss);
}
int
ibcs2_pause(td, uap)
register struct thread *td;
struct ibcs2_pause_args *uap;
{
sigset_t mask;
PROC_LOCK(td->td_proc);
mask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
return kern_sigsuspend(td, mask);
}
int
ibcs2_kill(td, uap)
register struct thread *td;
struct ibcs2_kill_args *uap;
{
struct kill_args ka;
if (uap->signo <= 0 || uap->signo > IBCS2_NSIG)
return (EINVAL);
ka.pid = uap->pid;
ka.signum = ibcs2_to_bsd_sig[_SIG_IDX(uap->signo)];
- return kill(td, &ka);
+ return sys_kill(td, &ka);
}
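/*
 * Example of the table translation above (values read from the
 * bsd_to_ibcs2_sig[] / ibcs2_to_bsd_sig[] arrays):
 *
 *	ka.signum = ibcs2_to_bsd_sig[_SIG_IDX(18)];	// IBCS2_SIGCLD -> SIGCHLD (20)
 *	bsd_to_ibcs2_sig[_SIG_IDX(SIGCHLD)];		// SIGCHLD (20) -> IBCS2_SIGCLD (18)
 *
 * Signals one side lacks (e.g. SIGURG, SIGXCPU) map to 0 and are dropped.
 */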
Index: head/sys/i386/ibcs2/ibcs2_socksys.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_socksys.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_socksys.c (revision 225617)
@@ -1,210 +1,210 @@
/*-
* Copyright (c) 1994, 1995 Scott Bartram
* Copyright (c) 1994 Arne H Juul
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <i386/ibcs2/ibcs2_socksys.h>
#include <i386/ibcs2/ibcs2_util.h>
/* Local structures */
struct getipdomainname_args {
char *ipdomainname;
int len;
};
struct setipdomainname_args {
char *ipdomainname;
int len;
};
/* Local prototypes */
static int ibcs2_getipdomainname(struct thread *,
struct getipdomainname_args *);
static int ibcs2_setipdomainname(struct thread *,
struct setipdomainname_args *);
/*
* iBCS2 socksys calls.
*/
int
ibcs2_socksys(td, uap)
register struct thread *td;
register struct ibcs2_socksys_args *uap;
{
int error;
int realargs[7]; /* 1 for command, 6 for recvfrom */
void *passargs;
/*
* SOCKET should only be legal on /dev/socksys.
* GETIPDOMAINNAME should only be legal on /dev/socksys ?
* The others are (and should be) only legal on sockets.
*/
if ((error = copyin(uap->argsp, (caddr_t)realargs, sizeof(realargs))) != 0)
return error;
DPRINTF(("ibcs2_socksys: %08x %08x %08x %08x %08x %08x %08x\n",
realargs[0], realargs[1], realargs[2], realargs[3],
realargs[4], realargs[5], realargs[6]));
passargs = (void *)(realargs + 1);
switch (realargs[0]) {
case SOCKSYS_ACCEPT:
- return accept(td, passargs);
+ return sys_accept(td, passargs);
case SOCKSYS_BIND:
- return bind(td, passargs);
+ return sys_bind(td, passargs);
case SOCKSYS_CONNECT:
- return connect(td, passargs);
+ return sys_connect(td, passargs);
case SOCKSYS_GETPEERNAME:
- return getpeername(td, passargs);
+ return sys_getpeername(td, passargs);
case SOCKSYS_GETSOCKNAME:
- return getsockname(td, passargs);
+ return sys_getsockname(td, passargs);
case SOCKSYS_GETSOCKOPT:
- return getsockopt(td, passargs);
+ return sys_getsockopt(td, passargs);
case SOCKSYS_LISTEN:
- return listen(td, passargs);
+ return sys_listen(td, passargs);
case SOCKSYS_RECV:
realargs[5] = realargs[6] = 0;
/* FALLTHROUGH */
case SOCKSYS_RECVFROM:
- return recvfrom(td, passargs);
+ return sys_recvfrom(td, passargs);
case SOCKSYS_SEND:
realargs[5] = realargs[6] = 0;
/* FALLTHROUGH */
case SOCKSYS_SENDTO:
- return sendto(td, passargs);
+ return sys_sendto(td, passargs);
case SOCKSYS_SETSOCKOPT:
- return setsockopt(td, passargs);
+ return sys_setsockopt(td, passargs);
case SOCKSYS_SHUTDOWN:
- return shutdown(td, passargs);
+ return sys_shutdown(td, passargs);
case SOCKSYS_SOCKET:
- return socket(td, passargs);
+ return sys_socket(td, passargs);
case SOCKSYS_SELECT:
- return select(td, passargs);
+ return sys_select(td, passargs);
case SOCKSYS_GETIPDOMAIN:
return ibcs2_getipdomainname(td, passargs);
case SOCKSYS_SETIPDOMAIN:
return ibcs2_setipdomainname(td, passargs);
case SOCKSYS_ADJTIME:
- return adjtime(td, passargs);
+ return sys_adjtime(td, passargs);
case SOCKSYS_SETREUID:
- return setreuid(td, passargs);
+ return sys_setreuid(td, passargs);
case SOCKSYS_SETREGID:
- return setregid(td, passargs);
+ return sys_setregid(td, passargs);
case SOCKSYS_GETTIME:
- return gettimeofday(td, passargs);
+ return sys_gettimeofday(td, passargs);
case SOCKSYS_SETTIME:
- return settimeofday(td, passargs);
+ return sys_settimeofday(td, passargs);
case SOCKSYS_GETITIMER:
- return getitimer(td, passargs);
+ return sys_getitimer(td, passargs);
case SOCKSYS_SETITIMER:
- return setitimer(td, passargs);
+ return sys_setitimer(td, passargs);
default:
printf("socksys unknown %08x %08x %08x %08x %08x %08x %08x\n",
realargs[0], realargs[1], realargs[2], realargs[3],
realargs[4], realargs[5], realargs[6]);
return EINVAL;
}
/* NOTREACHED */
}
/* ARGSUSED */
static int
ibcs2_getipdomainname(td, uap)
struct thread *td;
struct getipdomainname_args *uap;
{
char hname[MAXHOSTNAMELEN], *dptr;
int len;
/* Get the domain name. */
getcredhostname(td->td_ucred, hname, sizeof(hname));
dptr = index(hname, '.');
if ( dptr )
dptr++;
else
/* Make it effectively an empty string */
dptr = hname + strlen(hname);
len = strlen(dptr) + 1;
if ((u_int)uap->len > len + 1)
uap->len = len + 1;
return (copyout((caddr_t)dptr, (caddr_t)uap->ipdomainname, uap->len));
}
/* ARGSUSED */
static int
ibcs2_setipdomainname(td, uap)
struct thread *td;
struct setipdomainname_args *uap;
{
char hname[MAXHOSTNAMELEN], *ptr;
int error, sctl[2], hlen;
/* Get the domain name */
getcredhostname(td->td_ucred, hname, sizeof(hname));
/* W/out a hostname a domain-name is nonsense */
if ( strlen(hname) == 0 )
return EINVAL;
/* Get the host's unqualified name (strip off the domain) */
ptr = index(hname, '.');
if ( ptr != NULL ) {
ptr++;
*ptr = '\0';
} else {
if (strlcat(hname, ".", sizeof(hname)) >= sizeof(hname))
return (EINVAL);
}
/* Set ptr to the end of the string so we can append to it */
hlen = strlen(hname);
ptr = hname + hlen;
if ((u_int)uap->len > (sizeof (hname) - hlen - 1))
return EINVAL;
/* Append the ipdomain to the end */
error = copyinstr((caddr_t)uap->ipdomainname, ptr, uap->len, NULL);
if (error)
return (error);
/* 'sethostname' with the new information */
sctl[0] = CTL_KERN;
sctl[1] = KERN_HOSTNAME;
hlen = strlen(hname) + 1;
return (kernel_sysctl(td, sctl, 2, 0, 0, hname, hlen, 0, 0));
}
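/*
 * Illustrative sketch (an assumption, not part of this change): the
 * socksys dispatcher above works because the six words following the
 * command are laid out exactly like the native argument structure of the
 * forwarded syscall.  For SOCKSYS_SOCKET, for instance:
 *
 *	realargs[0] = SOCKSYS_SOCKET;
 *	realargs[1] = AF_INET;		// socket_args.domain
 *	realargs[2] = SOCK_STREAM;	// socket_args.type
 *	realargs[3] = 0;		// socket_args.protocol
 *	...
 *	return (sys_socket(td, (void *)(realargs + 1)));
 */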
Index: head/sys/i386/ibcs2/ibcs2_xenix.c
===================================================================
--- head/sys/i386/ibcs2/ibcs2_xenix.c (revision 225616)
+++ head/sys/i386/ibcs2/ibcs2_xenix.c (revision 225617)
@@ -1,215 +1,215 @@
/*-
* Copyright (c) 1994 Sean Eric Fagan
* Copyright (c) 1994 Søren Schmidt
* Copyright (c) 1995 Steven Wallace
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/sysproto.h>
#include <sys/clock.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/filio.h>
#include <sys/vnode.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/unistd.h>
#include <machine/cpu.h>
#include <i386/ibcs2/ibcs2_types.h>
#include <i386/ibcs2/ibcs2_unistd.h>
#include <i386/ibcs2/ibcs2_signal.h>
#include <i386/ibcs2/ibcs2_util.h>
#include <i386/ibcs2/ibcs2_proto.h>
#include <i386/ibcs2/ibcs2_xenix.h>
#include <i386/ibcs2/ibcs2_xenix_syscall.h>
extern struct sysent xenix_sysent[];
int
ibcs2_xenix(struct thread *td, struct ibcs2_xenix_args *uap)
{
struct trapframe *tf = td->td_frame;
struct sysent *callp;
u_int code;
int error;
code = (tf->tf_eax & 0xff00) >> 8;
callp = &xenix_sysent[code];
if (code < IBCS2_XENIX_MAXSYSCALL)
error = ((*callp->sy_call)(td, (void *)uap));
else
error = ENOSYS;
return (error);
}
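/*
 * Example (derived from the dispatch above): a Xenix sub-call arrives with
 * its number encoded in bits 8-15 of %eax, so for an rdchk() request
 * (tf->tf_eax & 0xff00) >> 8 selects the xenix_sysent[] slot whose sy_call
 * is xenix_rdchk(), which is then invoked with the original uap.
 */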
int
xenix_rdchk(td, uap)
struct thread *td;
struct xenix_rdchk_args *uap;
{
int data, error;
DPRINTF(("IBCS2: 'xenix rdchk'\n"));
error = kern_ioctl(td, uap->fd, FIONREAD, (caddr_t)&data);
if (error)
return (error);
td->td_retval[0] = data ? 1 : 0;
return (0);
}
int
xenix_chsize(td, uap)
struct thread *td;
struct xenix_chsize_args *uap;
{
struct ftruncate_args sa;
DPRINTF(("IBCS2: 'xenix chsize'\n"));
sa.fd = uap->fd;
sa.length = uap->size;
- return ftruncate(td, &sa);
+ return sys_ftruncate(td, &sa);
}
int
xenix_ftime(td, uap)
struct thread *td;
struct xenix_ftime_args *uap;
{
struct timeval tv;
struct ibcs2_timeb {
unsigned long time __packed;
unsigned short millitm;
short timezone;
short dstflag;
} itb;
DPRINTF(("IBCS2: 'xenix ftime'\n"));
microtime(&tv);
itb.time = tv.tv_sec;
itb.millitm = (tv.tv_usec / 1000);
itb.timezone = tz_minuteswest;
itb.dstflag = tz_dsttime != DST_NONE;
return copyout((caddr_t)&itb, (caddr_t)uap->tp,
sizeof(struct ibcs2_timeb));
}
int
xenix_nap(struct thread *td, struct xenix_nap_args *uap)
{
long period;
DPRINTF(("IBCS2: 'xenix nap %d ms'\n", uap->millisec));
period = (long)uap->millisec / (1000/hz);
if (period)
pause("nap", period);
return 0;
}
int
xenix_utsname(struct thread *td, struct xenix_utsname_args *uap)
{
struct ibcs2_sco_utsname {
char sysname[9];
char nodename[9];
char release[16];
char kernelid[20];
char machine[9];
char bustype[9];
char sysserial[10];
unsigned short sysorigin;
unsigned short sysoem;
char numusers[9];
unsigned short numcpu;
} ibcs2_sco_uname;
DPRINTF(("IBCS2: 'xenix sco_utsname'\n"));
bzero(&ibcs2_sco_uname, sizeof(struct ibcs2_sco_utsname));
strncpy(ibcs2_sco_uname.sysname, ostype,
sizeof(ibcs2_sco_uname.sysname) - 1);
getcredhostname(td->td_ucred, ibcs2_sco_uname.nodename,
sizeof(ibcs2_sco_uname.nodename) - 1);
strncpy(ibcs2_sco_uname.release, osrelease,
sizeof(ibcs2_sco_uname.release) - 1);
strncpy(ibcs2_sco_uname.kernelid, version,
sizeof(ibcs2_sco_uname.kernelid) - 1);
strncpy(ibcs2_sco_uname.machine, machine,
sizeof(ibcs2_sco_uname.machine) - 1);
strncpy(ibcs2_sco_uname.bustype, "ISA/EISA",
sizeof(ibcs2_sco_uname.bustype) - 1);
strncpy(ibcs2_sco_uname.sysserial, "no charge",
sizeof(ibcs2_sco_uname.sysserial) - 1);
strncpy(ibcs2_sco_uname.numusers, "unlim",
sizeof(ibcs2_sco_uname.numusers) - 1);
ibcs2_sco_uname.sysorigin = 0xFFFF;
ibcs2_sco_uname.sysoem = 0xFFFF;
ibcs2_sco_uname.numcpu = 1;
return copyout((caddr_t)&ibcs2_sco_uname,
(caddr_t)(void *)(intptr_t)uap->addr,
sizeof(struct ibcs2_sco_utsname));
}
int
xenix_scoinfo(struct thread *td, struct xenix_scoinfo_args *uap)
{
/* scoinfo (not documented) */
td->td_retval[0] = 0;
return 0;
}
int
xenix_eaccess(struct thread *td, struct xenix_eaccess_args *uap)
{
char *path;
int error, bsd_flags;
bsd_flags = 0;
if (uap->flags & IBCS2_R_OK)
bsd_flags |= R_OK;
if (uap->flags & IBCS2_W_OK)
bsd_flags |= W_OK;
if (uap->flags & IBCS2_X_OK)
bsd_flags |= X_OK;
CHECKALTEXIST(td, uap->path, &path);
error = kern_eaccess(td, path, UIO_SYSSPACE, bsd_flags);
free(path, M_TEMP);
return (error);
}
Index: head/sys/i386/linux/linux_machdep.c
===================================================================
--- head/sys/i386/linux/linux_machdep.c (revision 225616)
+++ head/sys/i386/linux/linux_machdep.c (revision 225617)
@@ -1,1120 +1,1120 @@
/*-
* Copyright (c) 2000 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include <sys/sched.h>
#include <machine/frame.h>
#include <machine/psl.h>
#include <machine/segments.h>
#include <machine/sysarch.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <i386/linux/linux.h>
#include <i386/linux/linux_proto.h>
#include <compat/linux/linux_ipc.h>
#include <compat/linux/linux_misc.h>
#include <compat/linux/linux_signal.h>
#include <compat/linux/linux_util.h>
#include <compat/linux/linux_emul.h>
#include <i386/include/pcb.h> /* needed for pcb definition in linux_set_thread_area */
#include "opt_posix.h"
extern struct sysentvec elf32_freebsd_sysvec; /* defined in i386/i386/elf_machdep.c */
struct l_descriptor {
l_uint entry_number;
l_ulong base_addr;
l_uint limit;
l_uint seg_32bit:1;
l_uint contents:2;
l_uint read_exec_only:1;
l_uint limit_in_pages:1;
l_uint seg_not_present:1;
l_uint useable:1;
};
struct l_old_select_argv {
l_int nfds;
l_fd_set *readfds;
l_fd_set *writefds;
l_fd_set *exceptfds;
struct l_timeval *timeout;
};
static int linux_mmap_common(struct thread *td, l_uintptr_t addr,
l_size_t len, l_int prot, l_int flags, l_int fd,
l_loff_t pos);
int
linux_to_bsd_sigaltstack(int lsa)
{
int bsa = 0;
if (lsa & LINUX_SS_DISABLE)
bsa |= SS_DISABLE;
if (lsa & LINUX_SS_ONSTACK)
bsa |= SS_ONSTACK;
return (bsa);
}
int
bsd_to_linux_sigaltstack(int bsa)
{
int lsa = 0;
if (bsa & SS_DISABLE)
lsa |= LINUX_SS_DISABLE;
if (bsa & SS_ONSTACK)
lsa |= LINUX_SS_ONSTACK;
return (lsa);
}
int
linux_execve(struct thread *td, struct linux_execve_args *args)
{
int error;
char *newpath;
struct image_args eargs;
LCONVPATHEXIST(td, args->path, &newpath);
#ifdef DEBUG
if (ldebug(execve))
printf(ARGS(execve, "%s"), newpath);
#endif
error = exec_copyin_args(&eargs, newpath, UIO_SYSSPACE,
args->argp, args->envp);
free(newpath, M_TEMP);
if (error == 0)
error = kern_execve(td, &eargs, NULL);
if (error == 0)
/* A Linux process can exec a FreeBSD one; don't attempt
 * to create emuldata for such a process using
 * linux_proc_init, as this leads to a panic on a KASSERT
 * because such a process has p->p_emuldata == NULL.
 */
if (SV_PROC_ABI(td->td_proc) == SV_ABI_LINUX)
error = linux_proc_init(td, 0, 0);
return (error);
}
struct l_ipc_kludge {
struct l_msgbuf *msgp;
l_long msgtyp;
};
int
linux_ipc(struct thread *td, struct linux_ipc_args *args)
{
switch (args->what & 0xFFFF) {
case LINUX_SEMOP: {
struct linux_semop_args a;
a.semid = args->arg1;
a.tsops = args->ptr;
a.nsops = args->arg2;
return (linux_semop(td, &a));
}
case LINUX_SEMGET: {
struct linux_semget_args a;
a.key = args->arg1;
a.nsems = args->arg2;
a.semflg = args->arg3;
return (linux_semget(td, &a));
}
case LINUX_SEMCTL: {
struct linux_semctl_args a;
int error;
a.semid = args->arg1;
a.semnum = args->arg2;
a.cmd = args->arg3;
error = copyin(args->ptr, &a.arg, sizeof(a.arg));
if (error)
return (error);
return (linux_semctl(td, &a));
}
case LINUX_MSGSND: {
struct linux_msgsnd_args a;
a.msqid = args->arg1;
a.msgp = args->ptr;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
return (linux_msgsnd(td, &a));
}
case LINUX_MSGRCV: {
struct linux_msgrcv_args a;
a.msqid = args->arg1;
a.msgsz = args->arg2;
a.msgflg = args->arg3;
if ((args->what >> 16) == 0) {
struct l_ipc_kludge tmp;
int error;
if (args->ptr == NULL)
return (EINVAL);
error = copyin(args->ptr, &tmp, sizeof(tmp));
if (error)
return (error);
a.msgp = tmp.msgp;
a.msgtyp = tmp.msgtyp;
} else {
a.msgp = args->ptr;
a.msgtyp = args->arg5;
}
return (linux_msgrcv(td, &a));
}
case LINUX_MSGGET: {
struct linux_msgget_args a;
a.key = args->arg1;
a.msgflg = args->arg2;
return (linux_msgget(td, &a));
}
case LINUX_MSGCTL: {
struct linux_msgctl_args a;
a.msqid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_msgctl(td, &a));
}
case LINUX_SHMAT: {
struct linux_shmat_args a;
a.shmid = args->arg1;
a.shmaddr = args->ptr;
a.shmflg = args->arg2;
a.raddr = (l_ulong *)args->arg3;
return (linux_shmat(td, &a));
}
case LINUX_SHMDT: {
struct linux_shmdt_args a;
a.shmaddr = args->ptr;
return (linux_shmdt(td, &a));
}
case LINUX_SHMGET: {
struct linux_shmget_args a;
a.key = args->arg1;
a.size = args->arg2;
a.shmflg = args->arg3;
return (linux_shmget(td, &a));
}
case LINUX_SHMCTL: {
struct linux_shmctl_args a;
a.shmid = args->arg1;
a.cmd = args->arg2;
a.buf = args->ptr;
return (linux_shmctl(td, &a));
}
default:
break;
}
return (EINVAL);
}
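/*
 * Illustrative sketch of how the multiplexed ipc(2) entry point above is
 * reached: a Linux binary performing a semop() would issue roughly
 *
 *	syscall(SYS_ipc, SEMOP, semid, nsops, 0, sops, 0);
 *
 * and the LINUX_SEMOP case then maps that to linux_semop() with
 * a.semid = arg1, a.nsops = arg2 and a.tsops = ptr, exactly as the switch
 * above shows.  The syscall spelling and argument order in this sketch are
 * assumptions for illustration; only the mapping in the switch is taken
 * from the code.
 */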
int
linux_old_select(struct thread *td, struct linux_old_select_args *args)
{
struct l_old_select_argv linux_args;
struct linux_select_args newsel;
int error;
#ifdef DEBUG
if (ldebug(old_select))
printf(ARGS(old_select, "%p"), args->ptr);
#endif
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
newsel.nfds = linux_args.nfds;
newsel.readfds = linux_args.readfds;
newsel.writefds = linux_args.writefds;
newsel.exceptfds = linux_args.exceptfds;
newsel.timeout = linux_args.timeout;
return (linux_select(td, &newsel));
}
int
linux_set_cloned_tls(struct thread *td, void *desc)
{
struct segment_descriptor sd;
struct l_user_desc info;
int idx, error;
int a[2];
error = copyin(desc, &info, sizeof(struct l_user_desc));
if (error) {
printf(LMSG("copyin failed!"));
} else {
idx = info.entry_number;
/*
* looks like we're getting the idx we returned
* in the set_thread_area() syscall
*/
if (idx != 6 && idx != 3) {
printf(LMSG("resetting idx!"));
idx = 3;
}
/* this doesn't happen in practice */
if (idx == 6) {
/* we might copy out the entry_number as 3 */
info.entry_number = 3;
error = copyout(&info, desc, sizeof(struct l_user_desc));
if (error)
printf(LMSG("copyout failed!"));
}
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(clone))
printf("Segment created in clone with "
"CLONE_SETTLS: lobase: %x, hibase: %x, "
"lolimit: %x, hilimit: %x, type: %i, "
"dpl: %i, p: %i, xx: %i, def32: %i, "
"gran: %i\n", sd.sd_lobase, sd.sd_hibase,
sd.sd_lolimit, sd.sd_hilimit, sd.sd_type,
sd.sd_dpl, sd.sd_p, sd.sd_xx,
sd.sd_def32, sd.sd_gran);
#endif
/* set %gs */
td->td_pcb->pcb_gsd = sd;
td->td_pcb->pcb_gs = GSEL(GUGS_SEL, SEL_UPL);
}
return (error);
}
int
linux_set_upcall_kse(struct thread *td, register_t stack)
{
td->td_frame->tf_esp = stack;
return (0);
}
#define STACK_SIZE (2 * 1024 * 1024)
#define GUARD_SIZE (4 * PAGE_SIZE)
int
linux_mmap2(struct thread *td, struct linux_mmap2_args *args)
{
#ifdef DEBUG
if (ldebug(mmap2))
printf(ARGS(mmap2, "%p, %d, %d, 0x%08x, %d, %d"),
(void *)args->addr, args->len, args->prot,
args->flags, args->fd, args->pgoff);
#endif
return (linux_mmap_common(td, args->addr, args->len, args->prot,
args->flags, args->fd, (uint64_t)(uint32_t)args->pgoff *
PAGE_SIZE));
}
int
linux_mmap(struct thread *td, struct linux_mmap_args *args)
{
int error;
struct l_mmap_argv linux_args;
error = copyin(args->ptr, &linux_args, sizeof(linux_args));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(mmap))
printf(ARGS(mmap, "%p, %d, %d, 0x%08x, %d, %d"),
(void *)linux_args.addr, linux_args.len, linux_args.prot,
linux_args.flags, linux_args.fd, linux_args.pgoff);
#endif
return (linux_mmap_common(td, linux_args.addr, linux_args.len,
linux_args.prot, linux_args.flags, linux_args.fd,
(uint32_t)linux_args.pgoff));
}
static int
linux_mmap_common(struct thread *td, l_uintptr_t addr, l_size_t len, l_int prot,
l_int flags, l_int fd, l_loff_t pos)
{
struct proc *p = td->td_proc;
struct mmap_args /* {
caddr_t addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
} */ bsd_args;
int error;
struct file *fp;
error = 0;
bsd_args.flags = 0;
fp = NULL;
/*
* Linux mmap(2):
* You must specify exactly one of MAP_SHARED and MAP_PRIVATE
*/
if (!((flags & LINUX_MAP_SHARED) ^ (flags & LINUX_MAP_PRIVATE)))
return (EINVAL);
if (flags & LINUX_MAP_SHARED)
bsd_args.flags |= MAP_SHARED;
if (flags & LINUX_MAP_PRIVATE)
bsd_args.flags |= MAP_PRIVATE;
if (flags & LINUX_MAP_FIXED)
bsd_args.flags |= MAP_FIXED;
if (flags & LINUX_MAP_ANON) {
/* Enforce pos to be on page boundary, then ignore. */
if ((pos & PAGE_MASK) != 0)
return (EINVAL);
pos = 0;
bsd_args.flags |= MAP_ANON;
} else
bsd_args.flags |= MAP_NOSYNC;
if (flags & LINUX_MAP_GROWSDOWN)
bsd_args.flags |= MAP_STACK;
/*
* PROT_READ, PROT_WRITE, or PROT_EXEC implies PROT_READ and PROT_EXEC
* on Linux/i386. We do this to ensure maximum compatibility.
* Linux/ia64 does the same in i386 emulation mode.
*/
bsd_args.prot = prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
/* Linux does not check file descriptor when MAP_ANONYMOUS is set. */
bsd_args.fd = (bsd_args.flags & MAP_ANON) ? -1 : fd;
if (bsd_args.fd != -1) {
/*
* Linux follows Solaris mmap(2) description:
* The file descriptor fildes is opened with
* read permission, regardless of the
* protection options specified.
*
* Checking just CAP_MMAP is fine here, since the real work
* is done in the FreeBSD mmap().
*/
if ((error = fget(td, bsd_args.fd, CAP_MMAP, &fp)) != 0)
return (error);
if (fp->f_type != DTYPE_VNODE) {
fdrop(fp, td);
return (EINVAL);
}
/* Linux mmap() just fails for O_WRONLY files */
if (!(fp->f_flag & FREAD)) {
fdrop(fp, td);
return (EACCES);
}
fdrop(fp, td);
}
if (flags & LINUX_MAP_GROWSDOWN) {
/*
* The Linux MAP_GROWSDOWN option does not limit auto
* growth of the region. Linux mmap with this option
* takes as addr the initial BOS, and as len, the initial
* region size. It can then grow down from addr without
* limit. However, Linux threads have an implicit internal
* limit on stack size of STACK_SIZE; it's just not
* enforced explicitly in Linux. But here we impose
* a limit of (STACK_SIZE - GUARD_SIZE) on the stack
* region, since we can do this with our mmap.
*
* Our mmap with MAP_STACK takes addr as the maximum
* downsize limit on BOS, and as len the max size of
* the region. It then maps the top SGROWSIZ bytes,
* and auto grows the region down, up to the limit
* in addr.
*
* If we don't use the MAP_STACK option, the effect
* of this code is to allocate a stack region of a
* fixed size of (STACK_SIZE - GUARD_SIZE).
*/
if ((caddr_t)PTRIN(addr) + len > p->p_vmspace->vm_maxsaddr) {
/*
* Some linux apps will attempt to mmap
* thread stacks near the top of their
* address space. If their TOS is greater
* than vm_maxsaddr, vm_map_growstack()
* will confuse the thread stack with the
* process stack and deliver a SEGV if they
* attempt to grow the thread stack past their
* current stacksize rlimit. To avoid this,
* adjust vm_maxsaddr upwards to reflect
* the current stacksize rlimit rather
* than the maximum possible stacksize.
* It would be better to adjust the
* mmap'ed region, but some apps do not check
* mmap's return value.
*/
PROC_LOCK(p);
p->p_vmspace->vm_maxsaddr = (char *)USRSTACK -
lim_cur(p, RLIMIT_STACK);
PROC_UNLOCK(p);
}
/*
* This gives us our maximum stack size and a new BOS.
* If we're using VM_STACK, then mmap will just map
* the top SGROWSIZ bytes, and let the stack grow down
* to the limit at BOS. If we're not using VM_STACK
* we map the full stack, since we don't have a way
* to autogrow it.
*/
if (len > STACK_SIZE - GUARD_SIZE) {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
} else {
bsd_args.addr = (caddr_t)PTRIN(addr) -
(STACK_SIZE - GUARD_SIZE - len);
bsd_args.len = STACK_SIZE - GUARD_SIZE;
}
} else {
bsd_args.addr = (caddr_t)PTRIN(addr);
bsd_args.len = len;
}
bsd_args.pos = pos;
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s(%p, %d, %d, 0x%08x, %d, 0x%x)\n",
__func__,
(void *)bsd_args.addr, bsd_args.len, bsd_args.prot,
bsd_args.flags, bsd_args.fd, (int)bsd_args.pos);
#endif
- error = mmap(td, &bsd_args);
+ error = sys_mmap(td, &bsd_args);
#ifdef DEBUG
if (ldebug(mmap))
printf("-> %s() return: 0x%x (0x%08x)\n",
__func__, error, (u_int)td->td_retval[0]);
#endif
return (error);
}
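/*
 * Worked example of the MAP_GROWSDOWN sizing above (illustrative numbers,
 * assuming the usual i386 PAGE_SIZE of 4096, so that
 * STACK_SIZE - GUARD_SIZE = 2MB - 16KB = 0x1fc000): a thread library
 * mapping a 256KB stack at addr = 0x20000000, len = 0x40000 takes the
 * "else" branch, giving
 *
 *	bsd_args.addr = 0x20000000 - (0x1fc000 - 0x40000) = 0x1fe44000
 *	bsd_args.len  = 0x1fc000
 *
 * so the top of the region (addr + len = 0x20040000) is preserved while
 * the bottom is pushed down, letting MAP_STACK auto-grow the stack within
 * the enlarged range.
 */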
int
linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
{
struct mprotect_args bsd_args;
bsd_args.addr = uap->addr;
bsd_args.len = uap->len;
bsd_args.prot = uap->prot;
if (bsd_args.prot & (PROT_READ | PROT_WRITE | PROT_EXEC))
bsd_args.prot |= PROT_READ | PROT_EXEC;
- return (mprotect(td, &bsd_args));
+ return (sys_mprotect(td, &bsd_args));
}
int
linux_pipe(struct thread *td, struct linux_pipe_args *args)
{
int error;
int fildes[2];
#ifdef DEBUG
if (ldebug(pipe))
printf(ARGS(pipe, "*"));
#endif
error = kern_pipe(td, fildes);
if (error)
return (error);
/* XXX: Close descriptors on error. */
return (copyout(fildes, args->pipefds, sizeof fildes));
}
int
linux_ioperm(struct thread *td, struct linux_ioperm_args *args)
{
int error;
struct i386_ioperm_args iia;
iia.start = args->start;
iia.length = args->length;
iia.enable = args->enable;
error = i386_set_ioperm(td, &iia);
return (error);
}
int
linux_iopl(struct thread *td, struct linux_iopl_args *args)
{
int error;
if (args->level < 0 || args->level > 3)
return (EINVAL);
if ((error = priv_check(td, PRIV_IO)) != 0)
return (error);
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
td->td_frame->tf_eflags = (td->td_frame->tf_eflags & ~PSL_IOPL) |
(args->level * (PSL_IOPL / 3));
return (0);
}
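/*
 * Arithmetic note on the IOPL computation above: PSL_IOPL covers both
 * IOPL bits in %eflags (0x3000), so PSL_IOPL / 3 == 0x1000 and
 * args->level * (PSL_IOPL / 3) places the requested level 0..3 directly
 * into the IOPL field, e.g. level 3 yields 0x3000 and level 1 yields
 * 0x1000.
 */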
int
linux_modify_ldt(struct thread *td, struct linux_modify_ldt_args *uap)
{
int error;
struct i386_ldt_args ldt;
struct l_descriptor ld;
union descriptor desc;
int size, written;
switch (uap->func) {
case 0x00: /* read_ldt */
ldt.start = 0;
ldt.descs = uap->ptr;
ldt.num = uap->bytecount / sizeof(union descriptor);
error = i386_get_ldt(td, &ldt);
td->td_retval[0] *= sizeof(union descriptor);
break;
case 0x02: /* read_default_ldt = 0 */
size = 5*sizeof(struct l_desc_struct);
if (size > uap->bytecount)
size = uap->bytecount;
for (written = error = 0; written < size && error == 0; written++)
error = subyte((char *)uap->ptr + written, 0);
td->td_retval[0] = written;
break;
case 0x01: /* write_ldt */
case 0x11: /* write_ldt */
if (uap->bytecount != sizeof(ld))
return (EINVAL);
error = copyin(uap->ptr, &ld, sizeof(ld));
if (error)
return (error);
ldt.start = ld.entry_number;
ldt.descs = &desc;
ldt.num = 1;
desc.sd.sd_lolimit = (ld.limit & 0x0000ffff);
desc.sd.sd_hilimit = (ld.limit & 0x000f0000) >> 16;
desc.sd.sd_lobase = (ld.base_addr & 0x00ffffff);
desc.sd.sd_hibase = (ld.base_addr & 0xff000000) >> 24;
desc.sd.sd_type = SDT_MEMRO | ((ld.read_exec_only ^ 1) << 1) |
(ld.contents << 2);
desc.sd.sd_dpl = 3;
desc.sd.sd_p = (ld.seg_not_present ^ 1);
desc.sd.sd_xx = 0;
desc.sd.sd_def32 = ld.seg_32bit;
desc.sd.sd_gran = ld.limit_in_pages;
error = i386_set_ldt(td, &ldt, &desc);
break;
default:
error = ENOSYS;
break;
}
if (error == EOPNOTSUPP) {
printf("linux: modify_ldt needs kernel option USER_LDT\n");
error = ENOSYS;
}
return (error);
}
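/*
 * Worked example of the write_ldt packing above (illustrative values): a
 * request with ld.base_addr = 0x12345678, ld.limit = 0xfffff,
 * ld.limit_in_pages = 1, ld.seg_32bit = 1, ld.read_exec_only = 0 and
 * ld.seg_not_present = 0 is split into
 *
 *	sd_lobase = 0x345678, sd_hibase = 0x12,
 *	sd_lolimit = 0xffff,  sd_hilimit = 0xf,
 *	sd_type = SDT_MEMRO | (1 << 1)	(a read/write data segment),
 *	sd_p = 1, sd_def32 = 1, sd_gran = 1
 *
 * which is the standard i386 descriptor layout: a 24+8 bit base, a
 * 16+4 bit limit, and the granularity bit selecting byte vs. page units.
 */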
int
linux_sigaction(struct thread *td, struct linux_sigaction_args *args)
{
l_osigaction_t osa;
l_sigaction_t act, oact;
int error;
#ifdef DEBUG
if (ldebug(sigaction))
printf(ARGS(sigaction, "%d, %p, %p"),
args->sig, (void *)args->nsa, (void *)args->osa);
#endif
if (args->nsa != NULL) {
error = copyin(args->nsa, &osa, sizeof(l_osigaction_t));
if (error)
return (error);
act.lsa_handler = osa.lsa_handler;
act.lsa_flags = osa.lsa_flags;
act.lsa_restorer = osa.lsa_restorer;
LINUX_SIGEMPTYSET(act.lsa_mask);
act.lsa_mask.__bits[0] = osa.lsa_mask;
}
error = linux_do_sigaction(td, args->sig, args->nsa ? &act : NULL,
args->osa ? &oact : NULL);
if (args->osa != NULL && !error) {
osa.lsa_handler = oact.lsa_handler;
osa.lsa_flags = oact.lsa_flags;
osa.lsa_restorer = oact.lsa_restorer;
osa.lsa_mask = oact.lsa_mask.__bits[0];
error = copyout(&osa, args->osa, sizeof(l_osigaction_t));
}
return (error);
}
/*
* Linux has two extra args, restart and oldmask. We don't use these,
* but it seems that "restart" is actually a context pointer that
* enables the signal to happen with a different register set.
*/
int
linux_sigsuspend(struct thread *td, struct linux_sigsuspend_args *args)
{
sigset_t sigmask;
l_sigset_t mask;
#ifdef DEBUG
if (ldebug(sigsuspend))
printf(ARGS(sigsuspend, "%08lx"), (unsigned long)args->mask);
#endif
LINUX_SIGEMPTYSET(mask);
mask.__bits[0] = args->mask;
linux_to_bsd_sigset(&mask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_rt_sigsuspend(struct thread *td, struct linux_rt_sigsuspend_args *uap)
{
l_sigset_t lmask;
sigset_t sigmask;
int error;
#ifdef DEBUG
if (ldebug(rt_sigsuspend))
printf(ARGS(rt_sigsuspend, "%p, %d"),
(void *)uap->newset, uap->sigsetsize);
#endif
if (uap->sigsetsize != sizeof(l_sigset_t))
return (EINVAL);
error = copyin(uap->newset, &lmask, sizeof(l_sigset_t));
if (error)
return (error);
linux_to_bsd_sigset(&lmask, &sigmask);
return (kern_sigsuspend(td, sigmask));
}
int
linux_pause(struct thread *td, struct linux_pause_args *args)
{
struct proc *p = td->td_proc;
sigset_t sigmask;
#ifdef DEBUG
if (ldebug(pause))
printf(ARGS(pause, ""));
#endif
PROC_LOCK(p);
sigmask = td->td_sigmask;
PROC_UNLOCK(p);
return (kern_sigsuspend(td, sigmask));
}
int
linux_sigaltstack(struct thread *td, struct linux_sigaltstack_args *uap)
{
stack_t ss, oss;
l_stack_t lss;
int error;
#ifdef DEBUG
if (ldebug(sigaltstack))
printf(ARGS(sigaltstack, "%p, %p"), uap->uss, uap->uoss);
#endif
if (uap->uss != NULL) {
error = copyin(uap->uss, &lss, sizeof(l_stack_t));
if (error)
return (error);
ss.ss_sp = lss.ss_sp;
ss.ss_size = lss.ss_size;
ss.ss_flags = linux_to_bsd_sigaltstack(lss.ss_flags);
}
error = kern_sigaltstack(td, (uap->uss != NULL) ? &ss : NULL,
(uap->uoss != NULL) ? &oss : NULL);
if (!error && uap->uoss != NULL) {
lss.ss_sp = oss.ss_sp;
lss.ss_size = oss.ss_size;
lss.ss_flags = bsd_to_linux_sigaltstack(oss.ss_flags);
error = copyout(&lss, uap->uoss, sizeof(l_stack_t));
}
return (error);
}
int
linux_ftruncate64(struct thread *td, struct linux_ftruncate64_args *args)
{
struct ftruncate_args sa;
#ifdef DEBUG
if (ldebug(ftruncate64))
printf(ARGS(ftruncate64, "%u, %jd"), args->fd,
(intmax_t)args->length);
#endif
sa.fd = args->fd;
sa.length = args->length;
- return ftruncate(td, &sa);
+ return sys_ftruncate(td, &sa);
}
int
linux_set_thread_area(struct thread *td, struct linux_set_thread_area_args *args)
{
struct l_user_desc info;
int error;
int idx;
int a[2];
struct segment_descriptor sd;
error = copyin(args->desc, &info, sizeof(struct l_user_desc));
if (error)
return (error);
#ifdef DEBUG
if (ldebug(set_thread_area))
printf(ARGS(set_thread_area, "%i, %x, %x, %i, %i, %i, %i, %i, %i\n"),
info.entry_number,
info.base_addr,
info.limit,
info.seg_32bit,
info.contents,
info.read_exec_only,
info.limit_in_pages,
info.seg_not_present,
info.useable);
#endif
idx = info.entry_number;
/*
* Semantics of linux version: every thread in the system has array of
* 3 tls descriptors. 1st is GLIBC TLS, 2nd is WINE, 3rd unknown. This
* syscall loads the selected tls descriptor with a value and
* also loads GDT descriptors 6, 7 and 8 with the content of the
* per-thread descriptors.
*
* Semantics of fbsd version: I think we can ignore that linux has 3
* per-thread descriptors and use just the 1st one. The tls_array[]
* is used only in set/get-thread_area() syscalls and for loading the
* GDT descriptors. In fbsd we use just one GDT descriptor for TLS so
* we will load just one.
*
* XXX: this doesn't work when a user space process tries to use more
* than 1 TLS segment. Comment in the linux sources says wine might do
* this.
*/
/*
* we support just GLIBC TLS for now
* we should also let 3 proceed, because we use this segment, so
* if code makes two subsequent calls it should succeed
*/
if (idx != 6 && idx != -1 && idx != 3)
return (EINVAL);
/*
* we have to copy out the GDT entry we use
* FreeBSD uses GDT entry #3 for storing %gs so load that
*
* XXX: what if a user space program doesn't check this value and tries
* to use 6, 7 or 8?
*/
idx = info.entry_number = 3;
error = copyout(&info, args->desc, sizeof(struct l_user_desc));
if (error)
return (error);
if (LINUX_LDT_empty(&info)) {
a[0] = 0;
a[1] = 0;
} else {
a[0] = LINUX_LDT_entry_a(&info);
a[1] = LINUX_LDT_entry_b(&info);
}
memcpy(&sd, &a, sizeof(a));
#ifdef DEBUG
if (ldebug(set_thread_area))
printf("Segment created in set_thread_area: lobase: %x, hibase: %x, lolimit: %x, hilimit: %x, type: %i, dpl: %i, p: %i, xx: %i, def32: %i, gran: %i\n", sd.sd_lobase,
sd.sd_hibase,
sd.sd_lolimit,
sd.sd_hilimit,
sd.sd_type,
sd.sd_dpl,
sd.sd_p,
sd.sd_xx,
sd.sd_def32,
sd.sd_gran);
#endif
/* this is taken from i386 version of cpu_set_user_tls() */
critical_enter();
/* set %gs */
td->td_pcb->pcb_gsd = sd;
PCPU_GET(fsgs_gdt)[1] = sd;
load_gs(GSEL(GUGS_SEL, SEL_UPL));
critical_exit();
return (0);
}
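/*
 * Usage sketch (hypothetical user space, to illustrate the semantics
 * described above): a Linux libc setting up TLS typically does roughly
 *
 *	struct user_desc d;
 *	memset(&d, 0, sizeof(d));
 *	d.entry_number = -1;		(ask the kernel to pick a slot)
 *	d.base_addr = (unsigned)tcb;
 *	d.limit = 0xfffff;
 *	d.seg_32bit = 1; d.limit_in_pages = 1; d.useable = 1;
 *	set_thread_area(&d);
 *	asm volatile("movw %w0, %%gs" :: "q"(d.entry_number * 8 + 3));
 *
 * Since entry_number is forced to 3 and copied back above, the selector
 * loaded into %gs is 3 * 8 + 3 = 0x1b, i.e. GSEL(GUGS_SEL, SEL_UPL),
 * matching the value installed by the cpu_set_user_tls()-style code at
 * the end of the function.  The libc call sequence and struct layout are
 * assumptions for illustration; only the idx/copyout handling in the
 * function itself is taken from the code.
 */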
int
linux_get_thread_area(struct thread *td, struct linux_get_thread_area_args *args)
{
struct l_user_desc info;
int error;
int idx;
struct l_desc_struct desc;
struct segment_descriptor sd;
#ifdef DEBUG
if (ldebug(get_thread_area))
printf(ARGS(get_thread_area, "%p"), args->desc);
#endif
error = copyin(args->desc, &info, sizeof(struct l_user_desc));
if (error)
return (error);
idx = info.entry_number;
/* XXX: I am not sure if we want 3 to be allowed too. */
if (idx != 6 && idx != 3)
return (EINVAL);
idx = 3;
memset(&info, 0, sizeof(info));
sd = PCPU_GET(fsgs_gdt)[1];
memcpy(&desc, &sd, sizeof(desc));
info.entry_number = idx;
info.base_addr = LINUX_GET_BASE(&desc);
info.limit = LINUX_GET_LIMIT(&desc);
info.seg_32bit = LINUX_GET_32BIT(&desc);
info.contents = LINUX_GET_CONTENTS(&desc);
info.read_exec_only = !LINUX_GET_WRITABLE(&desc);
info.limit_in_pages = LINUX_GET_LIMIT_PAGES(&desc);
info.seg_not_present = !LINUX_GET_PRESENT(&desc);
info.useable = LINUX_GET_USEABLE(&desc);
error = copyout(&info, args->desc, sizeof(struct l_user_desc));
if (error)
return (EFAULT);
return (0);
}
/* copied from kern/kern_time.c */
int
linux_timer_create(struct thread *td, struct linux_timer_create_args *args)
{
- return ktimer_create(td, (struct ktimer_create_args *) args);
+ return sys_ktimer_create(td, (struct ktimer_create_args *) args);
}
int
linux_timer_settime(struct thread *td, struct linux_timer_settime_args *args)
{
- return ktimer_settime(td, (struct ktimer_settime_args *) args);
+ return sys_ktimer_settime(td, (struct ktimer_settime_args *) args);
}
int
linux_timer_gettime(struct thread *td, struct linux_timer_gettime_args *args)
{
- return ktimer_gettime(td, (struct ktimer_gettime_args *) args);
+ return sys_ktimer_gettime(td, (struct ktimer_gettime_args *) args);
}
int
linux_timer_getoverrun(struct thread *td, struct linux_timer_getoverrun_args *args)
{
- return ktimer_getoverrun(td, (struct ktimer_getoverrun_args *) args);
+ return sys_ktimer_getoverrun(td, (struct ktimer_getoverrun_args *) args);
}
int
linux_timer_delete(struct thread *td, struct linux_timer_delete_args *args)
{
- return ktimer_delete(td, (struct ktimer_delete_args *) args);
+ return sys_ktimer_delete(td, (struct ktimer_delete_args *) args);
}
/* XXX: this won't work with a module - convert it */
int
linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_open(td, (struct kmq_open_args *) args);
+ return sys_kmq_open(td, (struct kmq_open_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_unlink(td, (struct kmq_unlink_args *) args);
+ return sys_kmq_unlink(td, (struct kmq_unlink_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_timedsend(td, (struct kmq_timedsend_args *) args);
+ return sys_kmq_timedsend(td, (struct kmq_timedsend_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
+ return sys_kmq_timedreceive(td, (struct kmq_timedreceive_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_notify(td, (struct kmq_notify_args *) args);
+ return sys_kmq_notify(td, (struct kmq_notify_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
{
#ifdef P1003_1B_MQUEUE
- return kmq_setattr(td, (struct kmq_setattr_args *) args);
+ return sys_kmq_setattr(td, (struct kmq_setattr_args *) args);
#else
return (ENOSYS);
#endif
}
int
linux_wait4(struct thread *td, struct linux_wait4_args *args)
{
int error, options;
struct rusage ru, *rup;
#ifdef DEBUG
if (ldebug(wait4))
printf(ARGS(wait4, "%d, %p, %d, %p"),
args->pid, (void *)args->status, args->options,
(void *)args->rusage);
#endif
options = (args->options & (WNOHANG | WUNTRACED));
/* WLINUXCLONE should be equal to __WCLONE, but we map it explicitly to be sure */
if (args->options & __WCLONE)
options |= WLINUXCLONE;
if (args->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = linux_common_wait(td, args->pid, args->status, options, rup);
if (error)
return (error);
if (args->rusage != NULL)
error = copyout(&ru, args->rusage, sizeof(ru));
return (error);
}
Index: head/sys/ia64/ia32/ia32_signal.c
===================================================================
--- head/sys/ia64/ia32/ia32_signal.c (revision 225616)
+++ head/sys/ia64/ia32/ia32_signal.c (revision 225617)
@@ -1,298 +1,298 @@
/*-
* Copyright (c) 2002 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#define __ELF_WORD_SIZE 32
#include <sys/param.h>
#include <sys/exec.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mman.h>
#include <sys/namei.h>
#include <sys/pioctl.h>
#include <sys/proc.h>
#include <sys/procfs.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/imgact_elf.h>
#include <sys/sysproto.h>
#include <machine/frame.h>
#include <machine/md_var.h>
#include <machine/pcb.h>
#include <vm/vm.h>
#include <vm/vm_kern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/ia32/ia32_signal.h>
#include <i386/include/psl.h>
#include <i386/include/segments.h>
#include <i386/include/specialreg.h>
char ia32_sigcode[] = {
0xff, 0x54, 0x24, 0x10, /* call *SIGF_HANDLER(%esp) */
0x8d, 0x44, 0x24, 0x14, /* lea SIGF_UC(%esp),%eax */
0x50, /* pushl %eax */
0xf7, 0x40, 0x54, 0x00, 0x00, 0x02, 0x02, /* testl $PSL_VM,UC_EFLAGS(%eax) */
0x75, 0x03, /* jne 9f */
0x8e, 0x68, 0x14, /* movl UC_GS(%eax),%gs */
0xb8, 0x57, 0x01, 0x00, 0x00, /* 9: movl $SYS_sigreturn,%eax */
0x50, /* pushl %eax */
0xcd, 0x80, /* int $0x80 */
0xeb, 0xfe, /* 0: jmp 0b */
0
};
int sz_ia32_sigcode = sizeof(ia32_sigcode);
#ifdef COMPAT_43
int
ofreebsd32_sigreturn(struct thread *td, struct ofreebsd32_sigreturn_args *uap)
{
return (EOPNOTSUPP);
}
#endif
/*
* Signal sending has not been implemented on ia64. This causes
* the sigtramp code to not understand the arguments and the application
* will generally crash if it tries to handle a signal. Calling
* sendsig() means that at least untrapped signals will work.
*/
void
ia32_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
sendsig(catcher, ksi, mask);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_freebsd32_sigreturn(struct thread *td, struct freebsd4_freebsd32_sigreturn_args *uap)
{
- return (sigreturn(td, (struct sigreturn_args *)uap));
+ return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
#endif
int
freebsd32_sigreturn(struct thread *td, struct freebsd32_sigreturn_args *uap)
{
- return (sigreturn(td, (struct sigreturn_args *)uap));
+ return (sys_sigreturn(td, (struct sigreturn_args *)uap));
}
void
ia32_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf = td->td_frame;
vm_offset_t gdt, ldt;
u_int64_t codesel, datasel, ldtsel;
u_int64_t codeseg, dataseg, gdtseg, ldtseg;
struct segment_descriptor desc;
struct vmspace *vmspace = td->td_proc->p_vmspace;
struct sysentvec *sv;
sv = td->td_proc->p_sysent;
exec_setregs(td, imgp, stack);
/* Non-syscall frames are cleared by exec_setregs() */
if (tf->tf_flags & FRAME_SYSCALL) {
bzero(&tf->tf_scratch, sizeof(tf->tf_scratch));
bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp));
} else
tf->tf_special.ndirty = 0;
tf->tf_special.psr |= IA64_PSR_IS;
tf->tf_special.sp = stack;
/* Point the RSE backstore to something harmless. */
tf->tf_special.bspstore = (sv->sv_psstrings - sz_ia32_sigcode -
SPARE_USRSPACE + 15) & ~15;
codesel = LSEL(LUCODE_SEL, SEL_UPL);
datasel = LSEL(LUDATA_SEL, SEL_UPL);
ldtsel = GSEL(GLDT_SEL, SEL_UPL);
/* Setup ia32 segment registers. */
tf->tf_scratch.gr16 = (datasel << 48) | (datasel << 32) |
(datasel << 16) | datasel;
tf->tf_scratch.gr17 = (ldtsel << 32) | (datasel << 16) | codesel;
/*
* Build the GDT and LDT.
*/
gdt = sv->sv_usrstack;
vm_map_find(&vmspace->vm_map, 0, 0, &gdt, IA32_PAGE_SIZE << 1, 0,
VM_PROT_ALL, VM_PROT_ALL, 0);
ldt = gdt + IA32_PAGE_SIZE;
desc.sd_lolimit = 8*NLDT-1;
desc.sd_lobase = ldt & 0xffffff;
desc.sd_type = SDT_SYSLDT;
desc.sd_dpl = SEL_UPL;
desc.sd_p = 1;
desc.sd_hilimit = 0;
desc.sd_def32 = 0;
desc.sd_gran = 0;
desc.sd_hibase = ldt >> 24;
copyout(&desc, (caddr_t) gdt + 8*GLDT_SEL, sizeof(desc));
desc.sd_lolimit = ((sv->sv_usrstack >> 12) - 1) & 0xffff;
desc.sd_lobase = 0;
desc.sd_type = SDT_MEMERA;
desc.sd_dpl = SEL_UPL;
desc.sd_p = 1;
desc.sd_hilimit = ((sv->sv_usrstack >> 12) - 1) >> 16;
desc.sd_def32 = 1;
desc.sd_gran = 1;
desc.sd_hibase = 0;
copyout(&desc, (caddr_t) ldt + 8*LUCODE_SEL, sizeof(desc));
desc.sd_type = SDT_MEMRWA;
copyout(&desc, (caddr_t) ldt + 8*LUDATA_SEL, sizeof(desc));
codeseg = 0 /* base */
+ (((sv->sv_usrstack >> 12) - 1) << 32) /* limit */
+ ((long)SDT_MEMERA << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (1L << 62) /* 32 bits */
+ (1L << 63); /* page granularity */
dataseg = 0 /* base */
+ (((sv->sv_usrstack >> 12) - 1) << 32) /* limit */
+ ((long)SDT_MEMRWA << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (1L << 62) /* 32 bits */
+ (1L << 63); /* page granularity */
tf->tf_scratch.csd = codeseg;
tf->tf_scratch.ssd = dataseg;
tf->tf_scratch.gr24 = dataseg; /* ESD */
tf->tf_scratch.gr27 = dataseg; /* DSD */
tf->tf_scratch.gr28 = dataseg; /* FSD */
tf->tf_scratch.gr29 = dataseg; /* GSD */
gdtseg = gdt /* base */
+ ((8L*NGDT - 1) << 32) /* limit */
+ ((long)SDT_SYSNULL << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (0L << 62) /* 16 bits */
+ (0L << 63); /* byte granularity */
ldtseg = ldt /* base */
+ ((8L*NLDT - 1) << 32) /* limit */
+ ((long)SDT_SYSLDT << 52)
+ ((long)SEL_UPL << 57)
+ (1L << 59) /* present */
+ (0L << 62) /* 16 bits */
+ (0L << 63); /* byte granularity */
tf->tf_scratch.gr30 = ldtseg; /* LDTD */
tf->tf_scratch.gr31 = gdtseg; /* GDTD */
/* Set ia32 control registers on this processor. */
ia64_set_cflg(CR0_PE | CR0_PG | ((long)(CR4_XMM | CR4_FXSR) << 32));
ia64_set_eflag(PSL_USER);
/* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
tf->tf_scratch.gr11 = td->td_proc->p_sysent->sv_psstrings;
/*
* XXX - Linux emulator
* Make sure edx is 0x0 on entry. Linux binaries depend
* on it.
*/
td->td_retval[1] = 0;
}
void
ia32_restorectx(struct pcb *pcb)
{
ia64_set_cflg(pcb->pcb_ia32_cflg);
ia64_set_eflag(pcb->pcb_ia32_eflag);
ia64_set_fcr(pcb->pcb_ia32_fcr);
ia64_set_fdr(pcb->pcb_ia32_fdr);
ia64_set_fir(pcb->pcb_ia32_fir);
ia64_set_fsr(pcb->pcb_ia32_fsr);
}
void
ia32_savectx(struct pcb *pcb)
{
pcb->pcb_ia32_cflg = ia64_get_cflg();
pcb->pcb_ia32_eflag = ia64_get_eflag();
pcb->pcb_ia32_fcr = ia64_get_fcr();
pcb->pcb_ia32_fdr = ia64_get_fdr();
pcb->pcb_ia32_fir = ia64_get_fir();
pcb->pcb_ia32_fsr = ia64_get_fsr();
}
int
freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
{
return (nosys(td, NULL));
}
int
freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
{
return (nosys(td, NULL));
}
int
freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
{
return (nosys(td, NULL));
}
Index: head/sys/ia64/ia64/machdep.c
===================================================================
--- head/sys/ia64/ia64/machdep.c (revision 225616)
+++ head/sys/ia64/ia64/machdep.c (revision 225617)
@@ -1,1586 +1,1586 @@
/*-
* Copyright (c) 2003,2004 Marcel Moolenaar
* Copyright (c) 2000,2001 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/msgbuf.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/random.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/uio.h>
#include <sys/uuid.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <ddb/ddb.h>
#include <net/netisr.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <machine/bootinfo.h>
#include <machine/cpu.h>
#include <machine/efi.h>
#include <machine/elf.h>
#include <machine/fpu.h>
#include <machine/intr.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/pal.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/sal.h>
#include <machine/sigframe.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#include <machine/unwind.h>
#include <machine/vmparam.h>
SYSCTL_NODE(_hw, OID_AUTO, freq, CTLFLAG_RD, 0, "");
SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RD, 0, "");
static u_int bus_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, bus, CTLFLAG_RD, &bus_freq, 0,
"Bus clock frequency");
static u_int cpu_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, cpu, CTLFLAG_RD, &cpu_freq, 0,
"CPU clock frequency");
static u_int itc_freq;
SYSCTL_UINT(_hw_freq, OID_AUTO, itc, CTLFLAG_RD, &itc_freq, 0,
"ITC frequency");
int cold = 1;
struct bootinfo *bootinfo;
struct pcpu pcpu0;
extern u_int64_t kernel_text[], _end[];
extern u_int64_t ia64_gateway_page[];
extern u_int64_t break_sigtramp[];
extern u_int64_t epc_sigtramp[];
struct fpswa_iface *fpswa_iface;
vm_size_t ia64_pal_size;
vm_paddr_t ia64_pal_base;
vm_offset_t ia64_port_base;
u_int64_t ia64_lapic_addr = PAL_PIB_DEFAULT_ADDR;
struct ia64_pib *ia64_pib;
static int ia64_sync_icache_needed;
char machine[] = MACHINE;
SYSCTL_STRING(_hw, HW_MACHINE, machine, CTLFLAG_RD, machine, 0, "");
static char cpu_model[64];
SYSCTL_STRING(_hw, HW_MODEL, model, CTLFLAG_RD, cpu_model, 0,
"The CPU model name");
static char cpu_family[64];
SYSCTL_STRING(_hw, OID_AUTO, family, CTLFLAG_RD, cpu_family, 0,
"The CPU family name");
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
struct msgbuf *msgbufp = NULL;
/* Other subsystems (e.g., ACPI) can hook this later. */
void (*cpu_idle_hook)(void) = NULL;
long Maxmem = 0;
long realmem = 0;
#define PHYSMAP_SIZE (2 * VM_PHYSSEG_MAX)
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so that a terminating 0/0 pair can signal the end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(vm_offset_t)) - 2)
struct kva_md_info kmi;
#define Mhz 1000000L
#define Ghz (1000L*Mhz)
static void
identifycpu(void)
{
char vendor[17];
char *family_name, *model_name;
u_int64_t features, tmp;
int number, revision, model, family, archrev;
/*
* Assumes little-endian.
*/
*(u_int64_t *) &vendor[0] = ia64_get_cpuid(0);
*(u_int64_t *) &vendor[8] = ia64_get_cpuid(1);
vendor[16] = '\0';
tmp = ia64_get_cpuid(3);
number = (tmp >> 0) & 0xff;
revision = (tmp >> 8) & 0xff;
model = (tmp >> 16) & 0xff;
family = (tmp >> 24) & 0xff;
archrev = (tmp >> 32) & 0xff;
family_name = model_name = "unknown";
switch (family) {
case 0x07:
family_name = "Itanium";
model_name = "Merced";
break;
case 0x1f:
family_name = "Itanium 2";
switch (model) {
case 0x00:
model_name = "McKinley";
break;
case 0x01:
/*
* Deerfield is a low-voltage variant based on the
* Madison core. We need circumstantial evidence
* (i.e. the clock frequency) to identify those.
* Allow for roughly 1% error margin.
*/
if (cpu_freq > 990 && cpu_freq < 1010)
model_name = "Deerfield";
else
model_name = "Madison";
break;
case 0x02:
model_name = "Madison II";
break;
}
break;
case 0x20:
ia64_sync_icache_needed = 1;
family_name = "Itanium 2";
switch (model) {
case 0x00:
model_name = "Montecito";
break;
case 0x01:
model_name = "Montvale";
break;
}
break;
}
snprintf(cpu_family, sizeof(cpu_family), "%s", family_name);
snprintf(cpu_model, sizeof(cpu_model), "%s", model_name);
features = ia64_get_cpuid(4);
printf("CPU: %s (", model_name);
if (cpu_freq)
printf("%u Mhz ", cpu_freq);
printf("%s)\n", family_name);
printf(" Origin = \"%s\" Revision = %d\n", vendor, revision);
printf(" Features = 0x%b\n", (u_int32_t) features,
"\020"
"\001LB" /* long branch (brl) instruction. */
"\002SD" /* Spontaneous deferral. */
"\003AO" /* 16-byte atomic operations (ld, st, cmpxchg). */ );
}
static void
cpu_startup(void *dummy)
{
char nodename[16];
struct pcpu *pc;
struct pcpu_stats *pcs;
/*
* Good {morning,afternoon,evening,night}.
*/
identifycpu();
#ifdef PERFMON
perfmon_init();
#endif
printf("real memory = %ld (%ld MB)\n", ia64_ptob(Maxmem),
ia64_ptob(Maxmem) / 1048576);
realmem = Maxmem;
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
long size1 = phys_avail[indx + 1] - phys_avail[indx];
printf("0x%08lx - 0x%08lx, %ld bytes (%ld pages)\n",
phys_avail[indx], phys_avail[indx + 1] - 1, size1,
size1 >> PAGE_SHIFT);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ld (%ld MB)\n", ptoa(cnt.v_free_count),
ptoa(cnt.v_free_count) / 1048576);
if (fpswa_iface == NULL)
printf("Warning: no FPSWA package supplied\n");
else
printf("FPSWA Revision = 0x%lx, Entry = %p\n",
(long)fpswa_iface->if_rev, (void *)fpswa_iface->if_fpswa);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
/*
* Traverse the MADT to discover IOSAPIC and Local SAPIC
* information.
*/
ia64_probe_sapics();
ia64_pib = pmap_mapdev(ia64_lapic_addr, sizeof(*ia64_pib));
ia64_mca_init();
/*
* Create sysctl tree for per-CPU information.
*/
STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
snprintf(nodename, sizeof(nodename), "%u", pc->pc_cpuid);
sysctl_ctx_init(&pc->pc_md.sysctl_ctx);
pc->pc_md.sysctl_tree = SYSCTL_ADD_NODE(&pc->pc_md.sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_machdep_cpu), OID_AUTO, nodename,
CTLFLAG_RD, NULL, "");
if (pc->pc_md.sysctl_tree == NULL)
continue;
pcs = &pc->pc_md.stats;
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nasts", CTLFLAG_RD, &pcs->pcs_nasts,
"Number of IPI_AST interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nclks", CTLFLAG_RD, &pcs->pcs_nclks,
"Number of clock interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nextints", CTLFLAG_RD, &pcs->pcs_nextints,
"Number of ExtINT interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nhardclocks", CTLFLAG_RD, &pcs->pcs_nhardclocks,
"Number of IPI_HARDCLOCK interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nhighfps", CTLFLAG_RD, &pcs->pcs_nhighfps,
"Number of IPI_HIGH_FP interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nhwints", CTLFLAG_RD, &pcs->pcs_nhwints,
"Number of hardware (device) interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"npreempts", CTLFLAG_RD, &pcs->pcs_npreempts,
"Number of IPI_PREEMPT interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nrdvs", CTLFLAG_RD, &pcs->pcs_nrdvs,
"Number of IPI_RENDEZVOUS interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nstops", CTLFLAG_RD, &pcs->pcs_nstops,
"Number of IPI_STOP interrupts");
SYSCTL_ADD_ULONG(&pc->pc_md.sysctl_ctx,
SYSCTL_CHILDREN(pc->pc_md.sysctl_tree), OID_AUTO,
"nstrays", CTLFLAG_RD, &pcs->pcs_nstrays,
"Number of stray interrupts");
}
}
SYSINIT(cpu_startup, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
void
cpu_flush_dcache(void *ptr, size_t len)
{
vm_offset_t lim, va;
va = (uintptr_t)ptr & ~31;
lim = (uintptr_t)ptr + len;
while (va < lim) {
ia64_fc(va);
va += 32;
}
ia64_srlz_d();
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
*rate = (u_long)cpu_freq * 1000000ul;
return (0);
}
void
cpu_halt()
{
efi_reset_system();
}
void
cpu_idle(int busy)
{
register_t ie;
if (!busy) {
critical_enter();
cpu_idleclock();
}
ie = intr_disable();
KASSERT(ie != 0, ("%s called with interrupts disabled\n", __func__));
if (sched_runnable())
ia64_enable_intr();
else if (cpu_idle_hook != NULL) {
(*cpu_idle_hook)();
/* The hook must enable interrupts! */
} else {
ia64_call_pal_static(PAL_HALT_LIGHT, 0, 0, 0);
ia64_enable_intr();
}
if (!busy) {
cpu_activeclock();
critical_exit();
}
}
int
cpu_idle_wakeup(int cpu)
{
return (0);
}
void
cpu_reset()
{
efi_reset_system();
}
void
cpu_switch(struct thread *old, struct thread *new, struct mtx *mtx)
{
struct pcb *oldpcb, *newpcb;
oldpcb = old->td_pcb;
#ifdef COMPAT_FREEBSD32
ia32_savectx(oldpcb);
#endif
if (PCPU_GET(fpcurthread) == old)
old->td_frame->tf_special.psr |= IA64_PSR_DFH;
if (!savectx(oldpcb)) {
newpcb = new->td_pcb;
oldpcb->pcb_current_pmap =
pmap_switch(newpcb->pcb_current_pmap);
atomic_store_rel_ptr(&old->td_lock, mtx);
#if defined(SCHED_ULE) && defined(SMP)
while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
cpu_spinwait();
#endif
PCPU_SET(curthread, new);
#ifdef COMPAT_FREEBSD32
ia32_restorectx(newpcb);
#endif
if (PCPU_GET(fpcurthread) == new)
new->td_frame->tf_special.psr &= ~IA64_PSR_DFH;
restorectx(newpcb);
/* We should not get here. */
panic("cpu_switch: restorectx() returned");
/* NOTREACHED */
}
}
void
cpu_throw(struct thread *old __unused, struct thread *new)
{
struct pcb *newpcb;
newpcb = new->td_pcb;
(void)pmap_switch(newpcb->pcb_current_pmap);
#if defined(SCHED_ULE) && defined(SMP)
while (atomic_load_acq_ptr(&new->td_lock) == &blocked_lock)
cpu_spinwait();
#endif
PCPU_SET(curthread, new);
#ifdef COMPAT_FREEBSD32
ia32_restorectx(newpcb);
#endif
restorectx(newpcb);
/* We should not get here. */
panic("cpu_throw: restorectx() returned");
/* NOTREACHED */
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
/*
* Set pc_acpi_id to "uninitialized".
* See sys/dev/acpica/acpi_cpu.c
*/
pcpu->pc_acpi_id = 0xffffffff;
}
void
spinlock_enter(void)
{
struct thread *td;
int intr;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
intr = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_intr = intr;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
int intr;
td = curthread;
critical_exit();
intr = td->td_md.md_saved_intr;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(intr);
}
void
map_vhpt(uintptr_t vhpt)
{
pt_entry_t pte;
uint64_t psr;
pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
PTE_PL_KERN | PTE_AR_RW;
pte |= vhpt & PTE_PPN_MASK;
__asm __volatile("ptr.d %0,%1" :: "r"(vhpt),
"r"(pmap_vhpt_log2size << 2));
__asm __volatile("mov %0=psr" : "=r"(psr));
__asm __volatile("rsm psr.ic|psr.i");
ia64_srlz_i();
ia64_set_ifa(vhpt);
ia64_set_itir(pmap_vhpt_log2size << 2);
ia64_srlz_d();
__asm __volatile("itr.d dtr[%0]=%1" :: "r"(3), "r"(pte));
__asm __volatile("mov psr.l=%0" :: "r" (psr));
ia64_srlz_i();
}
void
map_pal_code(void)
{
pt_entry_t pte;
vm_offset_t va;
vm_size_t sz;
uint64_t psr;
u_int shft;
if (ia64_pal_size == 0)
return;
va = IA64_PHYS_TO_RR7(ia64_pal_base);
sz = ia64_pal_size;
shft = 0;
while (sz > 1) {
shft++;
sz >>= 1;
}
pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
PTE_PL_KERN | PTE_AR_RWX;
pte |= ia64_pal_base & PTE_PPN_MASK;
__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" :: "r"(va), "r"(shft<<2));
__asm __volatile("mov %0=psr" : "=r"(psr));
__asm __volatile("rsm psr.ic|psr.i");
ia64_srlz_i();
ia64_set_ifa(va);
ia64_set_itir(shft << 2);
ia64_srlz_d();
__asm __volatile("itr.d dtr[%0]=%1" :: "r"(4), "r"(pte));
ia64_srlz_d();
__asm __volatile("itr.i itr[%0]=%1" :: "r"(1), "r"(pte));
__asm __volatile("mov psr.l=%0" :: "r" (psr));
ia64_srlz_i();
}
void
map_gateway_page(void)
{
pt_entry_t pte;
uint64_t psr;
pte = PTE_PRESENT | PTE_MA_WB | PTE_ACCESSED | PTE_DIRTY |
PTE_PL_KERN | PTE_AR_X_RX;
pte |= ia64_tpa((uint64_t)ia64_gateway_page) & PTE_PPN_MASK;
__asm __volatile("ptr.d %0,%1; ptr.i %0,%1" ::
"r"(VM_MAXUSER_ADDRESS), "r"(PAGE_SHIFT << 2));
__asm __volatile("mov %0=psr" : "=r"(psr));
__asm __volatile("rsm psr.ic|psr.i");
ia64_srlz_i();
ia64_set_ifa(VM_MAXUSER_ADDRESS);
ia64_set_itir(PAGE_SHIFT << 2);
ia64_srlz_d();
__asm __volatile("itr.d dtr[%0]=%1" :: "r"(5), "r"(pte));
ia64_srlz_d();
__asm __volatile("itr.i itr[%0]=%1" :: "r"(2), "r"(pte));
__asm __volatile("mov psr.l=%0" :: "r" (psr));
ia64_srlz_i();
/* Expose the mapping to userland in ar.k5 */
ia64_set_k5(VM_MAXUSER_ADDRESS);
}
static u_int
freq_ratio(u_long base, u_long ratio)
{
u_long f;
f = (base * (ratio >> 32)) / (ratio & 0xfffffffful);
return ((f + 500000) / 1000000);
}
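/*
 * Worked example (illustrative numbers): with a platform clock of
 * base = 200000000 Hz and a PAL ratio of 7/1 packed as
 * ratio = (7 << 32) | 1, freq_ratio() computes
 *
 *	f = (200000000 * 7) / 1 = 1400000000
 *	return (1400000000 + 500000) / 1000000 = 1400
 *
 * i.e. the result is the frequency in MHz, rounded to the nearest MHz,
 * which is what calculate_frequencies() stores in cpu_freq, bus_freq and
 * itc_freq.
 */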
static void
calculate_frequencies(void)
{
struct ia64_sal_result sal;
struct ia64_pal_result pal;
register_t ie;
ie = intr_disable();
sal = ia64_sal_entry(SAL_FREQ_BASE, 0, 0, 0, 0, 0, 0, 0);
pal = ia64_call_pal_static(PAL_FREQ_RATIOS, 0, 0, 0);
intr_restore(ie);
if (sal.sal_status == 0 && pal.pal_status == 0) {
if (bootverbose) {
printf("Platform clock frequency %ld Hz\n",
sal.sal_result[0]);
printf("Processor ratio %ld/%ld, Bus ratio %ld/%ld, "
"ITC ratio %ld/%ld\n",
pal.pal_result[0] >> 32,
pal.pal_result[0] & ((1L << 32) - 1),
pal.pal_result[1] >> 32,
pal.pal_result[1] & ((1L << 32) - 1),
pal.pal_result[2] >> 32,
pal.pal_result[2] & ((1L << 32) - 1));
}
cpu_freq = freq_ratio(sal.sal_result[0], pal.pal_result[0]);
bus_freq = freq_ratio(sal.sal_result[0], pal.pal_result[1]);
itc_freq = freq_ratio(sal.sal_result[0], pal.pal_result[2]);
}
}
struct ia64_init_return
ia64_init(void)
{
struct ia64_init_return ret;
int phys_avail_cnt;
vm_offset_t kernstart, kernend;
vm_offset_t kernstartpfn, kernendpfn, pfn0, pfn1;
char *p;
struct efi_md *md;
int metadata_missing;
/* NO OUTPUT ALLOWED UNTIL FURTHER NOTICE */
/*
* TODO: Disable interrupts, floating point etc.
* Maybe flush cache and tlb
*/
ia64_set_fpsr(IA64_FPSR_DEFAULT);
/*
* TODO: Get critical system information (if possible, from the
* information provided by the boot program).
*/
/*
* Look for the I/O ports first - we need them for console
* probing.
*/
for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
switch (md->md_type) {
case EFI_MD_TYPE_IOPORT:
ia64_port_base = (uintptr_t)pmap_mapdev(md->md_phys,
md->md_pages * EFI_PAGE_SIZE);
break;
case EFI_MD_TYPE_PALCODE:
ia64_pal_size = md->md_pages * EFI_PAGE_SIZE;
ia64_pal_base = md->md_phys;
break;
}
}
metadata_missing = 0;
if (bootinfo->bi_modulep)
preload_metadata = (caddr_t)bootinfo->bi_modulep;
else
metadata_missing = 1;
if (envmode == 0 && bootinfo->bi_envp)
kern_envp = (caddr_t)bootinfo->bi_envp;
else
kern_envp = static_env;
/*
* Look at arguments passed to us and compute boothowto.
*/
boothowto = bootinfo->bi_boothowto;
if (boothowto & RB_VERBOSE)
bootverbose = 1;
/*
* Find the beginning and end of the kernel.
*/
kernstart = trunc_page(kernel_text);
#ifdef DDB
ksym_start = bootinfo->bi_symtab;
ksym_end = bootinfo->bi_esymtab;
kernend = (vm_offset_t)round_page(ksym_end);
#else
kernend = (vm_offset_t)round_page(_end);
#endif
/* But if the bootstrap tells us otherwise, believe it! */
if (bootinfo->bi_kernend)
kernend = round_page(bootinfo->bi_kernend);
/*
* Region 6 is direct mapped UC and region 7 is direct mapped
* WC. The details of this are controlled by the Alt {I,D}TLB
* handlers. Here we just make sure that they have the largest
* possible page size to minimise TLB usage.
*/
ia64_set_rr(IA64_RR_BASE(6), (6 << 8) | (PAGE_SHIFT << 2));
ia64_set_rr(IA64_RR_BASE(7), (7 << 8) | (PAGE_SHIFT << 2));
ia64_srlz_d();
/*
* Wire things up so we can call the firmware.
*/
map_pal_code();
efi_boot_minimal(bootinfo->bi_systab);
ia64_xiv_init();
ia64_sal_init();
calculate_frequencies();
set_cputicker(ia64_get_itc, (u_long)itc_freq * 1000000, 0);
/*
* Setup the PCPU data for the bootstrap processor. It is needed
* by printf(). Also, since printf() has critical sections, we
* need to initialize at least pc_curthread.
*/
pcpup = &pcpu0;
ia64_set_k4((u_int64_t)pcpup);
pcpu_init(pcpup, 0, sizeof(pcpu0));
dpcpu_init((void *)kernend, 0);
PCPU_SET(md.lid, ia64_get_lid());
kernend += DPCPU_SIZE;
PCPU_SET(curthread, &thread0);
/*
* Initialize the console before we print anything out.
*/
cninit();
/* OUTPUT NOW ALLOWED */
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
/* Get FPSWA interface */
fpswa_iface = (bootinfo->bi_fpswa == 0) ? NULL :
(struct fpswa_iface *)IA64_PHYS_TO_RR7(bootinfo->bi_fpswa);
/* Init basic tunables, including hz */
init_param1();
p = getenv("kernelname");
if (p != NULL) {
strlcpy(kernelname, p, sizeof(kernelname));
freeenv(p);
}
kernstartpfn = atop(IA64_RR_MASK(kernstart));
kernendpfn = atop(IA64_RR_MASK(kernend));
/*
* Size the memory regions and load phys_avail[] with the results.
*/
/*
* Find out how much memory is available, by looking at
* the memory descriptors.
*/
#ifdef DEBUG_MD
printf("Memory descriptor count: %d\n", mdcount);
#endif
phys_avail_cnt = 0;
for (md = efi_md_first(); md != NULL; md = efi_md_next(md)) {
#ifdef DEBUG_MD
printf("MD %p: type %d pa 0x%lx cnt 0x%lx\n", md,
md->md_type, md->md_phys, md->md_pages);
#endif
pfn0 = ia64_btop(round_page(md->md_phys));
pfn1 = ia64_btop(trunc_page(md->md_phys + md->md_pages * 4096));
if (pfn1 <= pfn0)
continue;
if (md->md_type != EFI_MD_TYPE_FREE)
continue;
/*
* We have a memory descriptor that describes conventional
* memory that is for general use. We must determine if the
* loader has put the kernel in this region.
*/
physmem += (pfn1 - pfn0);
if (pfn0 <= kernendpfn && kernstartpfn <= pfn1) {
/*
* Must compute the location of the kernel
* within the segment.
*/
#ifdef DEBUG_MD
printf("Descriptor %p contains kernel\n", mp);
#endif
if (pfn0 < kernstartpfn) {
/*
* There is a chunk before the kernel.
*/
#ifdef DEBUG_MD
printf("Loading chunk before kernel: "
"0x%lx / 0x%lx\n", pfn0, kernstartpfn);
#endif
phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
phys_avail[phys_avail_cnt+1] = ia64_ptob(kernstartpfn);
phys_avail_cnt += 2;
}
if (kernendpfn < pfn1) {
/*
* There is a chunk after the kernel.
*/
#ifdef DEBUG_MD
printf("Loading chunk after kernel: "
"0x%lx / 0x%lx\n", kernendpfn, pfn1);
#endif
phys_avail[phys_avail_cnt] = ia64_ptob(kernendpfn);
phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
phys_avail_cnt += 2;
}
} else {
/*
* Just load this cluster as one chunk.
*/
#ifdef DEBUG_MD
printf("Loading descriptor %d: 0x%lx / 0x%lx\n", i,
pfn0, pfn1);
#endif
phys_avail[phys_avail_cnt] = ia64_ptob(pfn0);
phys_avail[phys_avail_cnt+1] = ia64_ptob(pfn1);
phys_avail_cnt += 2;
}
}
phys_avail[phys_avail_cnt] = 0;
Maxmem = physmem;
init_param2(physmem);
/*
* Initialize error message buffer (at end of core).
*/
msgbufp = (struct msgbuf *)pmap_steal_memory(msgbufsize);
msgbufinit(msgbufp, msgbufsize);
proc_linkup0(&proc0, &thread0);
/*
* Init mapping for kernel stack for proc 0
*/
thread0.td_kstack = pmap_steal_memory(KSTACK_PAGES * PAGE_SIZE);
thread0.td_kstack_pages = KSTACK_PAGES;
mutex_init();
/*
* Initialize the rest of proc 0's PCB.
*
* Set the kernel sp, reserving space for an (empty) trapframe,
* and make proc0's trapframe pointer point to it for sanity.
* Initialise proc0's backing store to start after u area.
*/
cpu_thread_alloc(&thread0);
thread0.td_frame->tf_flags = FRAME_SYSCALL;
thread0.td_pcb->pcb_special.sp =
(u_int64_t)thread0.td_frame - 16;
thread0.td_pcb->pcb_special.bspstore = thread0.td_kstack;
/*
* Initialize the virtual memory system.
*/
pmap_bootstrap();
/*
* Initialize debuggers, and break into them if appropriate.
*/
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS,
"Boot flags requested debugger\n");
#endif
ia64_set_tpr(0);
ia64_srlz_d();
ret.bspstore = thread0.td_pcb->pcb_special.bspstore;
ret.sp = thread0.td_pcb->pcb_special.sp;
return (ret);
}
uint64_t
ia64_get_hcdp(void)
{
return (bootinfo->bi_hcdp);
}
void
bzero(void *buf, size_t len)
{
caddr_t p = buf;
while (((vm_offset_t) p & (sizeof(u_long) - 1)) && len) {
*p++ = 0;
len--;
}
while (len >= sizeof(u_long) * 8) {
*(u_long*) p = 0;
*((u_long*) p + 1) = 0;
*((u_long*) p + 2) = 0;
*((u_long*) p + 3) = 0;
len -= sizeof(u_long) * 8;
*((u_long*) p + 4) = 0;
*((u_long*) p + 5) = 0;
*((u_long*) p + 6) = 0;
*((u_long*) p + 7) = 0;
p += sizeof(u_long) * 8;
}
while (len >= sizeof(u_long)) {
*(u_long*) p = 0;
len -= sizeof(u_long);
p += sizeof(u_long);
}
while (len) {
*p++ = 0;
len--;
}
}
u_int
ia64_itc_freq(void)
{
return (itc_freq);
}
void
DELAY(int n)
{
u_int64_t start, end, now;
sched_pin();
start = ia64_get_itc();
end = start + itc_freq * n;
/* printf("DELAY from 0x%lx to 0x%lx\n", start, end); */
do {
now = ia64_get_itc();
} while (now < end || (now > start && end < start));
sched_unpin();
}
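/*
 * Illustrative note: itc_freq is kept in MHz (see freq_ratio() above),
 * i.e. ITC ticks per microsecond, so DELAY(n) busy-waits roughly n
 * microseconds.  For example, with itc_freq = 400 a DELAY(10) spins until
 * 4000 ITC ticks have elapsed; the second half of the loop condition
 * handles the case where "end" has wrapped around the 64-bit counter
 * while "now" has not yet done so.
 */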
/*
* Send an interrupt (signal) to a process.
*/
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct proc *p;
struct thread *td;
struct trapframe *tf;
struct sigacts *psp;
struct sigframe sf, *sfp;
u_int64_t sbs, sp;
int oonstack;
int sig;
u_long code;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
code = ksi->ksi_code;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
sp = tf->tf_special.sp;
oonstack = sigonstack(sp);
sbs = 0;
/* save user context */
bzero(&sf, sizeof(struct sigframe));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
/*
* Allocate and validate space for the signal handler
* context. Note that if the stack is in P0 space, the
* call to grow() is a nop, and the useracc() check
* will fail if the process has not already allocated
* the space with a `brk'.
*/
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sbs = (u_int64_t)td->td_sigstk.ss_sp;
sbs = (sbs + 15) & ~15;
sfp = (struct sigframe *)(sbs + td->td_sigstk.ss_size);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sfp = (struct sigframe *)sp;
sfp = (struct sigframe *)((u_int64_t)(sfp - 1) & ~15);
/* Fill in the siginfo structure for POSIX handlers. */
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig;
/*
* XXX this shouldn't be here after code in trap.c
* is fixed
*/
sf.sf_si.si_addr = (void*)tf->tf_special.ifa;
code = (u_int64_t)&sfp->sf_si;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
/* Copy the frame out to userland. */
if (copyout(&sf, sfp, sizeof(sf)) != 0) {
/*
* Process has trashed its stack; give it an illegal
* instruction to halt it in its tracks.
*/
PROC_LOCK(p);
sigexit(td, SIGILL);
return;
}
if ((tf->tf_flags & FRAME_SYSCALL) == 0) {
tf->tf_special.psr &= ~IA64_PSR_RI;
tf->tf_special.iip = ia64_get_k5() +
((uint64_t)break_sigtramp - (uint64_t)ia64_gateway_page);
} else
tf->tf_special.iip = ia64_get_k5() +
((uint64_t)epc_sigtramp - (uint64_t)ia64_gateway_page);
/*
* Setup the trapframe to return to the signal trampoline. We pass
* information to the trampoline in the following registers:
*
* gp new backing store or NULL
* r8 signal number
* r9 signal code or siginfo pointer
* r10 signal handler (function descriptor)
*/
tf->tf_special.sp = (u_int64_t)sfp - 16;
tf->tf_special.gp = sbs;
tf->tf_special.bspstore = sf.sf_uc.uc_mcontext.mc_special.bspstore;
tf->tf_special.ndirty = 0;
tf->tf_special.rnat = sf.sf_uc.uc_mcontext.mc_special.rnat;
tf->tf_scratch.gr8 = sig;
tf->tf_scratch.gr9 = code;
tf->tf_scratch.gr10 = (u_int64_t)catcher;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
int
-sigreturn(struct thread *td,
+sys_sigreturn(struct thread *td,
struct sigreturn_args /* {
ucontext_t *sigcntxp;
} */ *uap)
{
ucontext_t uc;
struct trapframe *tf;
struct pcb *pcb;
tf = td->td_frame;
pcb = td->td_pcb;
/*
* Fetch the entire context structure at once for speed.
* We don't use a normal argument to simplify RSE handling.
*/
if (copyin(uap->sigcntxp, (caddr_t)&uc, sizeof(uc)))
return (EFAULT);
set_mcontext(td, &uc.uc_mcontext);
#if defined(COMPAT_43)
if (sigonstack(tf->tf_special.sp))
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{
- return sigreturn(td, (struct sigreturn_args *)uap);
+ return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_special = tf->tf_special;
pcb->pcb_special.__spare = ~0UL; /* XXX see unwind.c */
save_callee_saved(&pcb->pcb_preserved);
save_callee_saved_fp(&pcb->pcb_preserved_fp);
}
int
ia64_flush_dirty(struct thread *td, struct _special *r)
{
struct iovec iov;
struct uio uio;
uint64_t bspst, kstk, rnat;
int error, locked;
if (r->ndirty == 0)
return (0);
kstk = td->td_kstack + (r->bspstore & 0x1ffUL);
if (td == curthread) {
__asm __volatile("mov ar.rsc=0;;");
__asm __volatile("mov %0=ar.bspstore" : "=r"(bspst));
/* Make sure we have all the user registers written out. */
if (bspst - kstk < r->ndirty) {
__asm __volatile("flushrs;;");
__asm __volatile("mov %0=ar.bspstore" : "=r"(bspst));
}
__asm __volatile("mov %0=ar.rnat;;" : "=r"(rnat));
__asm __volatile("mov ar.rsc=3");
error = copyout((void*)kstk, (void*)r->bspstore, r->ndirty);
kstk += r->ndirty;
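/*
 * The RSE stores a NaT collection word at every backing store
 * address whose low 9 bits are 0x1f8.  If the kernel bspstore
 * crossed such a boundary past kstk, the NaT bits covering the
 * flushed registers are already in memory at kstk | 0x1f8;
 * otherwise they are still in ar.rnat.
 */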
r->rnat = (bspst > kstk && (bspst & 0x1ffL) < (kstk & 0x1ffL))
? *(uint64_t*)(kstk | 0x1f8L) : rnat;
} else {
locked = PROC_LOCKED(td->td_proc);
if (!locked)
PHOLD(td->td_proc);
iov.iov_base = (void*)(uintptr_t)kstk;
iov.iov_len = r->ndirty;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = r->bspstore;
uio.uio_resid = r->ndirty;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_td = td;
error = proc_rwmem(td->td_proc, &uio);
/*
* XXX proc_rwmem() doesn't currently return ENOSPC,
* so I think it can bogusly return 0. Neither do
* we allow short writes.
*/
if (uio.uio_resid != 0 && error == 0)
error = ENOSPC;
if (!locked)
PRELE(td->td_proc);
}
r->bspstore += r->ndirty;
r->ndirty = 0;
return (error);
}
int
get_mcontext(struct thread *td, mcontext_t *mc, int flags)
{
struct trapframe *tf;
int error;
tf = td->td_frame;
bzero(mc, sizeof(*mc));
mc->mc_special = tf->tf_special;
error = ia64_flush_dirty(td, &mc->mc_special);
if (tf->tf_flags & FRAME_SYSCALL) {
mc->mc_flags |= _MC_FLAGS_SYSCALL_CONTEXT;
mc->mc_scratch = tf->tf_scratch;
if (flags & GET_MC_CLEAR_RET) {
mc->mc_scratch.gr8 = 0;
mc->mc_scratch.gr9 = 0;
mc->mc_scratch.gr10 = 0;
mc->mc_scratch.gr11 = 0;
}
} else {
mc->mc_flags |= _MC_FLAGS_ASYNC_CONTEXT;
mc->mc_scratch = tf->tf_scratch;
mc->mc_scratch_fp = tf->tf_scratch_fp;
/*
* XXX If the thread never used the high FP registers, we
* probably shouldn't waste time saving them.
*/
ia64_highfp_save(td);
mc->mc_flags |= _MC_FLAGS_HIGHFP_VALID;
mc->mc_high_fp = td->td_pcb->pcb_high_fp;
}
save_callee_saved(&mc->mc_preserved);
save_callee_saved_fp(&mc->mc_preserved_fp);
return (error);
}
int
set_mcontext(struct thread *td, const mcontext_t *mc)
{
struct _special s;
struct trapframe *tf;
uint64_t psrmask;
tf = td->td_frame;
KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
("Whoa there! We have more than 8KB of dirty registers!"));
s = mc->mc_special;
/*
* Only copy the user mask and the restart instruction bit from
* the new context.
*/
psrmask = IA64_PSR_BE | IA64_PSR_UP | IA64_PSR_AC | IA64_PSR_MFL |
IA64_PSR_MFH | IA64_PSR_RI;
s.psr = (tf->tf_special.psr & ~psrmask) | (s.psr & psrmask);
/* We don't have any dirty registers of the new context. */
s.ndirty = 0;
if (mc->mc_flags & _MC_FLAGS_ASYNC_CONTEXT) {
/*
* We can get an async context passed to us even though we
* entered the kernel through a syscall: sigreturn(2)
* takes contexts that could previously be the result of
* a trap or interrupt.
* Hence, we cannot assert that the trapframe is not
* a syscall frame, but we can assert that it's at
* least an expected syscall.
*/
if (tf->tf_flags & FRAME_SYSCALL) {
KASSERT(tf->tf_scratch.gr15 == SYS_sigreturn, ("foo"));
tf->tf_flags &= ~FRAME_SYSCALL;
}
tf->tf_scratch = mc->mc_scratch;
tf->tf_scratch_fp = mc->mc_scratch_fp;
if (mc->mc_flags & _MC_FLAGS_HIGHFP_VALID)
td->td_pcb->pcb_high_fp = mc->mc_high_fp;
} else {
KASSERT((tf->tf_flags & FRAME_SYSCALL) != 0, ("foo"));
if ((mc->mc_flags & _MC_FLAGS_SYSCALL_CONTEXT) == 0) {
s.cfm = tf->tf_special.cfm;
s.iip = tf->tf_special.iip;
tf->tf_scratch.gr15 = 0; /* Clear syscall nr. */
} else
tf->tf_scratch = mc->mc_scratch;
}
tf->tf_special = s;
restore_callee_saved(&mc->mc_preserved);
restore_callee_saved_fp(&mc->mc_preserved_fp);
return (0);
}
/*
* Clear registers on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
uint64_t *ksttop, *kst;
tf = td->td_frame;
ksttop = (uint64_t*)(td->td_kstack + tf->tf_special.ndirty +
(tf->tf_special.bspstore & 0x1ffUL));
/*
* We can ignore up to 8KB of dirty registers by masking off the
* lower 13 bits in exception_restore() or epc_syscall(). This
* should be enough for a couple of years, but if there are more
* than 8KB of dirty registers, we lose track of the bottom of
* the kernel stack. The solution is to copy the active part of
* the kernel stack down 1 page (or 2, but not more than that)
* so that we always have less than 8KB of dirty registers.
*/
KASSERT((tf->tf_special.ndirty & ~PAGE_MASK) == 0,
("Whoa there! We have more than 8KB of dirty registers!"));
bzero(&tf->tf_special, sizeof(tf->tf_special));
if ((tf->tf_flags & FRAME_SYSCALL) == 0) { /* break syscalls. */
bzero(&tf->tf_scratch, sizeof(tf->tf_scratch));
bzero(&tf->tf_scratch_fp, sizeof(tf->tf_scratch_fp));
tf->tf_special.cfm = (1UL<<63) | (3UL<<7) | 3UL;
tf->tf_special.bspstore = IA64_BACKINGSTORE;
/*
* Copy the arguments onto the kernel register stack so that
* they get loaded by the loadrs instruction. Skip over the
* NaT collection points.
*/
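/*
 * Every 64th slot of the backing store (an address whose low
 * 9 bits are 0x1f8) holds a NaT collection word rather than a
 * register, hence the checks below.
 */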
kst = ksttop - 1;
if (((uintptr_t)kst & 0x1ff) == 0x1f8)
*kst-- = 0;
*kst-- = 0;
if (((uintptr_t)kst & 0x1ff) == 0x1f8)
*kst-- = 0;
*kst-- = imgp->ps_strings;
if (((uintptr_t)kst & 0x1ff) == 0x1f8)
*kst-- = 0;
*kst = stack;
tf->tf_special.ndirty = (ksttop - kst) << 3;
} else { /* epc syscalls (default). */
tf->tf_special.cfm = (3UL<<62) | (3UL<<7) | 3UL;
tf->tf_special.bspstore = IA64_BACKINGSTORE + 24;
/*
* Write values for out0, out1 and out2 to the user's backing
* store and arrange for them to be restored into the user's
* initial register frame.
* Assumes that (bspstore & 0x1f8) < 0x1e0.
*/
suword((caddr_t)tf->tf_special.bspstore - 24, stack);
suword((caddr_t)tf->tf_special.bspstore - 16, imgp->ps_strings);
suword((caddr_t)tf->tf_special.bspstore - 8, 0);
}
tf->tf_special.iip = imgp->entry_addr;
tf->tf_special.sp = (stack & ~15) - 16;
tf->tf_special.rsc = 0xf;
tf->tf_special.fpsr = IA64_FPSR_DEFAULT;
tf->tf_special.psr = IA64_PSR_IC | IA64_PSR_I | IA64_PSR_IT |
IA64_PSR_DT | IA64_PSR_RT | IA64_PSR_DFH | IA64_PSR_BN |
IA64_PSR_CPL_USER;
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
uint64_t slot;
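/*
 * ia64 instructions come in 16-byte bundles of three slots; the
 * low bits of the address select the slot, and psr.ri must be
 * set to match so that execution resumes at that slot.
 */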
switch (addr & 0xFUL) {
case 0:
slot = IA64_PSR_RI_0;
break;
case 1:
/* XXX we need to deal with MLX bundles here */
slot = IA64_PSR_RI_1;
break;
case 2:
slot = IA64_PSR_RI_2;
break;
default:
return (EINVAL);
}
td->td_frame->tf_special.iip = addr & ~0x0FULL;
td->td_frame->tf_special.psr =
(td->td_frame->tf_special.psr & ~IA64_PSR_RI) | slot;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
struct trapframe *tf;
/*
* There's no way to set single stepping when we're leaving the
* kernel through the EPC syscall path. The way we solve this is
* by enabling the lower-privilege trap so that we re-enter the
* kernel as soon as the privilege level changes. See trap.c for
* how we proceed from there.
*/
tf = td->td_frame;
if (tf->tf_flags & FRAME_SYSCALL)
tf->tf_special.psr |= IA64_PSR_LP;
else
tf->tf_special.psr |= IA64_PSR_SS;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
struct trapframe *tf;
/*
* Clear any and all status bits we may use to implement single
* stepping.
*/
tf = td->td_frame;
tf->tf_special.psr &= ~IA64_PSR_SS;
tf->tf_special.psr &= ~IA64_PSR_LP;
tf->tf_special.psr &= ~IA64_PSR_TB;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
tf = td->td_frame;
regs->r_special = tf->tf_special;
regs->r_scratch = tf->tf_scratch;
save_callee_saved(&regs->r_preserved);
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
int error;
tf = td->td_frame;
error = ia64_flush_dirty(td, &tf->tf_special);
if (!error) {
tf->tf_special = regs->r_special;
tf->tf_special.bspstore += tf->tf_special.ndirty;
tf->tf_special.ndirty = 0;
tf->tf_scratch = regs->r_scratch;
restore_callee_saved(&regs->r_preserved);
}
return (error);
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *frame = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Save the high FP registers. */
ia64_highfp_save(td);
fpregs->fpr_scratch = frame->tf_scratch_fp;
save_callee_saved_fp(&fpregs->fpr_preserved);
fpregs->fpr_high = pcb->pcb_high_fp;
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *frame = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Throw away the high FP registers (should be redundant). */
ia64_highfp_drop(td);
frame->tf_scratch_fp = fpregs->fpr_scratch;
restore_callee_saved_fp(&fpregs->fpr_preserved);
pcb->pcb_high_fp = fpregs->fpr_high;
return (0);
}
void
ia64_sync_icache(vm_offset_t va, vm_offset_t sz)
{
vm_offset_t lim;
if (!ia64_sync_icache_needed)
return;
lim = va + sz;
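/*
 * fc.i flushes the cache line containing va; 32 bytes is
 * presumably the smallest line size we can rely on, hence the
 * conservative stride (see the XXX below).
 */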
while (va < lim) {
ia64_fc_i(va);
va += 32; /* XXX */
}
ia64_sync_i();
ia64_srlz_i();
}
Index: head/sys/kern/init_main.c
===================================================================
--- head/sys/kern/init_main.c (revision 225616)
+++ head/sys/kern/init_main.c (revision 225617)
@@ -1,832 +1,832 @@
/*-
* Copyright (c) 1995 Terrence R. Lambert
* All rights reserved.
*
* Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)init_main.c 8.9 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_init_path.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/exec.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/sysent.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#include <sys/unistd.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/cpuset.h>
#include <machine/cpu.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <sys/copyright.h>
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
void mi_startup(void); /* Should be elsewhere */
/* Components of the first process -- never freed. */
static struct session session0;
static struct pgrp pgrp0;
struct proc proc0;
struct thread thread0 __aligned(16);
struct vmspace vmspace0;
struct proc *initproc;
int boothowto = 0; /* initialized so that it can be patched */
SYSCTL_INT(_debug, OID_AUTO, boothowto, CTLFLAG_RD, &boothowto, 0,
"Boot control flags, passed from loader");
int bootverbose;
SYSCTL_INT(_debug, OID_AUTO, bootverbose, CTLFLAG_RW, &bootverbose, 0,
"Control the output of verbose kernel messages");
/*
* This ensures that there is at least one entry so that the sysinit_set
* symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
* executed.
*/
SYSINIT(placeholder, SI_SUB_DUMMY, SI_ORDER_ANY, NULL, NULL);
/*
* The sysinit table itself. Items are checked off as they are run.
* If we want to register new sysinit types, add them to newsysinit.
*/
SET_DECLARE(sysinit_set, struct sysinit);
struct sysinit **sysinit, **sysinit_end;
struct sysinit **newsysinit, **newsysinit_end;
/*
* Merge a new sysinit set into the current set, reallocating it if
* necessary. This can only be called after malloc is running.
*/
void
sysinit_add(struct sysinit **set, struct sysinit **set_end)
{
struct sysinit **newset;
struct sysinit **sipp;
struct sysinit **xipp;
int count;
count = set_end - set;
if (newsysinit)
count += newsysinit_end - newsysinit;
else
count += sysinit_end - sysinit;
newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT);
if (newset == NULL)
panic("cannot malloc for sysinit");
xipp = newset;
if (newsysinit)
for (sipp = newsysinit; sipp < newsysinit_end; sipp++)
*xipp++ = *sipp;
else
for (sipp = sysinit; sipp < sysinit_end; sipp++)
*xipp++ = *sipp;
for (sipp = set; sipp < set_end; sipp++)
*xipp++ = *sipp;
if (newsysinit)
free(newsysinit, M_TEMP);
newsysinit = newset;
newsysinit_end = newset + count;
}
/*
* System startup; initialize the world, create process 0, mount root
* filesystem, and fork to create init and pagedaemon. Most of the
* hard work is done in the lower-level initialization routines including
* startup(), which does memory initialization and autoconfiguration.
*
* This allows simple addition of new kernel subsystems that require
* boot time initialization. It also allows substitution of subsystem
* (for instance, a scheduler, kernel profiler, or VM system) by object
* module. Finally, it allows for optional "kernel threads".
*/
void
mi_startup(void)
{
register struct sysinit **sipp; /* system initialization*/
register struct sysinit **xipp; /* interior loop of sort*/
register struct sysinit *save; /* bubble*/
#if defined(VERBOSE_SYSINIT)
int last;
int verbose;
#endif
if (boothowto & RB_VERBOSE)
bootverbose++;
if (sysinit == NULL) {
sysinit = SET_BEGIN(sysinit_set);
sysinit_end = SET_LIMIT(sysinit_set);
}
restart:
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
for (xipp = sipp + 1; xipp < sysinit_end; xipp++) {
if ((*sipp)->subsystem < (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order <= (*xipp)->order))
continue; /* skip*/
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
#if defined(VERBOSE_SYSINIT)
last = SI_SUB_COPYRIGHT;
verbose = 0;
#if !defined(DDB)
printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
#endif
#endif
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*
* The last item on the list is expected to be the scheduler,
* which will not return.
*/
for (sipp = sysinit; sipp < sysinit_end; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s)*/
if ((*sipp)->subsystem == SI_SUB_DONE)
continue;
#if defined(VERBOSE_SYSINIT)
if ((*sipp)->subsystem > last) {
verbose = 1;
last = (*sipp)->subsystem;
printf("subsystem %x\n", last);
}
if (verbose) {
#if defined(DDB)
const char *name;
c_db_sym_t sym;
db_expr_t offset;
sym = db_search_symbol((vm_offset_t)(*sipp)->func,
DB_STGY_PROC, &offset);
db_symbol_values(sym, &name, NULL);
if (name != NULL)
printf(" %s(%p)... ", name, (*sipp)->udata);
else
#endif
printf(" %p(%p)... ", (*sipp)->func,
(*sipp)->udata);
}
#endif
/* Call function */
(*((*sipp)->func))((*sipp)->udata);
#if defined(VERBOSE_SYSINIT)
if (verbose)
printf("done.\n");
#endif
/* Check off the one we've just done */
(*sipp)->subsystem = SI_SUB_DONE;
/* Check if we've installed more sysinit items via KLD */
if (newsysinit != NULL) {
if (sysinit != SET_BEGIN(sysinit_set))
free(sysinit, M_TEMP);
sysinit = newsysinit;
sysinit_end = newsysinit_end;
newsysinit = NULL;
newsysinit_end = NULL;
goto restart;
}
}
panic("Shouldn't get here!");
/* NOTREACHED*/
}
/*
***************************************************************************
****
**** The following SYSINIT's belong elsewhere, but have not yet
**** been moved.
****
***************************************************************************
*/
static void
print_caddr_t(void *data)
{
printf("%s", (char *)data);
}
static void
print_version(void *data __unused)
{
int len;
/* Strip a trailing newline from version. */
len = strlen(version);
while (len > 0 && version[len - 1] == '\n')
len--;
printf("%.*s %s\n", len, version, machine);
}
SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t,
copyright);
SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t,
trademark);
SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_version, NULL);
#ifdef WITNESS
static char wit_warn[] =
"WARNING: WITNESS option enabled, expect reduced performance.\n";
SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
print_caddr_t, wit_warn);
SYSINIT(witwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 1,
print_caddr_t, wit_warn);
#endif
#ifdef DIAGNOSTIC
static char diag_warn[] =
"WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2,
print_caddr_t, diag_warn);
SYSINIT(diagwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 2,
print_caddr_t, diag_warn);
#endif
static int
null_fetch_syscall_args(struct thread *td __unused,
struct syscall_args *sa __unused)
{
panic("null_fetch_syscall_args");
}
static void
null_set_syscall_retval(struct thread *td __unused, int error __unused)
{
panic("null_set_syscall_retval");
}
struct sysentvec null_sysvec = {
.sv_size = 0,
.sv_table = NULL,
.sv_mask = 0,
.sv_sigsize = 0,
.sv_sigtbl = NULL,
.sv_errsize = 0,
.sv_errtbl = NULL,
.sv_transtrap = NULL,
.sv_fixup = NULL,
.sv_sendsig = NULL,
.sv_sigcode = NULL,
.sv_szsigcode = NULL,
.sv_prepsyscall = NULL,
.sv_name = "null",
.sv_coredump = NULL,
.sv_imgact_try = NULL,
.sv_minsigstksz = 0,
.sv_pagesize = PAGE_SIZE,
.sv_minuser = VM_MIN_ADDRESS,
.sv_maxuser = VM_MAXUSER_ADDRESS,
.sv_usrstack = USRSTACK,
.sv_psstrings = PS_STRINGS,
.sv_stackprot = VM_PROT_ALL,
.sv_copyout_strings = NULL,
.sv_setregs = NULL,
.sv_fixlimit = NULL,
.sv_maxssiz = NULL,
.sv_flags = 0,
.sv_set_syscall_retval = null_set_syscall_retval,
.sv_fetch_syscall_args = null_fetch_syscall_args,
.sv_syscallnames = NULL,
.sv_schedtail = NULL,
};
/*
***************************************************************************
****
**** The two following SYSINIT's are proc0 specific glue code. I am not
**** convinced that they can not be safely combined, but their order of
**** operation has been maintained as the same as the original init_main.c
**** for right now.
****
**** These probably belong in init_proc.c or kern_proc.c, since they
**** deal with proc0 (the fork template process).
****
***************************************************************************
*/
/* ARGSUSED*/
static void
proc0_init(void *dummy __unused)
{
struct proc *p;
struct thread *td;
vm_paddr_t pageablemem;
int i;
GIANT_REQUIRED;
p = &proc0;
td = &thread0;
/*
* Initialize magic number and osrel.
*/
p->p_magic = P_MAGIC;
p->p_osrel = osreldate;
/*
* Initialize thread and process structures.
*/
procinit(); /* set up proc zone */
threadinit(); /* set up UMA zones */
/*
* Initialise scheduler resources.
* Add scheduler specific parts to proc, thread as needed.
*/
schedinit(); /* scheduler gets its house in order */
/*
* Initialize sleep queue hash table
*/
sleepinit();
/*
* additional VM structures
*/
vm_init2();
/*
* Create process 0 (the swapper).
*/
LIST_INSERT_HEAD(&allproc, p, p_list);
LIST_INSERT_HEAD(PIDHASH(0), p, p_hash);
mtx_init(&pgrp0.pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
p->p_pgrp = &pgrp0;
LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash);
LIST_INIT(&pgrp0.pg_members);
LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist);
pgrp0.pg_session = &session0;
mtx_init(&session0.s_mtx, "session", NULL, MTX_DEF);
refcount_init(&session0.s_count, 1);
session0.s_leader = p;
p->p_sysent = &null_sysvec;
p->p_flag = P_SYSTEM | P_INMEM;
p->p_state = PRS_NORMAL;
knlist_init_mtx(&p->p_klist, &p->p_mtx);
STAILQ_INIT(&p->p_ktr);
p->p_nice = NZERO;
td->td_tid = PID_MAX + 1;
LIST_INSERT_HEAD(TIDHASH(td->td_tid), td, td_hash);
td->td_state = TDS_RUNNING;
td->td_pri_class = PRI_TIMESHARE;
td->td_user_pri = PUSER;
td->td_base_user_pri = PUSER;
td->td_lend_user_pri = PRI_MAX;
td->td_priority = PVM;
td->td_base_pri = PVM;
td->td_oncpu = 0;
td->td_flags = TDF_INMEM|TDP_KTHREAD;
td->td_cpuset = cpuset_thread0();
prison0.pr_cpuset = cpuset_ref(td->td_cpuset);
p->p_peers = 0;
p->p_leader = p;
strncpy(p->p_comm, "kernel", sizeof (p->p_comm));
strncpy(td->td_name, "swapper", sizeof (td->td_name));
callout_init(&p->p_itcallout, CALLOUT_MPSAFE);
callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
/* Create credentials. */
p->p_ucred = crget();
p->p_ucred->cr_ngroups = 1; /* group 0 */
p->p_ucred->cr_uidinfo = uifind(0);
p->p_ucred->cr_ruidinfo = uifind(0);
p->p_ucred->cr_prison = &prison0;
p->p_ucred->cr_loginclass = loginclass_find("default");
#ifdef AUDIT
audit_cred_kproc0(p->p_ucred);
#endif
#ifdef MAC
mac_cred_create_swapper(p->p_ucred);
#endif
td->td_ucred = crhold(p->p_ucred);
/* Create sigacts. */
p->p_sigacts = sigacts_alloc();
/* Initialize signal state for process 0. */
siginit(&proc0);
/* Create the file descriptor table. */
p->p_fd = fdinit(NULL);
p->p_fdtol = NULL;
/* Create the limits structures. */
p->p_limit = lim_alloc();
for (i = 0; i < RLIM_NLIMITS; i++)
p->p_limit->pl_rlimit[i].rlim_cur =
p->p_limit->pl_rlimit[i].rlim_max = RLIM_INFINITY;
p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_cur = dfldsiz;
p->p_limit->pl_rlimit[RLIMIT_DATA].rlim_max = maxdsiz;
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_cur = dflssiz;
p->p_limit->pl_rlimit[RLIMIT_STACK].rlim_max = maxssiz;
/* Cast to avoid overflow on i386/PAE. */
pageablemem = ptoa((vm_paddr_t)cnt.v_free_count);
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_cur =
p->p_limit->pl_rlimit[RLIMIT_RSS].rlim_max = pageablemem;
p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = pageablemem / 3;
p->p_limit->pl_rlimit[RLIMIT_MEMLOCK].rlim_max = pageablemem;
p->p_cpulimit = RLIM_INFINITY;
/* Initialize resource accounting structures. */
racct_create(&p->p_racct);
p->p_stats = pstats_alloc();
/* Allocate a prototype map so we have something to fork. */
pmap_pinit0(vmspace_pmap(&vmspace0));
p->p_vmspace = &vmspace0;
vmspace0.vm_refcnt = 1;
/*
* proc0 is not expected to enter usermode, so there is no special
* handling for sv_minuser here, like is done for exec_new_vmspace().
*/
vm_map_init(&vmspace0.vm_map, vmspace_pmap(&vmspace0),
p->p_sysent->sv_minuser, p->p_sysent->sv_maxuser);
/*
* Call the init and ctor for the new thread and proc. We wait
* to do this until all other structures are fairly sane.
*/
EVENTHANDLER_INVOKE(process_init, p);
EVENTHANDLER_INVOKE(thread_init, td);
EVENTHANDLER_INVOKE(process_ctor, p);
EVENTHANDLER_INVOKE(thread_ctor, td);
/*
* Charge root for one process.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, 1, 0);
PROC_LOCK(p);
racct_add_force(p, RACCT_NPROC, 1);
PROC_UNLOCK(p);
}
SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL);
/* ARGSUSED*/
static void
proc0_post(void *dummy __unused)
{
struct timespec ts;
struct proc *p;
struct rusage ru;
struct thread *td;
/*
* Now we can look at the time, having had a chance to verify the
* time from the filesystem. Pretend that proc0 started now.
*/
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
microuptime(&p->p_stats->p_start);
PROC_SLOCK(p);
rufetch(p, &ru); /* Clears thread stats */
PROC_SUNLOCK(p);
p->p_rux.rux_runtime = 0;
p->p_rux.rux_uticks = 0;
p->p_rux.rux_sticks = 0;
p->p_rux.rux_iticks = 0;
FOREACH_THREAD_IN_PROC(p, td) {
td->td_runtime = 0;
}
}
sx_sunlock(&allproc_lock);
PCPU_SET(switchtime, cpu_ticks());
PCPU_SET(switchticks, ticks);
/*
* Give the ``random'' number generator a thump.
*/
nanotime(&ts);
srandom(ts.tv_sec ^ ts.tv_nsec);
}
SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL);
static void
random_init(void *dummy __unused)
{
/*
* After CPU has been started we have some randomness on most
* platforms via get_cyclecount(). For platforms that don't
* we will reseed random(9) in proc0_post() as well.
*/
srandom(get_cyclecount());
}
SYSINIT(random, SI_SUB_RANDOM, SI_ORDER_FIRST, random_init, NULL);
/*
***************************************************************************
****
**** The following SYSINIT's and glue code should be moved to the
**** respective files on a per subsystem basis.
****
***************************************************************************
*/
/*
***************************************************************************
****
**** The following code probably belongs in another file, like
**** kern/init_init.c.
****
***************************************************************************
*/
/*
* List of paths to try when searching for "init".
*/
static char init_path[MAXPATHLEN] =
#ifdef INIT_PATH
__XSTRING(INIT_PATH);
#else
"/sbin/init:/sbin/oinit:/sbin/init.bak:/rescue/init:/stand/sysinstall";
#endif
SYSCTL_STRING(_kern, OID_AUTO, init_path, CTLFLAG_RD, init_path, 0,
"Path used to search the init process");
/*
* Shutdown timeout of init(8).
* Unused within kernel, but used to control init(8), hence do not remove.
*/
#ifndef INIT_SHUTDOWN_TIMEOUT
#define INIT_SHUTDOWN_TIMEOUT 120
#endif
static int init_shutdown_timeout = INIT_SHUTDOWN_TIMEOUT;
SYSCTL_INT(_kern, OID_AUTO, init_shutdown_timeout,
CTLFLAG_RW, &init_shutdown_timeout, 0, "Shutdown timeout of init(8). "
"Unused within kernel, but used to control init(8)");
/*
* Start the initial user process; try exec'ing each pathname in init_path.
* The program is invoked with one argument containing the boot flags.
*/
static void
start_init(void *dummy)
{
vm_offset_t addr;
struct execve_args args;
int options, error;
char *var, *path, *next, *s;
char *ucp, **uap, *arg0, *arg1;
struct thread *td;
struct proc *p;
mtx_lock(&Giant);
GIANT_REQUIRED;
td = curthread;
p = td->td_proc;
vfs_mountroot();
/*
* Need just enough stack to hold the faked-up "execve()" arguments.
*/
addr = p->p_sysent->sv_usrstack - PAGE_SIZE;
if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE,
FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
panic("init: couldn't allocate argument space");
p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
p->p_vmspace->vm_ssize = 1;
if ((var = getenv("init_path")) != NULL) {
strlcpy(init_path, var, sizeof(init_path));
freeenv(var);
}
for (path = init_path; *path != '\0'; path = next) {
while (*path == ':')
path++;
if (*path == '\0')
break;
for (next = path; *next != '\0' && *next != ':'; next++)
/* nothing */ ;
if (bootverbose)
printf("start_init: trying %.*s\n", (int)(next - path),
path);
/*
* Move out the boot flag argument.
*/
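/*
 * The flag string is built backwards, byte by byte, below the
 * top of the user stack: first the terminating NUL, then any
 * option letters, then the leading '-'.  Booting single-user,
 * for example, yields "-s".
 */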
options = 0;
ucp = (char *)p->p_sysent->sv_usrstack;
(void)subyte(--ucp, 0); /* trailing zero */
if (boothowto & RB_SINGLE) {
(void)subyte(--ucp, 's');
options = 1;
}
#ifdef notyet
if (boothowto & RB_FASTBOOT) {
(void)subyte(--ucp, 'f');
options = 1;
}
#endif
#ifdef BOOTCDROM
(void)subyte(--ucp, 'C');
options = 1;
#endif
if (options == 0)
(void)subyte(--ucp, '-');
(void)subyte(--ucp, '-'); /* leading hyphen */
arg1 = ucp;
/*
* Move out the file name (also arg 0).
*/
(void)subyte(--ucp, 0);
for (s = next - 1; s >= path; s--)
(void)subyte(--ucp, *s);
arg0 = ucp;
/*
* Move out the arg pointers.
*/
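/*
 * Round ucp down to pointer alignment; the argv array (arg0,
 * arg1 and the NULL terminator) is then pushed below the
 * strings just written out.
 */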
uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1));
(void)suword((caddr_t)--uap, (long)0); /* terminator */
(void)suword((caddr_t)--uap, (long)(intptr_t)arg1);
(void)suword((caddr_t)--uap, (long)(intptr_t)arg0);
/*
* Point at the arguments.
*/
args.fname = arg0;
args.argv = uap;
args.envv = NULL;
/*
* Now try to exec the program. If can't for any reason
* other than it doesn't exist, complain.
*
* Otherwise, return via fork_trampoline() all the way
* to user mode as init!
*/
- if ((error = execve(td, &args)) == 0) {
+ if ((error = sys_execve(td, &args)) == 0) {
mtx_unlock(&Giant);
return;
}
if (error != ENOENT)
printf("exec %.*s: error %d\n", (int)(next - path),
path, error);
}
printf("init: not found in path %s\n", init_path);
panic("no init");
}
/*
* Like kproc_create(), but runs in its own address space.
* We do this early to reserve pid 1.
*
* Note special case - do not make it runnable yet. Other work
* in progress will change this more.
*/
static void
create_init(const void *udata __unused)
{
struct ucred *newcred, *oldcred;
int error;
error = fork1(&thread0, RFFDG | RFPROC | RFSTOPPED, 0, &initproc,
NULL, 0);
if (error)
panic("cannot fork init: %d\n", error);
KASSERT(initproc->p_pid == 1, ("create_init: initproc->p_pid != 1"));
/* divorce init's credentials from the kernel's */
newcred = crget();
PROC_LOCK(initproc);
initproc->p_flag |= P_SYSTEM | P_INMEM;
oldcred = initproc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
mac_cred_create_init(newcred);
#endif
#ifdef AUDIT
audit_cred_proc1(newcred);
#endif
initproc->p_ucred = newcred;
PROC_UNLOCK(initproc);
crfree(oldcred);
cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
}
SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL);
/*
* Make it runnable now.
*/
static void
kick_init(const void *udata __unused)
{
struct thread *td;
td = FIRST_THREAD_IN_PROC(initproc);
thread_lock(td);
TD_SET_CAN_RUN(td);
sched_add(td, SRQ_BORING);
thread_unlock(td);
}
SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL);
Index: head/sys/kern/kern_acct.c
===================================================================
--- head/sys/kern/kern_acct.c (revision 225616)
+++ head/sys/kern/kern_acct.c (revision 225617)
@@ -1,654 +1,654 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/acct.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
/*
* The routines implemented in this file are described in:
* Leffler, et al.: The Design and Implementation of the 4.3BSD
* UNIX Operating System (Addison-Wesley, 1989)
* on pages 62-63.
* In May 2007 the historic 3-bit base-8 exponent, 13-bit fraction
* comp_t representation described in the above reference was replaced
* with that of IEEE-754 floats.
*
* Arguably, to simplify accounting operations, this mechanism should
* be replaced by one in which an accounting log file (similar to /dev/klog)
* is read by a user process, etc. However, that has its own problems.
*/
/* Floating point definitions from <float.h>. */
#define FLT_MANT_DIG 24 /* p */
#define FLT_MAX_EXP 128 /* emax */
/*
* Internal accounting functions.
* The former's operation is described in Leffler, et al., and the latter
* was provided by UCB with the 4.4BSD-Lite release
*/
static uint32_t encode_timeval(struct timeval);
static uint32_t encode_long(long);
static void acctwatch(void);
static void acct_thread(void *);
static int acct_disable(struct thread *);
/*
* Accounting vnode pointer, saved vnode pointer, and flags for each.
* acct_sx protects against changes to the active vnode and credentials
* while accounting records are being committed to disk.
*/
static int acct_configured;
static int acct_suspended;
static struct vnode *acct_vp;
static struct ucred *acct_cred;
static int acct_flags;
static struct sx acct_sx;
SX_SYSINIT(acct, &acct_sx, "acct_sx");
/*
* State of the accounting kthread.
*/
static int acct_state;
#define ACCT_RUNNING 1 /* Accounting kthread is running. */
#define ACCT_EXITREQ 2 /* Accounting kthread should exit. */
/*
* Values associated with enabling and disabling accounting
*/
static int acctsuspend = 2; /* stop accounting when < 2% free space left */
SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
&acctsuspend, 0, "percentage of free disk space below which accounting stops");
static int acctresume = 4; /* resume when free space risen to > 4% */
SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
&acctresume, 0, "percentage of free disk space above which accounting resumes");
static int acctchkfreq = 15; /* frequency (in seconds) to check space */
static int
sysctl_acct_chkfreq(SYSCTL_HANDLER_ARGS)
{
int error, value;
/* Write out the old value. */
error = SYSCTL_OUT(req, &acctchkfreq, sizeof(int));
if (error || req->newptr == NULL)
return (error);
/* Read in and verify the new value. */
error = SYSCTL_IN(req, &value, sizeof(int));
if (error)
return (error);
if (value <= 0)
return (EINVAL);
acctchkfreq = value;
return (0);
}
SYSCTL_PROC(_kern, OID_AUTO, acct_chkfreq, CTLTYPE_INT|CTLFLAG_RW,
&acctchkfreq, 0, sysctl_acct_chkfreq, "I",
"frequency for checking the free space");
SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0,
"Accounting configured or not");
SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0,
"Accounting suspended or not");
/*
* Accounting system call. Written based on the specification and previous
* implementation done by Mark Tinguely.
*/
int
-acct(struct thread *td, struct acct_args *uap)
+sys_acct(struct thread *td, struct acct_args *uap)
{
struct nameidata nd;
int error, flags, vfslocked;
error = priv_check(td, PRIV_ACCT);
if (error)
return (error);
/*
* If accounting is to be started to a file, open that file for
* appending and make sure it's a 'normal' file.
*/
if (uap->path != NULL) {
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
flags = FWRITE | O_APPEND;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
error = mac_system_check_acct(td->td_ucred, nd.ni_vp);
if (error) {
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, flags, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#endif
VOP_UNLOCK(nd.ni_vp, 0);
if (nd.ni_vp->v_type != VREG) {
vn_close(nd.ni_vp, flags, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (EACCES);
}
VFS_UNLOCK_GIANT(vfslocked);
#ifdef MAC
} else {
error = mac_system_check_acct(td->td_ucred, NULL);
if (error)
return (error);
#endif
}
/*
* Disallow concurrent access to the accounting vnode while we swap
* it out, in order to prevent access after close.
*/
sx_xlock(&acct_sx);
/*
* If accounting was previously enabled, kill the old space-watcher,
* close the file and, if no new file was specified, leave. Reset
* the suspended state regardless of whether accounting remains
* enabled.
*/
acct_suspended = 0;
if (acct_vp != NULL) {
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
error = acct_disable(td);
VFS_UNLOCK_GIANT(vfslocked);
}
if (uap->path == NULL) {
if (acct_state & ACCT_RUNNING) {
acct_state |= ACCT_EXITREQ;
wakeup(&acct_state);
}
sx_xunlock(&acct_sx);
return (error);
}
/*
* Save the new accounting file vnode, and schedule the new
* free space watcher.
*/
acct_vp = nd.ni_vp;
acct_cred = crhold(td->td_ucred);
acct_flags = flags;
if (acct_state & ACCT_RUNNING)
acct_state &= ~ACCT_EXITREQ;
else {
/*
* Try to start up an accounting kthread. We may start more
* than one, but if so the extras will commit suicide as
* soon as they start up.
*/
error = kproc_create(acct_thread, NULL, NULL, 0, 0,
"accounting");
if (error) {
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
(void) vn_close(acct_vp, acct_flags, acct_cred, td);
VFS_UNLOCK_GIANT(vfslocked);
crfree(acct_cred);
acct_configured = 0;
acct_vp = NULL;
acct_cred = NULL;
acct_flags = 0;
sx_xunlock(&acct_sx);
log(LOG_NOTICE, "Unable to start accounting thread\n");
return (error);
}
}
acct_configured = 1;
sx_xunlock(&acct_sx);
log(LOG_NOTICE, "Accounting enabled\n");
return (error);
}
/*
* Disable currently in-progress accounting by closing the vnode, dropping
* our reference to the credential, and clearing the vnode's flags.
*/
static int
acct_disable(struct thread *td)
{
int error;
sx_assert(&acct_sx, SX_XLOCKED);
error = vn_close(acct_vp, acct_flags, acct_cred, td);
crfree(acct_cred);
acct_configured = 0;
acct_vp = NULL;
acct_cred = NULL;
acct_flags = 0;
log(LOG_NOTICE, "Accounting disabled\n");
return (error);
}
/*
* Write out process accounting information on process exit.
* The data to be written out are specified in Leffler, et al.,
* and are enumerated below. (They're also noted in the system
* "acct.h" header file.)
*/
int
acct_process(struct thread *td)
{
struct acctv2 acct;
struct timeval ut, st, tmp;
struct plimit *newlim, *oldlim;
struct proc *p;
struct rusage ru;
int t, ret, vfslocked;
/*
* Lockless check of accounting condition before doing the hard
* work.
*/
if (acct_vp == NULL || acct_suspended)
return (0);
sx_slock(&acct_sx);
/*
* If accounting isn't enabled, don't bother. Have to check again
* once we own the lock in case we raced with disabling of accounting
* by another thread.
*/
if (acct_vp == NULL || acct_suspended) {
sx_sunlock(&acct_sx);
return (0);
}
p = td->td_proc;
/*
* Get process accounting information.
*/
sx_slock(&proctree_lock);
PROC_LOCK(p);
/* (1) The terminal from which the process was started */
if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
acct.ac_tty = tty_udev(p->p_pgrp->pg_session->s_ttyp);
else
acct.ac_tty = NODEV;
sx_sunlock(&proctree_lock);
/* (2) The name of the command that ran */
bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
/* (3) The amount of user and system time that was used */
rufetchcalc(p, &ru, &ut, &st);
acct.ac_utime = encode_timeval(ut);
acct.ac_stime = encode_timeval(st);
/* (4) The elapsed time the command ran (and its starting time) */
tmp = boottime;
timevaladd(&tmp, &p->p_stats->p_start);
acct.ac_btime = tmp.tv_sec;
microuptime(&tmp);
timevalsub(&tmp, &p->p_stats->p_start);
acct.ac_etime = encode_timeval(tmp);
/* (5) The average amount of memory used */
tmp = ut;
timevaladd(&tmp, &st);
/* Convert tmp (i.e. u + s) into hz units to match ru_i*. */
t = tmp.tv_sec * hz + tmp.tv_usec / tick;
if (t)
acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss +
ru.ru_isrss) / t);
else
acct.ac_mem = 0;
/* (6) The number of disk I/O operations done */
acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock);
/* (7) The UID and GID of the process */
acct.ac_uid = p->p_ucred->cr_ruid;
acct.ac_gid = p->p_ucred->cr_rgid;
/* (8) The boolean flags that tell how the process terminated, etc. */
acct.ac_flagx = p->p_acflag;
PROC_UNLOCK(p);
/* Setup ancillary structure fields. */
acct.ac_flagx |= ANVER;
acct.ac_zero = 0;
acct.ac_version = 2;
acct.ac_len = acct.ac_len2 = sizeof(acct);
/*
* Eliminate any file size rlimit.
*/
newlim = lim_alloc();
PROC_LOCK(p);
oldlim = p->p_limit;
lim_copy(newlim, oldlim);
newlim->pl_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
p->p_limit = newlim;
PROC_UNLOCK(p);
lim_free(oldlim);
/*
* Write the accounting information to the file.
*/
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
ret = vn_rdwr(UIO_WRITE, acct_vp, (caddr_t)&acct, sizeof (acct),
(off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, acct_cred, NOCRED,
NULL, td);
VFS_UNLOCK_GIANT(vfslocked);
sx_sunlock(&acct_sx);
return (ret);
}
/* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */
/* Convert timevals and longs into IEEE-754 bit patterns. */
/* Mantissa mask (MSB is implied, so subtract 1). */
#define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1)
/*
* We calculate integer values to a precision of approximately
* 28 bits.
* This is high enough precision to fill the 24 float bits
* and low enough to avoid overflowing the 32 int bits.
*/
#define CALC_BITS 28
/* log_2(1000000). */
#define LOG2_1M 20
/*
* Convert the elements of a timeval into a 32-bit word holding
* the bits of an IEEE-754 float.
* The float value represents the timeval's value in microsecond units.
*/
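/*
 * For example, { .tv_sec = 1, .tv_usec = 500000 } is 1500000us:
 * fls(1500000) - 1 = 20, so the result is
 * ((127 + 20) << 23) | ((1500000 << 3) & MANT_MASK) = 0x49b71b00,
 * which is the bit pattern of the float 1.5e6.
 */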
static uint32_t
encode_timeval(struct timeval tv)
{
int log2_s;
int val, exp; /* Unnormalized value and exponent */
int norm_exp; /* Normalized exponent */
int shift;
/*
* First calculate value and exponent to about CALC_BITS precision.
* Note that the following conditionals have been ordered so that
* the most common cases appear first.
*/
if (tv.tv_sec == 0) {
if (tv.tv_usec == 0)
return (0);
exp = 0;
val = tv.tv_usec;
} else {
/*
* Calculate the value to a precision of approximately
* CALC_BITS.
*/
log2_s = fls(tv.tv_sec) - 1;
if (log2_s + LOG2_1M < CALC_BITS) {
exp = 0;
val = 1000000 * tv.tv_sec + tv.tv_usec;
} else {
exp = log2_s + LOG2_1M - CALC_BITS;
val = (unsigned int)(((uint64_t)1000000 * tv.tv_sec +
tv.tv_usec) >> exp);
}
}
/* Now normalize and pack the value into an IEEE-754 float. */
norm_exp = fls(val) - 1;
shift = FLT_MANT_DIG - norm_exp - 1;
#ifdef ACCT_DEBUG
printf("val=%d exp=%d shift=%d log2(val)=%d\n",
val, exp, shift, norm_exp);
printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
#endif
return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) |
((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
}
/*
* Convert a non-negative long value into the bit pattern of
* an IEEE-754 float value.
*/
static uint32_t
encode_long(long val)
{
int norm_exp; /* Normalized exponent */
int shift;
if (val == 0)
return (0);
if (val < 0) {
log(LOG_NOTICE,
"encode_long: negative value %ld in accounting record\n",
val);
val = LONG_MAX;
}
norm_exp = fls(val) - 1;
shift = FLT_MANT_DIG - norm_exp - 1;
#ifdef ACCT_DEBUG
printf("val=%d shift=%d log2(val)=%d\n",
val, shift, norm_exp);
printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
#endif
return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) |
((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
}
/* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */
/*
* Periodically check the filesystem to see if accounting
* should be turned on or off. Beware the case where the vnode
* has been vgone()'d out from underneath us, e.g. when the file
* system containing the accounting file has been forcibly unmounted.
*/
/* ARGSUSED */
static void
acctwatch(void)
{
struct statfs sb;
int vfslocked;
sx_assert(&acct_sx, SX_XLOCKED);
/*
* If accounting was disabled before our kthread was scheduled,
* then acct_vp might be NULL. If so, just ask our kthread to
* exit and return.
*/
if (acct_vp == NULL) {
acct_state |= ACCT_EXITREQ;
return;
}
/*
* If our vnode is no longer valid, tear it down and signal the
* accounting thread to die.
*/
vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
if (acct_vp->v_type == VBAD) {
(void) acct_disable(NULL);
VFS_UNLOCK_GIANT(vfslocked);
acct_state |= ACCT_EXITREQ;
return;
}
/*
* Stopping here is better than continuing; maybe it will be VBAD
* next time around.
*/
if (VFS_STATFS(acct_vp->v_mount, &sb) < 0) {
VFS_UNLOCK_GIANT(vfslocked);
return;
}
VFS_UNLOCK_GIANT(vfslocked);
if (acct_suspended) {
if (sb.f_bavail > (int64_t)(acctresume * sb.f_blocks /
100)) {
acct_suspended = 0;
log(LOG_NOTICE, "Accounting resumed\n");
}
} else {
if (sb.f_bavail <= (int64_t)(acctsuspend * sb.f_blocks /
100)) {
acct_suspended = 1;
log(LOG_NOTICE, "Accounting suspended\n");
}
}
}
/*
* The main loop for the dedicated kernel thread that periodically calls
* acctwatch().
*/
static void
acct_thread(void *dummy)
{
u_char pri;
/* This is a low-priority kernel thread. */
pri = PRI_MAX_KERN;
thread_lock(curthread);
sched_prio(curthread, pri);
thread_unlock(curthread);
/* If another accounting kthread is already running, just die. */
sx_xlock(&acct_sx);
if (acct_state & ACCT_RUNNING) {
sx_xunlock(&acct_sx);
kproc_exit(0);
}
acct_state |= ACCT_RUNNING;
/* Loop until we are asked to exit. */
while (!(acct_state & ACCT_EXITREQ)) {
/* Perform our periodic checks. */
acctwatch();
/*
* We check this flag again before sleeping since the
* acctwatch() might have shut down accounting and asked us
* to exit.
*/
if (!(acct_state & ACCT_EXITREQ)) {
sx_sleep(&acct_state, &acct_sx, 0, "-",
acctchkfreq * hz);
}
}
/*
* Acknowledge the exit request and shutdown. We clear both the
* exit request and running flags.
*/
acct_state = 0;
sx_xunlock(&acct_sx);
kproc_exit(0);
}
Index: head/sys/kern/kern_context.c
===================================================================
--- head/sys/kern/kern_context.c (revision 225616)
+++ head/sys/kern/kern_context.c (revision 225617)
@@ -1,129 +1,129 @@
/*-
* Copyright (c) 2002 Daniel M. Eischen <deischen@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/ucontext.h>
/*
* The first two fields of a ucontext_t are the signal mask and the machine
* context. The next field is uc_link; we want to avoid destroying the link
* when copying out contexts.
*/
#define UC_COPY_SIZE offsetof(ucontext_t, uc_link)
#ifndef _SYS_SYSPROTO_H_
struct getcontext_args {
struct __ucontext *ucp;
};
struct setcontext_args {
const struct __ucontext *ucp;
};
struct swapcontext_args {
struct __ucontext *oucp;
const struct __ucontext *ucp;
};
#endif
int
-getcontext(struct thread *td, struct getcontext_args *uap)
+sys_getcontext(struct thread *td, struct getcontext_args *uap)
{
ucontext_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
bzero(uc.__spare__, sizeof(uc.__spare__));
ret = copyout(&uc, uap->ucp, UC_COPY_SIZE);
}
return (ret);
}
int
-setcontext(struct thread *td, struct setcontext_args *uap)
+sys_setcontext(struct thread *td, struct setcontext_args *uap)
{
ucontext_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask,
NULL, 0);
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
int
-swapcontext(struct thread *td, struct swapcontext_args *uap)
+sys_swapcontext(struct thread *td, struct swapcontext_args *uap)
{
ucontext_t uc;
int ret;
if (uap->oucp == NULL || uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
bzero(uc.__spare__, sizeof(uc.__spare__));
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
ret = copyout(&uc, uap->oucp, UC_COPY_SIZE);
if (ret == 0) {
ret = copyin(uap->ucp, &uc, UC_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK,
&uc.uc_sigmask, NULL, 0);
}
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
Index: head/sys/kern/kern_cpuset.c
===================================================================
--- head/sys/kern/kern_cpuset.c (revision 225616)
+++ head/sys/kern/kern_cpuset.c (revision 225617)
@@ -1,1173 +1,1173 @@
/*-
* Copyright (c) 2008, Jeffrey Roberson <jeff@freebsd.org>
* All rights reserved.
*
* Copyright (c) 2008 Nokia Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/cpuset.h>
#include <sys/sx.h>
#include <sys/queue.h>
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <vm/uma.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif /* DDB */
/*
* cpusets provide a mechanism for creating and manipulating sets of
* processors for the purpose of constraining the scheduling of threads to
* specific processors.
*
* Each process belongs to an identified set; by default this is set 1. Each
* thread may further restrict the cpus it may run on to a subset of this
* named set. This creates an anonymous set which other threads and processes
* may not join by number.
*
* The named set is referred to herein as the 'base' set to avoid ambiguity.
* This set is usually a child of a 'root' set while the anonymous set may
* simply be referred to as a mask. In the syscall api these are referred to
* as the ROOT, CPUSET, and MASK levels where CPUSET is called 'base' here.
*
* Threads inherit their set from their creator whether it be anonymous or
* not. This means that anonymous sets are immutable because they may be
* shared. To modify an anonymous set a new set is created with the desired
* mask and the same parent as the existing anonymous set. This gives the
* illusion of each thread having a private mask.
*
* Via the syscall apis a user may ask to retrieve or modify the root, base,
* or mask that is discovered via a pid, tid, or setid. Modifying a set
* modifies all numbered and anonymous child sets to comply with the new mask.
* Modifying a pid or tid's mask applies only to that tid but must still
* exist within the assigned parent set.
*
* A thread may not be assigned to a group separate from other threads in
* the process. This is to remove ambiguity when the setid is queried with
* a pid argument. There is no other technical limitation.
*
* This somewhat complex arrangement is intended to make it easy for
* applications to query available processors and bind their threads to
* specific processors while also allowing administrators to dynamically
* reprovision by changing sets which apply to groups of processes.
*
* A simple application should not concern itself with sets at all and
* rather apply masks to its own threads via CPU_WHICH_TID and a -1 id
* meaning 'curthread'. It may query available cpus for that tid with a
* getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
*/
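/*
 * Editor's note: an illustrative userland sketch (not part of this
 * revision) of the "simple application" pattern recommended above: query
 * the cpus available to the current process with cpuset_getaffinity(2),
 * then pin the calling thread to one of them with an anonymous mask via
 * cpuset_setaffinity(2).  Error handling is deliberately minimal.
 */
#include <sys/param.h>
#include <sys/cpuset.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	cpuset_t mask;
	int cpu;

	if (cpuset_getaffinity(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_getaffinity");
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &mask))
			break;
	printf("pinning current thread to cpu %d\n", cpu);
	CPU_ZERO(&mask);
	CPU_SET(cpu, &mask);
	/* -1 with CPU_WHICH_TID means "the calling thread". */
	if (cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1,
	    sizeof(mask), &mask) != 0)
		err(1, "cpuset_setaffinity");
	return (0);
}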
static uma_zone_t cpuset_zone;
static struct mtx cpuset_lock;
static struct setlist cpuset_ids;
static struct unrhdr *cpuset_unr;
static struct cpuset *cpuset_zero;
/* Return the size of cpuset_t at the kernel level */
SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD,
0, sizeof(cpuset_t), "sizeof(cpuset_t)");
cpuset_t *cpuset_root;
/*
* Acquire a reference to a cpuset; all pointers must be tracked with refs.
*/
struct cpuset *
cpuset_ref(struct cpuset *set)
{
refcount_acquire(&set->cs_ref);
return (set);
}
/*
* Walks up the tree from 'set' to find the root. Returns the root
* referenced.
*/
static struct cpuset *
cpuset_refroot(struct cpuset *set)
{
for (; set->cs_parent != NULL; set = set->cs_parent)
if (set->cs_flags & CPU_SET_ROOT)
break;
cpuset_ref(set);
return (set);
}
/*
* Find the first non-anonymous set starting from 'set'. Returns this set
* referenced. May return the passed in set with an extra ref if it is
* not anonymous.
*/
static struct cpuset *
cpuset_refbase(struct cpuset *set)
{
if (set->cs_id == CPUSET_INVALID)
set = set->cs_parent;
cpuset_ref(set);
return (set);
}
/*
* Release a reference in a context where it is safe to allocate.
*/
void
cpuset_rel(struct cpuset *set)
{
cpusetid_t id;
if (refcount_release(&set->cs_ref) == 0)
return;
mtx_lock_spin(&cpuset_lock);
LIST_REMOVE(set, cs_siblings);
id = set->cs_id;
if (id != CPUSET_INVALID)
LIST_REMOVE(set, cs_link);
mtx_unlock_spin(&cpuset_lock);
cpuset_rel(set->cs_parent);
uma_zfree(cpuset_zone, set);
if (id != CPUSET_INVALID)
free_unr(cpuset_unr, id);
}
/*
* Deferred release must be used when in a context that is not safe to
* allocate/free. This places any unreferenced sets on the list 'head'.
*/
static void
cpuset_rel_defer(struct setlist *head, struct cpuset *set)
{
if (refcount_release(&set->cs_ref) == 0)
return;
mtx_lock_spin(&cpuset_lock);
LIST_REMOVE(set, cs_siblings);
if (set->cs_id != CPUSET_INVALID)
LIST_REMOVE(set, cs_link);
LIST_INSERT_HEAD(head, set, cs_link);
mtx_unlock_spin(&cpuset_lock);
}
/*
* Complete a deferred release. Removes the set from the list provided to
* cpuset_rel_defer.
*/
static void
cpuset_rel_complete(struct cpuset *set)
{
LIST_REMOVE(set, cs_link);
cpuset_rel(set->cs_parent);
uma_zfree(cpuset_zone, set);
}
/*
* Find a set based on an id. Returns it with a ref.
*/
static struct cpuset *
cpuset_lookup(cpusetid_t setid, struct thread *td)
{
struct cpuset *set;
if (setid == CPUSET_INVALID)
return (NULL);
mtx_lock_spin(&cpuset_lock);
LIST_FOREACH(set, &cpuset_ids, cs_link)
if (set->cs_id == setid)
break;
if (set)
cpuset_ref(set);
mtx_unlock_spin(&cpuset_lock);
KASSERT(td != NULL, ("[%s:%d] td is NULL", __func__, __LINE__));
if (set != NULL && jailed(td->td_ucred)) {
struct cpuset *jset, *tset;
jset = td->td_ucred->cr_prison->pr_cpuset;
for (tset = set; tset != NULL; tset = tset->cs_parent)
if (tset == jset)
break;
if (tset == NULL) {
cpuset_rel(set);
set = NULL;
}
}
return (set);
}
/*
* Create a set in the space provided in 'set' with the provided parameters.
* The set is returned with a single ref. May return EDEADLK if the set
* will have no valid cpu based on restrictions from the parent.
*/
static int
_cpuset_create(struct cpuset *set, struct cpuset *parent, const cpuset_t *mask,
cpusetid_t id)
{
if (!CPU_OVERLAP(&parent->cs_mask, mask))
return (EDEADLK);
CPU_COPY(mask, &set->cs_mask);
LIST_INIT(&set->cs_children);
refcount_init(&set->cs_ref, 1);
set->cs_flags = 0;
mtx_lock_spin(&cpuset_lock);
CPU_AND(&set->cs_mask, &parent->cs_mask);
set->cs_id = id;
set->cs_parent = cpuset_ref(parent);
LIST_INSERT_HEAD(&parent->cs_children, set, cs_siblings);
if (set->cs_id != CPUSET_INVALID)
LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
mtx_unlock_spin(&cpuset_lock);
return (0);
}
/*
* Create a new non-anonymous set with the requested parent and mask. May
* return failures if the mask is invalid or a new number can not be
* allocated.
*/
static int
cpuset_create(struct cpuset **setp, struct cpuset *parent, const cpuset_t *mask)
{
struct cpuset *set;
cpusetid_t id;
int error;
id = alloc_unr(cpuset_unr);
if (id == -1)
return (ENFILE);
*setp = set = uma_zalloc(cpuset_zone, M_WAITOK);
error = _cpuset_create(set, parent, mask, id);
if (error == 0)
return (0);
free_unr(cpuset_unr, id);
uma_zfree(cpuset_zone, set);
return (error);
}
/*
* Recursively check for errors that would occur from applying mask to
* the tree of sets starting at 'set'. Checks for sets that would become
* empty as well as RDONLY flags.
*/
static int
cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
{
struct cpuset *nset;
cpuset_t newmask;
int error;
mtx_assert(&cpuset_lock, MA_OWNED);
if (set->cs_flags & CPU_SET_RDONLY)
return (EPERM);
if (!CPU_OVERLAP(&set->cs_mask, mask))
return (EDEADLK);
CPU_COPY(&set->cs_mask, &newmask);
CPU_AND(&newmask, mask);
error = 0;
LIST_FOREACH(nset, &set->cs_children, cs_siblings)
if ((error = cpuset_testupdate(nset, &newmask)) != 0)
break;
return (error);
}
/*
* Applies the mask 'mask' without checking for empty sets or permissions.
*/
static void
cpuset_update(struct cpuset *set, cpuset_t *mask)
{
struct cpuset *nset;
mtx_assert(&cpuset_lock, MA_OWNED);
CPU_AND(&set->cs_mask, mask);
LIST_FOREACH(nset, &set->cs_children, cs_siblings)
cpuset_update(nset, &set->cs_mask);
return;
}
/*
* Modify the set 'set' to use a copy of the mask provided. Apply this new
* mask to restrict all children in the tree. Checks for validity before
* applying the changes.
*/
static int
cpuset_modify(struct cpuset *set, cpuset_t *mask)
{
struct cpuset *root;
int error;
error = priv_check(curthread, PRIV_SCHED_CPUSET);
if (error)
return (error);
/*
* In case we are called from within the jail,
* we do not allow modifying the dedicated root
* cpuset of the jail, but we may still allow
* changing child sets.
*/
if (jailed(curthread->td_ucred) &&
set->cs_flags & CPU_SET_ROOT)
return (EPERM);
/*
* Verify that we have access to this set of
* cpus.
*/
root = set->cs_parent;
if (root && !CPU_SUBSET(&root->cs_mask, mask))
return (EINVAL);
mtx_lock_spin(&cpuset_lock);
error = cpuset_testupdate(set, mask);
if (error)
goto out;
cpuset_update(set, mask);
CPU_COPY(mask, &set->cs_mask);
out:
mtx_unlock_spin(&cpuset_lock);
return (error);
}
/*
* Resolve the 'which' parameter of several cpuset apis.
*
* For WHICH_PID and WHICH_TID return a locked proc and valid proc/tid. Also
* checks for permission via p_cansched().
*
* For WHICH_SET returns a valid set with a new reference.
*
* -1 may be supplied for any argument to mean the current proc/thread or
* the base set of the current thread. May fail with ESRCH/EPERM.
*/
static int
cpuset_which(cpuwhich_t which, id_t id, struct proc **pp, struct thread **tdp,
struct cpuset **setp)
{
struct cpuset *set;
struct thread *td;
struct proc *p;
int error;
*pp = p = NULL;
*tdp = td = NULL;
*setp = set = NULL;
switch (which) {
case CPU_WHICH_PID:
if (id == -1) {
PROC_LOCK(curproc);
p = curproc;
break;
}
if ((p = pfind(id)) == NULL)
return (ESRCH);
break;
case CPU_WHICH_TID:
if (id == -1) {
PROC_LOCK(curproc);
p = curproc;
td = curthread;
break;
}
td = tdfind(id, -1);
if (td == NULL)
return (ESRCH);
p = td->td_proc;
break;
case CPU_WHICH_CPUSET:
if (id == -1) {
thread_lock(curthread);
set = cpuset_refbase(curthread->td_cpuset);
thread_unlock(curthread);
} else
set = cpuset_lookup(id, curthread);
if (set) {
*setp = set;
return (0);
}
return (ESRCH);
case CPU_WHICH_JAIL:
{
/* Find `set' for prison with given id. */
struct prison *pr;
sx_slock(&allprison_lock);
pr = prison_find_child(curthread->td_ucred->cr_prison, id);
sx_sunlock(&allprison_lock);
if (pr == NULL)
return (ESRCH);
cpuset_ref(pr->pr_cpuset);
*setp = pr->pr_cpuset;
mtx_unlock(&pr->pr_mtx);
return (0);
}
case CPU_WHICH_IRQ:
return (0);
default:
return (EINVAL);
}
error = p_cansched(curthread, p);
if (error) {
PROC_UNLOCK(p);
return (error);
}
if (td == NULL)
td = FIRST_THREAD_IN_PROC(p);
*pp = p;
*tdp = td;
return (0);
}
/*
* Create an anonymous set with the provided mask in the space provided by
* 'fset'. If the passed in set is anonymous we use its parent otherwise
* the new set is a child of 'set'.
*/
static int
cpuset_shadow(struct cpuset *set, struct cpuset *fset, const cpuset_t *mask)
{
struct cpuset *parent;
if (set->cs_id == CPUSET_INVALID)
parent = set->cs_parent;
else
parent = set;
if (!CPU_SUBSET(&parent->cs_mask, mask))
return (EDEADLK);
return (_cpuset_create(fset, parent, mask, CPUSET_INVALID));
}
/*
* Handle two cases for replacing the base set or mask of an entire process.
*
* 1) Set is non-null and mask is null. This reparents all anonymous sets
* to the provided set and replaces all non-anonymous td_cpusets with the
* provided set.
* 2) Mask is non-null and set is null. This replaces or creates anonymous
* sets for every thread with the existing base as a parent.
*
* This is overly complicated because we can't allocate while holding a
* spinlock and spinlocks must be held while changing and examining thread
* state.
*/
static int
cpuset_setproc(pid_t pid, struct cpuset *set, cpuset_t *mask)
{
struct setlist freelist;
struct setlist droplist;
struct cpuset *tdset;
struct cpuset *nset;
struct thread *td;
struct proc *p;
int threads;
int nfree;
int error;
/*
* The algorithm requires two passes due to locking considerations.
*
* 1) Lookup the process and acquire the locks in the required order.
* 2) If enough cpusets have not been allocated release the locks and
* allocate them. Loop.
*/
LIST_INIT(&freelist);
LIST_INIT(&droplist);
nfree = 0;
for (;;) {
error = cpuset_which(CPU_WHICH_PID, pid, &p, &td, &nset);
if (error)
goto out;
if (nfree >= p->p_numthreads)
break;
threads = p->p_numthreads;
PROC_UNLOCK(p);
for (; nfree < threads; nfree++) {
nset = uma_zalloc(cpuset_zone, M_WAITOK);
LIST_INSERT_HEAD(&freelist, nset, cs_link);
}
}
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Now that the appropriate locks are held and we have enough cpusets,
* make sure the operation will succeed before applying changes. The
* proc lock prevents td_cpuset from changing between calls.
*/
error = 0;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
tdset = td->td_cpuset;
/*
* Verify that a new mask doesn't specify cpus outside of
* the set the thread is a member of.
*/
if (mask) {
if (tdset->cs_id == CPUSET_INVALID)
tdset = tdset->cs_parent;
if (!CPU_SUBSET(&tdset->cs_mask, mask))
error = EDEADLK;
/*
* Verify that a new set won't leave an existing thread
* mask without a cpu to run on. It can, however, restrict
* the set.
*/
} else if (tdset->cs_id == CPUSET_INVALID) {
if (!CPU_OVERLAP(&set->cs_mask, &tdset->cs_mask))
error = EDEADLK;
}
thread_unlock(td);
if (error)
goto unlock_out;
}
/*
* Replace each thread's cpuset while using deferred release. We
* must do this because the thread lock must be held while operating
* on the thread and this limits the type of operations allowed.
*/
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
/*
* If we presently have an anonymous set or are applying a
* mask we must create an anonymous shadow set. That is
* either parented to our existing base or the supplied set.
*
* If we have a base set with no anonymous shadow we simply
* replace it outright.
*/
tdset = td->td_cpuset;
if (tdset->cs_id == CPUSET_INVALID || mask) {
nset = LIST_FIRST(&freelist);
LIST_REMOVE(nset, cs_link);
if (mask)
error = cpuset_shadow(tdset, nset, mask);
else
error = _cpuset_create(nset, set,
&tdset->cs_mask, CPUSET_INVALID);
if (error) {
LIST_INSERT_HEAD(&freelist, nset, cs_link);
thread_unlock(td);
break;
}
} else
nset = cpuset_ref(set);
cpuset_rel_defer(&droplist, tdset);
td->td_cpuset = nset;
sched_affinity(td);
thread_unlock(td);
}
unlock_out:
PROC_UNLOCK(p);
out:
while ((nset = LIST_FIRST(&droplist)) != NULL)
cpuset_rel_complete(nset);
while ((nset = LIST_FIRST(&freelist)) != NULL) {
LIST_REMOVE(nset, cs_link);
uma_zfree(cpuset_zone, nset);
}
return (error);
}
/*
* Calculate the ffs() of the cpuset.
*/
int
cpusetobj_ffs(const cpuset_t *set)
{
size_t i;
int cbit;
cbit = 0;
for (i = 0; i < _NCPUWORDS; i++) {
if (set->__bits[i] != 0) {
cbit = ffsl(set->__bits[i]);
cbit += i * _NCPUBITS;
break;
}
}
return (cbit);
}
/*
* Return a string representing a valid layout for a cpuset_t object.
* It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
*/
char *
cpusetobj_strprint(char *buf, const cpuset_t *set)
{
char *tbuf;
size_t i, bytesp, bufsiz;
tbuf = buf;
bytesp = 0;
bufsiz = CPUSETBUFSIZ;
for (i = _NCPUWORDS - 1; i > 0; i--) {
bytesp = snprintf(tbuf, bufsiz, "%lx, ", set->__bits[i]);
bufsiz -= bytesp;
tbuf += bytesp;
}
snprintf(tbuf, bufsiz, "%lx", set->__bits[0]);
return (buf);
}
/*
* Build a valid cpuset_t object from a string representation.
* It expects an incoming buffer of at least CPUSETBUFSIZ bytes.
*/
int
cpusetobj_strscan(cpuset_t *set, const char *buf)
{
u_int nwords;
int i, ret;
if (strlen(buf) > CPUSETBUFSIZ - 1)
return (-1);
/* Allow passing a shorter version of the mask when necessary. */
nwords = 1;
for (i = 0; buf[i] != '\0'; i++)
if (buf[i] == ',')
nwords++;
if (nwords > _NCPUWORDS)
return (-1);
CPU_ZERO(set);
for (i = nwords - 1; i > 0; i--) {
ret = sscanf(buf, "%lx, ", &set->__bits[i]);
if (ret == 0 || ret == -1)
return (-1);
buf = strstr(buf, " ");
if (buf == NULL)
return (-1);
buf++;
}
ret = sscanf(buf, "%lx", &set->__bits[0]);
if (ret == 0 || ret == -1)
return (-1);
return (0);
}
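/*
 * Editor's note: an illustrative userland mimic (not part of this
 * revision) of the string layout handled by cpusetobj_strprint() and
 * cpusetobj_strscan() above: hex words, most significant first, separated
 * by ", ".  The two-word layout assumes a hypothetical kernel where
 * _NCPUWORDS is 2 (128 cpus with 64-bit words).
 */
#include <stdio.h>

int
main(void)
{
	unsigned long bits[2] = { 0x9UL, 0x1UL };	/* cpus 0, 3 and 64 */
	char buf[64];

	snprintf(buf, sizeof(buf), "%lx, %lx", bits[1], bits[0]);
	printf("printed form: \"%s\"\n", buf);		/* "1, 9" */
	sscanf(buf, "%lx, %lx", &bits[1], &bits[0]);	/* round trip */
	return (0);
}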
/*
* Apply an anonymous mask to a single thread.
*/
int
cpuset_setthread(lwpid_t id, cpuset_t *mask)
{
struct cpuset *nset;
struct cpuset *set;
struct thread *td;
struct proc *p;
int error;
nset = uma_zalloc(cpuset_zone, M_WAITOK);
error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &set);
if (error)
goto out;
set = NULL;
thread_lock(td);
error = cpuset_shadow(td->td_cpuset, nset, mask);
if (error == 0) {
set = td->td_cpuset;
td->td_cpuset = nset;
sched_affinity(td);
nset = NULL;
}
thread_unlock(td);
PROC_UNLOCK(p);
if (set)
cpuset_rel(set);
out:
if (nset)
uma_zfree(cpuset_zone, nset);
return (error);
}
/*
* Creates the cpuset for thread0. We make two sets:
*
* 0 - The root set which should represent all valid processors in the
* system. It is initially created with a mask of all processors
* because we don't know what processors are valid until cpuset_init()
* runs. This set is immutable.
* 1 - The default set which all processes are a member of until changed.
* This allows an administrator to move all threads off of given cpus to
* dedicate them to high priority tasks or save power etc.
*/
struct cpuset *
cpuset_thread0(void)
{
struct cpuset *set;
int error;
cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
mtx_init(&cpuset_lock, "cpuset", NULL, MTX_SPIN | MTX_RECURSE);
/*
* Create the root system set for the whole machine. Doesn't use
* cpuset_create() due to NULL parent.
*/
set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
CPU_FILL(&set->cs_mask);
LIST_INIT(&set->cs_children);
LIST_INSERT_HEAD(&cpuset_ids, set, cs_link);
set->cs_ref = 1;
set->cs_flags = CPU_SET_ROOT;
cpuset_zero = set;
cpuset_root = &set->cs_mask;
/*
* Now derive a default, modifiable set from that to give out.
*/
set = uma_zalloc(cpuset_zone, M_WAITOK);
error = _cpuset_create(set, cpuset_zero, &cpuset_zero->cs_mask, 1);
KASSERT(error == 0, ("Error creating default set: %d\n", error));
/*
* Initialize the unit allocator. 0 and 1 are allocated above.
*/
cpuset_unr = new_unrhdr(2, INT_MAX, NULL);
return (set);
}
/*
* Create a cpuset, which would be cpuset_create() but
* mark the new 'set' as root.
*
* We are not going to reparent the td to it. Use cpuset_setproc_update_set()
* for that.
*
* In case of no error, returns the set in *setp locked with a reference.
*/
int
cpuset_create_root(struct prison *pr, struct cpuset **setp)
{
struct cpuset *set;
int error;
KASSERT(pr != NULL, ("[%s:%d] invalid pr", __func__, __LINE__));
KASSERT(setp != NULL, ("[%s:%d] invalid setp", __func__, __LINE__));
error = cpuset_create(setp, pr->pr_cpuset, &pr->pr_cpuset->cs_mask);
if (error)
return (error);
KASSERT(*setp != NULL, ("[%s:%d] cpuset_create returned invalid data",
__func__, __LINE__));
/* Mark the set as root. */
set = *setp;
set->cs_flags |= CPU_SET_ROOT;
return (0);
}
int
cpuset_setproc_update_set(struct proc *p, struct cpuset *set)
{
int error;
KASSERT(p != NULL, ("[%s:%d] invalid proc", __func__, __LINE__));
KASSERT(set != NULL, ("[%s:%d] invalid set", __func__, __LINE__));
cpuset_ref(set);
error = cpuset_setproc(p->p_pid, set, NULL);
if (error)
return (error);
cpuset_rel(set);
return (0);
}
/*
* This is called once the final set of system cpus is known. Modifies
* the root set and all children and marks the root read-only.
*/
static void
cpuset_init(void *arg)
{
cpuset_t mask;
mask = all_cpus;
if (cpuset_modify(cpuset_zero, &mask))
panic("Can't set initial cpuset mask.\n");
cpuset_zero->cs_flags |= CPU_SET_RDONLY;
}
SYSINIT(cpuset, SI_SUB_SMP, SI_ORDER_ANY, cpuset_init, NULL);
#ifndef _SYS_SYSPROTO_H_
struct cpuset_args {
cpusetid_t *setid;
};
#endif
int
-cpuset(struct thread *td, struct cpuset_args *uap)
+sys_cpuset(struct thread *td, struct cpuset_args *uap)
{
struct cpuset *root;
struct cpuset *set;
int error;
thread_lock(td);
root = cpuset_refroot(td->td_cpuset);
thread_unlock(td);
error = cpuset_create(&set, root, &root->cs_mask);
cpuset_rel(root);
if (error)
return (error);
error = copyout(&set->cs_id, uap->setid, sizeof(set->cs_id));
if (error == 0)
error = cpuset_setproc(-1, set, NULL);
cpuset_rel(set);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_setid_args {
cpuwhich_t which;
id_t id;
cpusetid_t setid;
};
#endif
int
-cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
+sys_cpuset_setid(struct thread *td, struct cpuset_setid_args *uap)
{
struct cpuset *set;
int error;
/*
* Presently we only support per-process sets.
*/
if (uap->which != CPU_WHICH_PID)
return (EINVAL);
set = cpuset_lookup(uap->setid, td);
if (set == NULL)
return (ESRCH);
error = cpuset_setproc(uap->id, set, NULL);
cpuset_rel(set);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_getid_args {
cpulevel_t level;
cpuwhich_t which;
id_t id;
cpusetid_t *setid;
};
#endif
int
-cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
+sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
{
struct cpuset *nset;
struct cpuset *set;
struct thread *ttd;
struct proc *p;
cpusetid_t id;
int error;
if (uap->level == CPU_LEVEL_WHICH && uap->which != CPU_WHICH_CPUSET)
return (EINVAL);
error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
if (error)
return (error);
switch (uap->which) {
case CPU_WHICH_TID:
case CPU_WHICH_PID:
thread_lock(ttd);
set = cpuset_refbase(ttd->td_cpuset);
thread_unlock(ttd);
PROC_UNLOCK(p);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
break;
case CPU_WHICH_IRQ:
return (EINVAL);
}
switch (uap->level) {
case CPU_LEVEL_ROOT:
nset = cpuset_refroot(set);
cpuset_rel(set);
set = nset;
break;
case CPU_LEVEL_CPUSET:
break;
case CPU_LEVEL_WHICH:
break;
}
id = set->cs_id;
cpuset_rel(set);
if (error == 0)
error = copyout(&id, uap->setid, sizeof(id));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_getaffinity_args {
cpulevel_t level;
cpuwhich_t which;
id_t id;
size_t cpusetsize;
cpuset_t *mask;
};
#endif
int
-cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
+sys_cpuset_getaffinity(struct thread *td, struct cpuset_getaffinity_args *uap)
{
struct thread *ttd;
struct cpuset *nset;
struct cpuset *set;
struct proc *p;
cpuset_t *mask;
int error;
size_t size;
if (uap->cpusetsize < sizeof(cpuset_t) ||
uap->cpusetsize > CPU_MAXSIZE / NBBY)
return (ERANGE);
size = uap->cpusetsize;
mask = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
if (error)
goto out;
switch (uap->level) {
case CPU_LEVEL_ROOT:
case CPU_LEVEL_CPUSET:
switch (uap->which) {
case CPU_WHICH_TID:
case CPU_WHICH_PID:
thread_lock(ttd);
set = cpuset_ref(ttd->td_cpuset);
thread_unlock(ttd);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
break;
case CPU_WHICH_IRQ:
error = EINVAL;
goto out;
}
if (uap->level == CPU_LEVEL_ROOT)
nset = cpuset_refroot(set);
else
nset = cpuset_refbase(set);
CPU_COPY(&nset->cs_mask, mask);
cpuset_rel(nset);
break;
case CPU_LEVEL_WHICH:
switch (uap->which) {
case CPU_WHICH_TID:
thread_lock(ttd);
CPU_COPY(&ttd->td_cpuset->cs_mask, mask);
thread_unlock(ttd);
break;
case CPU_WHICH_PID:
FOREACH_THREAD_IN_PROC(p, ttd) {
thread_lock(ttd);
CPU_OR(mask, &ttd->td_cpuset->cs_mask);
thread_unlock(ttd);
}
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
CPU_COPY(&set->cs_mask, mask);
break;
case CPU_WHICH_IRQ:
error = intr_getaffinity(uap->id, mask);
break;
}
break;
default:
error = EINVAL;
break;
}
if (set)
cpuset_rel(set);
if (p)
PROC_UNLOCK(p);
if (error == 0)
error = copyout(mask, uap->mask, size);
out:
free(mask, M_TEMP);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct cpuset_setaffinity_args {
cpulevel_t level;
cpuwhich_t which;
id_t id;
size_t cpusetsize;
const cpuset_t *mask;
};
#endif
int
-cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
+sys_cpuset_setaffinity(struct thread *td, struct cpuset_setaffinity_args *uap)
{
struct cpuset *nset;
struct cpuset *set;
struct thread *ttd;
struct proc *p;
cpuset_t *mask;
int error;
if (uap->cpusetsize < sizeof(cpuset_t) ||
uap->cpusetsize > CPU_MAXSIZE / NBBY)
return (ERANGE);
mask = malloc(uap->cpusetsize, M_TEMP, M_WAITOK | M_ZERO);
error = copyin(uap->mask, mask, uap->cpusetsize);
if (error)
goto out;
/*
* Verify that no high bits are set.
*/
if (uap->cpusetsize > sizeof(cpuset_t)) {
char *end;
char *cp;
end = cp = (char *)&mask->__bits;
end += uap->cpusetsize;
cp += sizeof(cpuset_t);
while (cp != end)
if (*cp++ != 0) {
error = EINVAL;
goto out;
}
}
switch (uap->level) {
case CPU_LEVEL_ROOT:
case CPU_LEVEL_CPUSET:
error = cpuset_which(uap->which, uap->id, &p, &ttd, &set);
if (error)
break;
switch (uap->which) {
case CPU_WHICH_TID:
case CPU_WHICH_PID:
thread_lock(ttd);
set = cpuset_ref(ttd->td_cpuset);
thread_unlock(ttd);
PROC_UNLOCK(p);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
break;
case CPU_WHICH_IRQ:
error = EINVAL;
goto out;
}
if (uap->level == CPU_LEVEL_ROOT)
nset = cpuset_refroot(set);
else
nset = cpuset_refbase(set);
error = cpuset_modify(nset, mask);
cpuset_rel(nset);
cpuset_rel(set);
break;
case CPU_LEVEL_WHICH:
switch (uap->which) {
case CPU_WHICH_TID:
error = cpuset_setthread(uap->id, mask);
break;
case CPU_WHICH_PID:
error = cpuset_setproc(uap->id, NULL, mask);
break;
case CPU_WHICH_CPUSET:
case CPU_WHICH_JAIL:
error = cpuset_which(uap->which, uap->id, &p,
&ttd, &set);
if (error == 0) {
error = cpuset_modify(set, mask);
cpuset_rel(set);
}
break;
case CPU_WHICH_IRQ:
error = intr_setaffinity(uap->id, mask);
break;
default:
error = EINVAL;
break;
}
break;
default:
error = EINVAL;
break;
}
out:
free(mask, M_TEMP);
return (error);
}
#ifdef DDB
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
struct cpuset *set;
int cpu, once;
LIST_FOREACH(set, &cpuset_ids, cs_link) {
db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
set, set->cs_id, set->cs_ref, set->cs_flags,
(set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
db_printf(" mask=");
for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
if (CPU_ISSET(cpu, &set->cs_mask)) {
if (once == 0) {
db_printf("%d", cpu);
once = 1;
} else
db_printf(",%d", cpu);
}
}
db_printf("\n");
if (db_pager_quit)
break;
}
}
#endif /* DDB */
Index: head/sys/kern/kern_descrip.c
===================================================================
--- head/sys/kern/kern_descrip.c (revision 225616)
+++ head/sys/kern/kern_descrip.c (revision 225617)
@@ -1,3904 +1,3904 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/conf.h>
#include <sys/domain.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/selinfo.h>
#include <sys/pipe.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/protosw.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/tty.h>
#include <sys/unistd.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <sys/user.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <security/audit/audit.h>
#include <vm/uma.h>
#include <vm/vm.h>
#include <ddb/ddb.h>
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
"file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
static uma_zone_t file_zone;
/* Flags for do_dup() */
#define DUP_FIXED 0x1 /* Force fixed allocation */
#define DUP_FCNTL 0x2 /* fcntl()-style errors */
static int do_dup(struct thread *td, int flags, int old, int new,
register_t *retval);
static int fd_first_free(struct filedesc *, int, int);
static int fd_last_used(struct filedesc *, int, int);
static void fdgrowtable(struct filedesc *, int);
static void fdunused(struct filedesc *fdp, int fd);
static void fdused(struct filedesc *fdp, int fd);
static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
static int fill_socket_info(struct socket *so, struct kinfo_file *kif);
static int fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
static int fill_procdesc_info(struct procdesc *pdp,
struct kinfo_file *kif);
/*
* A process is initially started out with NDFILE descriptors stored within
* this structure, selected to be enough for typical applications based on
* the historical limit of 20 open files (and the usage of descriptors by
* shells). If these descriptors are exhausted, a larger descriptor table
* may be allocated, up to a process' resource limit; the internal arrays
* are then unused.
*/
#define NDFILE 20
#define NDSLOTSIZE sizeof(NDSLOTTYPE)
#define NDENTRIES (NDSLOTSIZE * __CHAR_BIT)
#define NDSLOT(x) ((x) / NDENTRIES)
#define NDBIT(x) ((NDSLOTTYPE)1 << ((x) % NDENTRIES))
#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
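/*
 * Editor's note (worked example, not part of this revision): assuming
 * NDSLOTTYPE is a 64-bit u_long as on amd64, NDSLOTSIZE is 8 and
 * NDENTRIES is 64, so descriptor 70 lives in bitmap word
 * NDSLOT(70) = 70 / 64 = 1 at bit NDBIT(70) = 1 << (70 % 64) = 1 << 6,
 * and a table of 70 descriptors needs NDSLOTS(70) = (70 + 63) / 64 = 2
 * bitmap words.
 */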
/*
* Storage required per open file descriptor.
*/
#define OFILESIZE (sizeof(struct file *) + sizeof(char))
/*
* Storage to hold unused ofiles that need to be reclaimed.
*/
struct freetable {
struct file **ft_table;
SLIST_ENTRY(freetable) ft_next;
};
/*
* Basic allocation of descriptors:
* one of the above, plus arrays for NDFILE descriptors.
*/
struct filedesc0 {
struct filedesc fd_fd;
/*
* ofiles which need to be reclaimed on free.
*/
SLIST_HEAD(,freetable) fd_free;
/*
* These arrays are used when the number of open files is
* <= NDFILE, and are then pointed to by the pointers above.
*/
struct file *fd_dfiles[NDFILE];
char fd_dfileflags[NDFILE];
NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
};
/*
* Descriptor management.
*/
volatile int openfiles; /* actual number of open files */
struct mtx sigio_lock; /* mtx to protect pointers to sigio */
void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
/* A mutex to protect the association between a proc and filedesc. */
static struct mtx fdesc_mtx;
/*
* Find the first zero bit in the given bitmap, starting at low and not
* exceeding size - 1.
*/
static int
fd_first_free(struct filedesc *fdp, int low, int size)
{
NDSLOTTYPE *map = fdp->fd_map;
NDSLOTTYPE mask;
int off, maxoff;
if (low >= size)
return (low);
off = NDSLOT(low);
if (low % NDENTRIES) {
mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
if ((mask &= ~map[off]) != 0UL)
return (off * NDENTRIES + ffsl(mask) - 1);
++off;
}
for (maxoff = NDSLOTS(size); off < maxoff; ++off)
if (map[off] != ~0UL)
return (off * NDENTRIES + ffsl(~map[off]) - 1);
return (size);
}
/*
* Find the highest non-zero bit in the given bitmap, starting at low and
* not exceeding size - 1.
*/
static int
fd_last_used(struct filedesc *fdp, int low, int size)
{
NDSLOTTYPE *map = fdp->fd_map;
NDSLOTTYPE mask;
int off, minoff;
if (low >= size)
return (-1);
off = NDSLOT(size);
if (size % NDENTRIES) {
mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
if ((mask &= map[off]) != 0)
return (off * NDENTRIES + flsl(mask) - 1);
--off;
}
for (minoff = NDSLOT(low); off >= minoff; --off)
if (map[off] != 0)
return (off * NDENTRIES + flsl(map[off]) - 1);
return (low - 1);
}
static int
fdisused(struct filedesc *fdp, int fd)
{
KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}
/*
* Mark a file descriptor as used.
*/
static void
fdused(struct filedesc *fdp, int fd)
{
FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(!fdisused(fdp, fd),
("fd already used"));
fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
if (fd > fdp->fd_lastfile)
fdp->fd_lastfile = fd;
if (fd == fdp->fd_freefile)
fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
}
/*
* Mark a file descriptor as unused.
*/
static void
fdunused(struct filedesc *fdp, int fd)
{
FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(fdisused(fdp, fd),
("fd is already unused"));
KASSERT(fdp->fd_ofiles[fd] == NULL,
("fd is still in use"));
fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
if (fd < fdp->fd_freefile)
fdp->fd_freefile = fd;
if (fd == fdp->fd_lastfile)
fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
}
/*
* System calls on descriptors.
*/
#ifndef _SYS_SYSPROTO_H_
struct getdtablesize_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getdtablesize(struct thread *td, struct getdtablesize_args *uap)
+sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
{
struct proc *p = td->td_proc;
uint64_t lim;
PROC_LOCK(p);
td->td_retval[0] =
min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
PROC_UNLOCK(p);
if (lim < td->td_retval[0])
td->td_retval[0] = lim;
return (0);
}
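/*
 * Editor's note: an illustrative userland sketch (not part of this
 * revision) comparing the limits reconciled by sys_getdtablesize() above:
 * the RLIMIT_NOFILE soft limit versus the value getdtablesize(2) reports.
 */
#include <sys/types.h>
#include <sys/resource.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		printf("RLIMIT_NOFILE soft limit: %ju\n",
		    (uintmax_t)rl.rlim_cur);
	printf("getdtablesize(): %d\n", getdtablesize());
	return (0);
}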
/*
* Duplicate a file descriptor to a particular value.
*
* Note: keep in mind that a potential race condition exists when closing
* descriptors from a shared descriptor table (via rfork).
*/
#ifndef _SYS_SYSPROTO_H_
struct dup2_args {
u_int from;
u_int to;
};
#endif
/* ARGSUSED */
int
-dup2(struct thread *td, struct dup2_args *uap)
+sys_dup2(struct thread *td, struct dup2_args *uap)
{
return (do_dup(td, DUP_FIXED, (int)uap->from, (int)uap->to,
td->td_retval));
}
/*
* Duplicate a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct dup_args {
u_int fd;
};
#endif
/* ARGSUSED */
int
-dup(struct thread *td, struct dup_args *uap)
+sys_dup(struct thread *td, struct dup_args *uap)
{
return (do_dup(td, 0, (int)uap->fd, 0, td->td_retval));
}
/*
* The file control system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct fcntl_args {
int fd;
int cmd;
long arg;
};
#endif
/* ARGSUSED */
int
-fcntl(struct thread *td, struct fcntl_args *uap)
+sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
struct flock fl;
struct oflock ofl;
intptr_t arg;
int error;
int cmd;
error = 0;
cmd = uap->cmd;
switch (uap->cmd) {
case F_OGETLK:
case F_OSETLK:
case F_OSETLKW:
/*
* Convert old flock structure to new.
*/
error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
fl.l_start = ofl.l_start;
fl.l_len = ofl.l_len;
fl.l_pid = ofl.l_pid;
fl.l_type = ofl.l_type;
fl.l_whence = ofl.l_whence;
fl.l_sysid = 0;
switch (uap->cmd) {
case F_OGETLK:
cmd = F_GETLK;
break;
case F_OSETLK:
cmd = F_SETLK;
break;
case F_OSETLKW:
cmd = F_SETLKW;
break;
}
arg = (intptr_t)&fl;
break;
case F_GETLK:
case F_SETLK:
case F_SETLKW:
case F_SETLK_REMOTE:
error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
arg = (intptr_t)&fl;
break;
default:
arg = uap->arg;
break;
}
if (error)
return (error);
error = kern_fcntl(td, uap->fd, cmd, arg);
if (error)
return (error);
if (uap->cmd == F_OGETLK) {
ofl.l_start = fl.l_start;
ofl.l_len = fl.l_len;
ofl.l_pid = fl.l_pid;
ofl.l_type = fl.l_type;
ofl.l_whence = fl.l_whence;
error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
} else if (uap->cmd == F_GETLK) {
error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
}
return (error);
}
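/*
 * Editor's note: a minimal userland sketch (not part of this revision) of
 * the modern F_SETLK path handled by sys_fcntl()/kern_fcntl() above; the
 * file name is an illustrative assumption.
 */
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
	struct flock fl;
	int fd;

	if ((fd = open("/tmp/lockfile", O_RDWR | O_CREAT, 0644)) < 0)
		err(1, "open");
	fl.l_start = 0;			/* lock the whole file ... */
	fl.l_len = 0;
	fl.l_whence = SEEK_SET;		/* ... from its beginning */
	fl.l_type = F_WRLCK;		/* exclusive (write) lock */
	fl.l_pid = 0;
	fl.l_sysid = 0;
	if (fcntl(fd, F_SETLK, &fl) == -1)
		err(1, "fcntl(F_SETLK)");
	/* ... critical section ... */
	fl.l_type = F_UNLCK;
	(void)fcntl(fd, F_SETLK, &fl);
	close(fd);
	return (0);
}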
static inline struct file *
fdtofp(int fd, struct filedesc *fdp)
{
struct file *fp;
FILEDESC_LOCK_ASSERT(fdp);
if ((unsigned)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL)
return (NULL);
return (fp);
}
static inline int
fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp)
{
*fpp = fdtofp(fd, fdp);
if (*fpp == NULL)
return (EBADF);
#ifdef CAPABILITIES
if ((*fpp)->f_type == DTYPE_CAPABILITY) {
int err = cap_funwrap(*fpp, rights, fpp);
if (err != 0) {
*fpp = NULL;
return (err);
}
}
#endif /* CAPABILITIES */
return (0);
}
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
struct filedesc *fdp;
struct flock *flp;
struct file *fp;
struct proc *p;
char *pop;
struct vnode *vp;
int error, flg, tmp;
int vfslocked;
u_int old, new;
uint64_t bsize;
vfslocked = 0;
error = 0;
flg = F_POSIX;
p = td->td_proc;
fdp = p->p_fd;
switch (cmd) {
case F_DUPFD:
tmp = arg;
error = do_dup(td, DUP_FCNTL, fd, tmp, td->td_retval);
break;
case F_DUP2FD:
tmp = arg;
error = do_dup(td, DUP_FIXED, fd, tmp, td->td_retval);
break;
case F_GETFD:
FILEDESC_SLOCK(fdp);
if ((fp = fdtofp(fd, fdp)) == NULL) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
pop = &fdp->fd_ofileflags[fd];
td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
FILEDESC_SUNLOCK(fdp);
break;
case F_SETFD:
FILEDESC_XLOCK(fdp);
if ((fp = fdtofp(fd, fdp)) == NULL) {
FILEDESC_XUNLOCK(fdp);
error = EBADF;
break;
}
pop = &fdp->fd_ofileflags[fd];
*pop = (*pop &~ UF_EXCLOSE) |
(arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
FILEDESC_XUNLOCK(fdp);
break;
case F_GETFL:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
td->td_retval[0] = OFLAGS(fp->f_flag);
FILEDESC_SUNLOCK(fdp);
break;
case F_SETFL:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
do {
tmp = flg = fp->f_flag;
tmp &= ~FCNTLFLAGS;
tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
tmp = fp->f_flag & FNONBLOCK;
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
if (error) {
fdrop(fp, td);
break;
}
tmp = fp->f_flag & FASYNC;
error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
if (error == 0) {
fdrop(fp, td);
break;
}
atomic_clear_int(&fp->f_flag, FNONBLOCK);
tmp = 0;
(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
fdrop(fp, td);
break;
case F_GETOWN:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
if (error == 0)
td->td_retval[0] = tmp;
fdrop(fp, td);
break;
case F_SETOWN:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
tmp = arg;
error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
fdrop(fp, td);
break;
case F_SETLK_REMOTE:
error = priv_check(td, PRIV_NFS_LOCKD);
if (error)
return (error);
flg = F_REMOTE;
goto do_setlk;
case F_SETLKW:
flg |= F_WAIT;
/* FALLTHROUGH F_SETLK */
case F_SETLK:
do_setlk:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
if (fp->f_type != DTYPE_VNODE) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
flp = (struct flock *)arg;
if (flp->l_whence == SEEK_CUR) {
if (fp->f_offset < 0 ||
(flp->l_start > 0 &&
fp->f_offset > OFF_MAX - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
flp->l_start += fp->f_offset;
}
/*
* VOP_ADVLOCK() may block.
*/
fhold(fp);
FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
switch (flp->l_type) {
case F_RDLCK:
if ((fp->f_flag & FREAD) == 0) {
error = EBADF;
break;
}
PROC_LOCK(p->p_leader);
p->p_leader->p_flag |= P_ADVLOCK;
PROC_UNLOCK(p->p_leader);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
flp, flg);
break;
case F_WRLCK:
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
break;
}
PROC_LOCK(p->p_leader);
p->p_leader->p_flag |= P_ADVLOCK;
PROC_UNLOCK(p->p_leader);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
flp, flg);
break;
case F_UNLCK:
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
flp, flg);
break;
case F_UNLCKSYS:
/*
* Temporary api for testing remote lock
* infrastructure.
*/
if (flg != F_REMOTE) {
error = EINVAL;
break;
}
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
F_UNLCKSYS, flp, flg);
break;
default:
error = EINVAL;
break;
}
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
/* Check for race with close */
FILEDESC_SLOCK(fdp);
if ((unsigned) fd >= fdp->fd_nfiles ||
fp != fdp->fd_ofiles[fd]) {
FILEDESC_SUNLOCK(fdp);
flp->l_whence = SEEK_SET;
flp->l_start = 0;
flp->l_len = 0;
flp->l_type = F_UNLCK;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
F_UNLCK, flp, F_POSIX);
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
} else
FILEDESC_SUNLOCK(fdp);
fdrop(fp, td);
break;
case F_GETLK:
FILEDESC_SLOCK(fdp);
error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
if (error != 0) {
FILEDESC_SUNLOCK(fdp);
break;
}
if (fp->f_type != DTYPE_VNODE) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
flp = (struct flock *)arg;
if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
flp->l_type != F_UNLCK) {
FILEDESC_SUNLOCK(fdp);
error = EINVAL;
break;
}
if (flp->l_whence == SEEK_CUR) {
if ((flp->l_start > 0 &&
fp->f_offset > OFF_MAX - flp->l_start) ||
(flp->l_start < 0 &&
fp->f_offset < OFF_MIN - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
break;
}
flp->l_start += fp->f_offset;
}
/*
* VOP_ADVLOCK() may block.
*/
fhold(fp);
FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
F_POSIX);
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
fdrop(fp, td);
break;
case F_RDAHEAD:
arg = arg ? 128 * 1024: 0;
/* FALLTHROUGH */
case F_READAHEAD:
FILEDESC_SLOCK(fdp);
if ((fp = fdtofp(fd, fdp)) == NULL) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
if (fp->f_type != DTYPE_VNODE) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
fhold(fp);
FILEDESC_SUNLOCK(fdp);
if (arg != 0) {
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_lock(vp, LK_SHARED);
if (error != 0)
goto readahead_vnlock_fail;
bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
VOP_UNLOCK(vp, 0);
fp->f_seqcount = (arg + bsize - 1) / bsize;
do {
new = old = fp->f_flag;
new |= FRDAHEAD;
} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
readahead_vnlock_fail:
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
} else {
do {
new = old = fp->f_flag;
new &= ~FRDAHEAD;
} while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
}
fdrop(fp, td);
break;
default:
error = EINVAL;
break;
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
*/
static int
do_dup(struct thread *td, int flags, int old, int new,
register_t *retval)
{
struct filedesc *fdp;
struct proc *p;
struct file *fp;
struct file *delfp;
int error, holdleaders, maxfd;
p = td->td_proc;
fdp = p->p_fd;
/*
* Verify we have a valid descriptor to dup from and possibly to
* dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
* return EINVAL when the new descriptor is out of bounds.
*/
if (old < 0)
return (EBADF);
if (new < 0)
return (flags & DUP_FCNTL ? EINVAL : EBADF);
PROC_LOCK(p);
maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
PROC_UNLOCK(p);
if (new >= maxfd)
return (flags & DUP_FCNTL ? EINVAL : EMFILE);
FILEDESC_XLOCK(fdp);
if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
if (flags & DUP_FIXED && old == new) {
*retval = new;
FILEDESC_XUNLOCK(fdp);
return (0);
}
fp = fdp->fd_ofiles[old];
fhold(fp);
/*
* If the caller specified a file descriptor, make sure the file
* table is large enough to hold it, and grab it. Otherwise, just
* allocate a new descriptor the usual way. Since the filedesc
* lock may be temporarily dropped in the process, we have to look
* out for a race.
*/
if (flags & DUP_FIXED) {
if (new >= fdp->fd_nfiles) {
/*
* The resource limits are here instead of e.g. fdalloc(),
* because the file descriptor table may be shared between
* processes, so we can't really use racct_add()/racct_sub().
* Instead of counting the number of actually allocated
* descriptors, just put the limit on the size of the file
* descriptor table.
*/
#ifdef RACCT
PROC_LOCK(p);
error = racct_set(p, RACCT_NOFILE, new + 1);
PROC_UNLOCK(p);
if (error != 0) {
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (EMFILE);
}
#endif
fdgrowtable(fdp, new + 1);
}
if (fdp->fd_ofiles[new] == NULL)
fdused(fdp, new);
} else {
if ((error = fdalloc(td, new, &new)) != 0) {
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (error);
}
}
/*
* If the old file changed out from under us then treat it as a
* bad file descriptor. Userland should do its own locking to
* avoid this case.
*/
if (fdp->fd_ofiles[old] != fp) {
/* we've allocated a descriptor which we won't use */
if (fdp->fd_ofiles[new] == NULL)
fdunused(fdp, new);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (EBADF);
}
KASSERT(old != new,
("new fd is same as old"));
/*
* Save info on the descriptor being overwritten. We cannot close
* it without introducing an ownership race for the slot, since we
* need to drop the filedesc lock to call closef().
*
* XXX this duplicates parts of close().
*/
delfp = fdp->fd_ofiles[new];
holdleaders = 0;
if (delfp != NULL) {
if (td->td_proc->p_fdtol != NULL) {
/*
* Ask fdfree() to sleep to ensure that all relevant
* process leaders can be traversed in closef().
*/
fdp->fd_holdleaderscount++;
holdleaders = 1;
}
}
/*
* Duplicate the source descriptor
*/
fdp->fd_ofiles[new] = fp;
fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE;
if (new > fdp->fd_lastfile)
fdp->fd_lastfile = new;
*retval = new;
/*
* If we dup'd over a valid file, we now own the reference to it
* and must dispose of it using closef() semantics (as if a
* close() were performed on it).
*
* XXX this duplicates parts of close().
*/
if (delfp != NULL) {
knote_fdclose(td, new);
if (delfp->f_type == DTYPE_MQUEUE)
mq_fdclose(td, new, delfp);
FILEDESC_XUNLOCK(fdp);
(void) closef(delfp, td);
if (holdleaders) {
FILEDESC_XLOCK(fdp);
fdp->fd_holdleaderscount--;
if (fdp->fd_holdleaderscount == 0 &&
fdp->fd_holdleaderswakeup != 0) {
fdp->fd_holdleaderswakeup = 0;
wakeup(&fdp->fd_holdleaderscount);
}
FILEDESC_XUNLOCK(fdp);
}
} else {
FILEDESC_XUNLOCK(fdp);
}
return (0);
}
/*
* If sigio is on the list associated with a process or process group,
* disable signalling from the device, remove sigio from the list and
* free sigio.
*/
void
funsetown(struct sigio **sigiop)
{
struct sigio *sigio;
SIGIO_LOCK();
sigio = *sigiop;
if (sigio == NULL) {
SIGIO_UNLOCK();
return;
}
*(sigio->sio_myref) = NULL;
if ((sigio)->sio_pgid < 0) {
struct pgrp *pg = (sigio)->sio_pgrp;
PGRP_LOCK(pg);
SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
sigio, sio_pgsigio);
PGRP_UNLOCK(pg);
} else {
struct proc *p = (sigio)->sio_proc;
PROC_LOCK(p);
SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
sigio, sio_pgsigio);
PROC_UNLOCK(p);
}
SIGIO_UNLOCK();
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
}
/*
* Free a list of sigio structures.
* We only need to lock the SIGIO_LOCK because we have made ourselves
* inaccessible to callers of fsetown and therefore do not need to lock
* the proc or pgrp struct for the list manipulation.
*/
void
funsetownlst(struct sigiolst *sigiolst)
{
struct proc *p;
struct pgrp *pg;
struct sigio *sigio;
sigio = SLIST_FIRST(sigiolst);
if (sigio == NULL)
return;
p = NULL;
pg = NULL;
/*
* Every entry of the list should belong
* to a single proc or pgrp.
*/
if (sigio->sio_pgid < 0) {
pg = sigio->sio_pgrp;
PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
} else /* if (sigio->sio_pgid > 0) */ {
p = sigio->sio_proc;
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
}
SIGIO_LOCK();
while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
*(sigio->sio_myref) = NULL;
if (pg != NULL) {
KASSERT(sigio->sio_pgid < 0,
("Proc sigio in pgrp sigio list"));
KASSERT(sigio->sio_pgrp == pg,
("Bogus pgrp in sigio list"));
PGRP_LOCK(pg);
SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
sio_pgsigio);
PGRP_UNLOCK(pg);
} else /* if (p != NULL) */ {
KASSERT(sigio->sio_pgid > 0,
("Pgrp sigio in proc sigio list"));
KASSERT(sigio->sio_proc == p,
("Bogus proc in sigio list"));
PROC_LOCK(p);
SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
sio_pgsigio);
PROC_UNLOCK(p);
}
SIGIO_UNLOCK();
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
SIGIO_LOCK();
}
SIGIO_UNLOCK();
}
/*
* This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
*
* After permission checking, add a sigio structure to the sigio list for
* the process or process group.
*/
int
fsetown(pid_t pgid, struct sigio **sigiop)
{
struct proc *proc;
struct pgrp *pgrp;
struct sigio *sigio;
int ret;
if (pgid == 0) {
funsetown(sigiop);
return (0);
}
ret = 0;
/* Allocate and fill in the new sigio out of locks. */
sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
sigio->sio_pgid = pgid;
sigio->sio_ucred = crhold(curthread->td_ucred);
sigio->sio_myref = sigiop;
sx_slock(&proctree_lock);
if (pgid > 0) {
proc = pfind(pgid);
if (proc == NULL) {
ret = ESRCH;
goto fail;
}
/*
* Policy - Don't allow a process to FSETOWN a process
* in another session.
*
* Remove this test to allow maximum flexibility or
* restrict FSETOWN to the current process or process
* group for maximum safety.
*/
PROC_UNLOCK(proc);
if (proc->p_session != curthread->td_proc->p_session) {
ret = EPERM;
goto fail;
}
pgrp = NULL;
} else /* if (pgid < 0) */ {
pgrp = pgfind(-pgid);
if (pgrp == NULL) {
ret = ESRCH;
goto fail;
}
PGRP_UNLOCK(pgrp);
/*
* Policy - Don't allow a process to FSETOWN a process
* in another session.
*
* Remove this test to allow maximum flexibility or
* restrict FSETOWN to the current process or process
* group for maximum safety.
*/
if (pgrp->pg_session != curthread->td_proc->p_session) {
ret = EPERM;
goto fail;
}
proc = NULL;
}
funsetown(sigiop);
if (pgid > 0) {
PROC_LOCK(proc);
/*
* Since funsetownlst() is called without the proctree
* locked, we need to check for P_WEXIT.
* XXX: is ESRCH correct?
*/
if ((proc->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(proc);
ret = ESRCH;
goto fail;
}
SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
sigio->sio_proc = proc;
PROC_UNLOCK(proc);
} else {
PGRP_LOCK(pgrp);
SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
sigio->sio_pgrp = pgrp;
PGRP_UNLOCK(pgrp);
}
sx_sunlock(&proctree_lock);
SIGIO_LOCK();
*sigiop = sigio;
SIGIO_UNLOCK();
return (0);
fail:
sx_sunlock(&proctree_lock);
crfree(sigio->sio_ucred);
free(sigio, M_SIGIO);
return (ret);
}
/*
* This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
*/
pid_t
fgetown(sigiop)
struct sigio **sigiop;
{
pid_t pgid;
SIGIO_LOCK();
pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
SIGIO_UNLOCK();
return (pgid);
}
/*
* Close a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct close_args {
int fd;
};
#endif
/* ARGSUSED */
int
-close(td, uap)
+sys_close(td, uap)
struct thread *td;
struct close_args *uap;
{
return (kern_close(td, uap->fd));
}
int
kern_close(td, fd)
struct thread *td;
int fd;
{
struct filedesc *fdp;
struct file *fp, *fp_object;
int error;
int holdleaders;
error = 0;
holdleaders = 0;
fdp = td->td_proc->p_fd;
AUDIT_SYSCLOSE(td, fd);
FILEDESC_XLOCK(fdp);
if ((unsigned)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
fdp->fd_ofiles[fd] = NULL;
fdp->fd_ofileflags[fd] = 0;
fdunused(fdp, fd);
if (td->td_proc->p_fdtol != NULL) {
/*
* Ask fdfree() to sleep to ensure that all relevant
* process leaders can be traversed in closef().
*/
fdp->fd_holdleaderscount++;
holdleaders = 1;
}
/*
* We now hold the fp reference that used to be owned by the
* descriptor array. We have to unlock the FILEDESC *AFTER*
* knote_fdclose to prevent a race of the fd getting opened, a knote
* added, and deleting a knote for the new fd.
*/
knote_fdclose(td, fd);
/*
* When we're closing an fd with a capability, we need to notify
* mqueue if the underlying object is of type mqueue.
*/
(void)cap_funwrap(fp, 0, &fp_object);
if (fp_object->f_type == DTYPE_MQUEUE)
mq_fdclose(td, fd, fp_object);
FILEDESC_XUNLOCK(fdp);
error = closef(fp, td);
if (holdleaders) {
FILEDESC_XLOCK(fdp);
fdp->fd_holdleaderscount--;
if (fdp->fd_holdleaderscount == 0 &&
fdp->fd_holdleaderswakeup != 0) {
fdp->fd_holdleaderswakeup = 0;
wakeup(&fdp->fd_holdleaderscount);
}
FILEDESC_XUNLOCK(fdp);
}
return (error);
}
/*
* Close open file descriptors.
*/
#ifndef _SYS_SYSPROTO_H_
struct closefrom_args {
int lowfd;
};
#endif
/* ARGSUSED */
int
-closefrom(struct thread *td, struct closefrom_args *uap)
+sys_closefrom(struct thread *td, struct closefrom_args *uap)
{
struct filedesc *fdp;
int fd;
fdp = td->td_proc->p_fd;
AUDIT_ARG_FD(uap->lowfd);
/*
* Treat negative starting file descriptor values identical to
* closefrom(0) which closes all files.
*/
if (uap->lowfd < 0)
uap->lowfd = 0;
FILEDESC_SLOCK(fdp);
for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
if (fdp->fd_ofiles[fd] != NULL) {
FILEDESC_SUNLOCK(fdp);
(void)kern_close(td, fd);
FILEDESC_SLOCK(fdp);
}
}
FILEDESC_SUNLOCK(fdp);
return (0);
}
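/*
 * Editor's note: an illustrative userland sketch (not part of this
 * revision) of the common idiom served by sys_closefrom() above: keep the
 * standard descriptors and close every other inherited one, e.g. before
 * exec'ing a helper.
 */
#include <unistd.h>

static void
close_inherited(void)
{
	/* Descriptors 0-2 stay open; 3 and above are closed. */
	closefrom(3);
}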
#if defined(COMPAT_43)
/*
* Return status information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct ofstat_args {
int fd;
struct ostat *sb;
};
#endif
/* ARGSUSED */
int
ofstat(struct thread *td, struct ofstat_args *uap)
{
struct ostat oub;
struct stat ub;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error == 0) {
cvtstat(&ub, &oub);
error = copyout(&oub, uap->sb, sizeof(oub));
}
return (error);
}
#endif /* COMPAT_43 */
/*
* Return status information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fstat_args {
int fd;
struct stat *sb;
};
#endif
/* ARGSUSED */
int
-fstat(struct thread *td, struct fstat_args *uap)
+sys_fstat(struct thread *td, struct fstat_args *uap)
{
struct stat ub;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error == 0)
error = copyout(&ub, uap->sb, sizeof(ub));
return (error);
}
int
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
struct file *fp;
int error;
AUDIT_ARG_FD(fd);
if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
error = fo_stat(fp, sbp, td->td_ucred, td);
fdrop(fp, td);
#ifdef KTRACE
if (error == 0 && KTRPOINT(td, KTR_STRUCT))
ktrstat(sbp);
#endif
return (error);
}
/*
* Return status information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct nfstat_args {
int fd;
struct nstat *sb;
};
#endif
/* ARGSUSED */
int
-nfstat(struct thread *td, struct nfstat_args *uap)
+sys_nfstat(struct thread *td, struct nfstat_args *uap)
{
struct nstat nub;
struct stat ub;
int error;
error = kern_fstat(td, uap->fd, &ub);
if (error == 0) {
cvtnstat(&ub, &nub);
error = copyout(&nub, uap->sb, sizeof(nub));
}
return (error);
}
/*
* Return pathconf information about a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fpathconf_args {
int fd;
int name;
};
#endif
/* ARGSUSED */
int
-fpathconf(struct thread *td, struct fpathconf_args *uap)
+sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
{
struct file *fp;
struct vnode *vp;
int error;
if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0)
return (error);
/* If asynchronous I/O is available, it works for all descriptors. */
if (uap->name == _PC_ASYNC_IO) {
td->td_retval[0] = async_io_version;
goto out;
}
vp = fp->f_vnode;
if (vp != NULL) {
int vfslocked;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_PATHCONF(vp, uap->name, td->td_retval);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
if (uap->name != _PC_PIPE_BUF) {
error = EINVAL;
} else {
td->td_retval[0] = PIPE_BUF;
error = 0;
}
} else {
error = EOPNOTSUPP;
}
out:
fdrop(fp, td);
return (error);
}
/*
* Grow the file table to accommodate (at least) nfd descriptors. This may
* block and drop the filedesc lock, but it will reacquire it before
* returning.
*/
static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
struct filedesc0 *fdp0;
struct freetable *fo;
struct file **ntable;
struct file **otable;
char *nfileflags;
int nnfiles, onfiles;
NDSLOTTYPE *nmap;
FILEDESC_XLOCK_ASSERT(fdp);
KASSERT(fdp->fd_nfiles > 0,
("zero-length file table"));
/* compute the size of the new table */
onfiles = fdp->fd_nfiles;
nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
if (nnfiles <= onfiles)
/* the table is already large enough */
return;
/* allocate a new table and (if required) new bitmaps */
FILEDESC_XUNLOCK(fdp);
ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
M_FILEDESC, M_ZERO | M_WAITOK);
nfileflags = (char *)&ntable[nnfiles];
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE,
M_FILEDESC, M_ZERO | M_WAITOK);
else
nmap = NULL;
FILEDESC_XLOCK(fdp);
/*
* We now have new tables ready to go. Since we dropped the
* filedesc lock to call malloc(), watch out for a race.
*/
onfiles = fdp->fd_nfiles;
if (onfiles >= nnfiles) {
/* we lost the race, but that's OK */
free(ntable, M_FILEDESC);
if (nmap != NULL)
free(nmap, M_FILEDESC);
return;
}
bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
otable = fdp->fd_ofiles;
fdp->fd_ofileflags = nfileflags;
fdp->fd_ofiles = ntable;
/*
* We must preserve ofiles until the process exits because we can't
* be certain that no threads have references to the old table via
* _fget().
*/
if (onfiles > NDFILE) {
fo = (struct freetable *)&otable[onfiles];
fdp0 = (struct filedesc0 *)fdp;
fo->ft_table = otable;
SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
}
if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
free(fdp->fd_map, M_FILEDESC);
fdp->fd_map = nmap;
}
fdp->fd_nfiles = nnfiles;
}
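/*
 * Worked example of the round-up above, assuming a 64-bit NDSLOTTYPE (so
 * NDENTRIES == 64): a request for nfd == 65 gives NDSLOTS(65) == 2 and
 * hence nnfiles == 128, i.e. the table always grows to a whole number of
 * bitmap slots.
 */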
/*
* Allocate a file descriptor for the process.
*/
int
fdalloc(struct thread *td, int minfd, int *result)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = p->p_fd;
int fd = -1, maxfd;
#ifdef RACCT
int error;
#endif
FILEDESC_XLOCK_ASSERT(fdp);
if (fdp->fd_freefile > minfd)
minfd = fdp->fd_freefile;
PROC_LOCK(p);
maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
PROC_UNLOCK(p);
/*
* Search the bitmap for a free descriptor. If none is found, try
* to grow the file table. Keep at it until we either get a file
* descriptor or run into process or system limits; fdgrowtable()
* may drop the filedesc lock, so we're in a race.
*/
for (;;) {
fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
if (fd >= maxfd)
return (EMFILE);
if (fd < fdp->fd_nfiles)
break;
#ifdef RACCT
PROC_LOCK(p);
error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
PROC_UNLOCK(p);
if (error != 0)
return (EMFILE);
#endif
fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
}
/*
* Perform some sanity checks, then mark the file descriptor as
* used and return it to the caller.
*/
KASSERT(!fdisused(fdp, fd),
("fd_first_free() returned non-free descriptor"));
KASSERT(fdp->fd_ofiles[fd] == NULL,
("free descriptor isn't"));
fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
fdused(fdp, fd);
*result = fd;
return (0);
}
/*
* Check to see whether n user file descriptors are available to the process
* p.
*/
int
fdavail(struct thread *td, int n)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = td->td_proc->p_fd;
struct file **fpp;
int i, lim, last;
FILEDESC_LOCK_ASSERT(fdp);
/*
* XXX: This is only called from uipc_usrreq.c:unp_externalize();
* call racct_add() from there instead of dealing with containers
* here.
*/
PROC_LOCK(p);
lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
PROC_UNLOCK(p);
if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
return (1);
last = min(fdp->fd_nfiles, lim);
fpp = &fdp->fd_ofiles[fdp->fd_freefile];
for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
if (*fpp == NULL && --n <= 0)
return (1);
}
return (0);
}
/*
* Create a new open file structure and allocate a file descriptor for the
* process that refers to it. We add one reference to the file for the
* descriptor table and one reference for resultfp. This is to prevent us
* from being preempted and having the entry in the descriptor table closed
* after we release the FILEDESC lock.
*/
int
falloc(struct thread *td, struct file **resultfp, int *resultfd, int flags)
{
struct file *fp;
int error, fd;
error = falloc_noinstall(td, &fp);
if (error)
return (error); /* no reference held on error */
error = finstall(td, fp, &fd, flags);
if (error) {
fdrop(fp, td); /* one reference (fp only) */
return (error);
}
if (resultfp != NULL)
*resultfp = fp; /* copy out result */
else
fdrop(fp, td); /* release local reference */
if (resultfd != NULL)
*resultfd = fd;
return (0);
}
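/*
 * A minimal sketch of the usual falloc() consumer pattern; the flags,
 * fileops and data below are illustrative, not taken from any particular
 * subsystem:
 *
 *     struct file *fp;
 *     int error, fd;
 *
 *     error = falloc(td, &fp, &fd, 0);
 *     if (error != 0)
 *         return (error);
 *     finit(fp, FREAD | FWRITE, DTYPE_PIPE, pipe_data, &pipeops);
 *     td->td_retval[0] = fd;
 *     fdrop(fp, td);
 *
 * The final fdrop() releases the local reference; the descriptor table
 * keeps its own reference until the descriptor is closed.
 */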
/*
* Create a new open file structure without allocating a file descriptor.
*/
int
falloc_noinstall(struct thread *td, struct file **resultfp)
{
struct file *fp;
int maxuserfiles = maxfiles - (maxfiles / 20);
static struct timeval lastfail;
static int curfail;
KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
if ((openfiles >= maxuserfiles &&
priv_check(td, PRIV_MAXFILES) != 0) ||
openfiles >= maxfiles) {
if (ppsratecheck(&lastfail, &curfail, 1)) {
printf("kern.maxfiles limit exceeded by uid %i, "
"please see tuning(7).\n", td->td_ucred->cr_ruid);
}
return (ENFILE);
}
atomic_add_int(&openfiles, 1);
fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
refcount_init(&fp->f_count, 1);
fp->f_cred = crhold(td->td_ucred);
fp->f_ops = &badfileops;
fp->f_data = NULL;
fp->f_vnode = NULL;
*resultfp = fp;
return (0);
}
/*
* Install a file in a file descriptor table.
*/
int
finstall(struct thread *td, struct file *fp, int *fd, int flags)
{
struct filedesc *fdp = td->td_proc->p_fd;
int error;
KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
FILEDESC_XLOCK(fdp);
if ((error = fdalloc(td, 0, fd))) {
FILEDESC_XUNLOCK(fdp);
return (error);
}
fhold(fp);
fdp->fd_ofiles[*fd] = fp;
if ((flags & O_CLOEXEC) != 0)
fdp->fd_ofileflags[*fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
return (0);
}
/*
* Build a new filedesc structure from another.
* Copy the current, root, and jail root vnode references.
*/
struct filedesc *
fdinit(struct filedesc *fdp)
{
struct filedesc0 *newfdp;
newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
FILEDESC_LOCK_INIT(&newfdp->fd_fd);
if (fdp != NULL) {
FILEDESC_XLOCK(fdp);
newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
if (newfdp->fd_fd.fd_cdir)
VREF(newfdp->fd_fd.fd_cdir);
newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
if (newfdp->fd_fd.fd_rdir)
VREF(newfdp->fd_fd.fd_rdir);
newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
if (newfdp->fd_fd.fd_jdir)
VREF(newfdp->fd_fd.fd_jdir);
FILEDESC_XUNLOCK(fdp);
}
/* Create the file descriptor table. */
newfdp->fd_fd.fd_refcnt = 1;
newfdp->fd_fd.fd_holdcnt = 1;
newfdp->fd_fd.fd_cmask = CMASK;
newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
newfdp->fd_fd.fd_nfiles = NDFILE;
newfdp->fd_fd.fd_map = newfdp->fd_dmap;
newfdp->fd_fd.fd_lastfile = -1;
return (&newfdp->fd_fd);
}
static struct filedesc *
fdhold(struct proc *p)
{
struct filedesc *fdp;
mtx_lock(&fdesc_mtx);
fdp = p->p_fd;
if (fdp != NULL)
fdp->fd_holdcnt++;
mtx_unlock(&fdesc_mtx);
return (fdp);
}
static void
fddrop(struct filedesc *fdp)
{
struct filedesc0 *fdp0;
struct freetable *ft;
int i;
mtx_lock(&fdesc_mtx);
i = --fdp->fd_holdcnt;
mtx_unlock(&fdesc_mtx);
if (i > 0)
return;
FILEDESC_LOCK_DESTROY(fdp);
fdp0 = (struct filedesc0 *)fdp;
while ((ft = SLIST_FIRST(&fdp0->fd_free)) != NULL) {
SLIST_REMOVE_HEAD(&fdp0->fd_free, ft_next);
free(ft->ft_table, M_FILEDESC);
}
free(fdp, M_FILEDESC);
}
/*
* Share a filedesc structure.
*/
struct filedesc *
fdshare(struct filedesc *fdp)
{
FILEDESC_XLOCK(fdp);
fdp->fd_refcnt++;
FILEDESC_XUNLOCK(fdp);
return (fdp);
}
/*
* Unshare a filedesc structure, if necessary by making a copy
*/
void
fdunshare(struct proc *p, struct thread *td)
{
FILEDESC_XLOCK(p->p_fd);
if (p->p_fd->fd_refcnt > 1) {
struct filedesc *tmp;
FILEDESC_XUNLOCK(p->p_fd);
tmp = fdcopy(p->p_fd);
fdfree(td);
p->p_fd = tmp;
} else
FILEDESC_XUNLOCK(p->p_fd);
}
/*
* Copy a filedesc structure. A NULL pointer argument returns a NULL
* reference; this is to ease callers, not to catch errors.
*/
struct filedesc *
fdcopy(struct filedesc *fdp)
{
struct filedesc *newfdp;
int i;
/* Certain daemons might not have file descriptors. */
if (fdp == NULL)
return (NULL);
newfdp = fdinit(fdp);
FILEDESC_SLOCK(fdp);
while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
FILEDESC_SUNLOCK(fdp);
FILEDESC_XLOCK(newfdp);
fdgrowtable(newfdp, fdp->fd_lastfile + 1);
FILEDESC_XUNLOCK(newfdp);
FILEDESC_SLOCK(fdp);
}
/* copy all passable descriptors (i.e. not kqueue) */
newfdp->fd_freefile = -1;
for (i = 0; i <= fdp->fd_lastfile; ++i) {
if (fdisused(fdp, i) &&
(fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) &&
fdp->fd_ofiles[i]->f_ops != &badfileops) {
newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
fhold(newfdp->fd_ofiles[i]);
newfdp->fd_lastfile = i;
} else {
if (newfdp->fd_freefile == -1)
newfdp->fd_freefile = i;
}
}
newfdp->fd_cmask = fdp->fd_cmask;
FILEDESC_SUNLOCK(fdp);
FILEDESC_XLOCK(newfdp);
for (i = 0; i <= newfdp->fd_lastfile; ++i)
if (newfdp->fd_ofiles[i] != NULL)
fdused(newfdp, i);
if (newfdp->fd_freefile == -1)
newfdp->fd_freefile = i;
FILEDESC_XUNLOCK(newfdp);
return (newfdp);
}
/*
* Release a filedesc structure.
*/
void
fdfree(struct thread *td)
{
struct filedesc *fdp;
struct file **fpp;
int i, locked;
struct filedesc_to_leader *fdtol;
struct file *fp;
struct vnode *cdir, *jdir, *rdir, *vp;
struct flock lf;
/* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return;
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_NOFILE, 0);
PROC_UNLOCK(td->td_proc);
#endif
/* Check for special need to clear POSIX style locks */
fdtol = td->td_proc->p_fdtol;
if (fdtol != NULL) {
FILEDESC_XLOCK(fdp);
KASSERT(fdtol->fdl_refcount > 0,
("filedesc_to_refcount botch: fdl_refcount=%d",
fdtol->fdl_refcount));
if (fdtol->fdl_refcount == 1 &&
(td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
for (i = 0, fpp = fdp->fd_ofiles;
i <= fdp->fd_lastfile;
i++, fpp++) {
if (*fpp == NULL ||
(*fpp)->f_type != DTYPE_VNODE)
continue;
fp = *fpp;
fhold(fp);
FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
vp = fp->f_vnode;
locked = VFS_LOCK_GIANT(vp->v_mount);
(void) VOP_ADVLOCK(vp,
(caddr_t)td->td_proc->
p_leader,
F_UNLCK,
&lf,
F_POSIX);
VFS_UNLOCK_GIANT(locked);
FILEDESC_XLOCK(fdp);
fdrop(fp, td);
fpp = fdp->fd_ofiles + i;
}
}
retry:
if (fdtol->fdl_refcount == 1) {
if (fdp->fd_holdleaderscount > 0 &&
(td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
/*
* close() or do_dup() has cleared a reference
* in a shared file descriptor table.
*/
fdp->fd_holdleaderswakeup = 1;
sx_sleep(&fdp->fd_holdleaderscount,
FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
goto retry;
}
if (fdtol->fdl_holdcount > 0) {
/*
* Ensure that fdtol->fdl_leader remains
* valid in closef().
*/
fdtol->fdl_wakeup = 1;
sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
"fdlhold", 0);
goto retry;
}
}
fdtol->fdl_refcount--;
if (fdtol->fdl_refcount == 0 &&
fdtol->fdl_holdcount == 0) {
fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
} else
fdtol = NULL;
td->td_proc->p_fdtol = NULL;
FILEDESC_XUNLOCK(fdp);
if (fdtol != NULL)
free(fdtol, M_FILEDESC_TO_LEADER);
}
FILEDESC_XLOCK(fdp);
i = --fdp->fd_refcnt;
FILEDESC_XUNLOCK(fdp);
if (i > 0)
return;
fpp = fdp->fd_ofiles;
for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
if (*fpp) {
FILEDESC_XLOCK(fdp);
fp = *fpp;
*fpp = NULL;
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
}
}
FILEDESC_XLOCK(fdp);
/* XXX This should happen earlier. */
mtx_lock(&fdesc_mtx);
td->td_proc->p_fd = NULL;
mtx_unlock(&fdesc_mtx);
if (fdp->fd_nfiles > NDFILE)
free(fdp->fd_ofiles, M_FILEDESC);
if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
free(fdp->fd_map, M_FILEDESC);
fdp->fd_nfiles = 0;
cdir = fdp->fd_cdir;
fdp->fd_cdir = NULL;
rdir = fdp->fd_rdir;
fdp->fd_rdir = NULL;
jdir = fdp->fd_jdir;
fdp->fd_jdir = NULL;
FILEDESC_XUNLOCK(fdp);
if (cdir) {
locked = VFS_LOCK_GIANT(cdir->v_mount);
vrele(cdir);
VFS_UNLOCK_GIANT(locked);
}
if (rdir) {
locked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
VFS_UNLOCK_GIANT(locked);
}
if (jdir) {
locked = VFS_LOCK_GIANT(jdir->v_mount);
vrele(jdir);
VFS_UNLOCK_GIANT(locked);
}
fddrop(fdp);
}
/*
* For setugid programs, we don't want people to use that setugidness
* to generate error messages which write to a file which would otherwise
* be off-limits to the process. We check for filesystems where
* the vnode can change out from under us after execve (like [lin]procfs).
*
* Since setugidsafety calls this only for fd 0, 1 and 2, this check is
* sufficient. We also don't check for setugidness since we know we are.
*/
static int
is_unsafe(struct file *fp)
{
if (fp->f_type == DTYPE_VNODE) {
struct vnode *vp = fp->f_vnode;
if ((vp->v_vflag & VV_PROCDEP) != 0)
return (1);
}
return (0);
}
/*
* Make this setugid thing safe, if at all possible.
*/
void
setugidsafety(struct thread *td)
{
struct filedesc *fdp;
int i;
/* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return;
/*
* Note: fdp->fd_ofiles may be reallocated out from under us while
* we are blocked in a close. Be careful!
*/
FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (i > 2)
break;
if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
struct file *fp;
knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
*/
fp = fdp->fd_ofiles[i];
fdp->fd_ofiles[i] = NULL;
fdp->fd_ofileflags[i] = 0;
fdunused(fdp, i);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
FILEDESC_XLOCK(fdp);
}
}
FILEDESC_XUNLOCK(fdp);
}
/*
* If a specific file object occupies a specific file descriptor, close the
* file descriptor entry and drop a reference on the file object. This is a
* convenience function to handle a subsequent error in a function that calls
* falloc() that handles the race that another thread might have closed the
* file descriptor out from under the thread creating the file object.
*/
void
fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
{
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[idx] == fp) {
fdp->fd_ofiles[idx] = NULL;
fdunused(fdp, idx);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
} else
FILEDESC_XUNLOCK(fdp);
}
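/*
 * A minimal sketch of the recovery pattern described above; some_setup()
 * stands in for whatever initialization might fail after falloc():
 *
 *     error = falloc(td, &fp, &fd, 0);
 *     if (error != 0)
 *         return (error);
 *     error = some_setup(fp);
 *     if (error != 0) {
 *         fdclose(fdp, fp, fd, td);
 *         fdrop(fp, td);
 *         return (error);
 *     }
 */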
/*
* Close any files on exec?
*/
void
fdcloseexec(struct thread *td)
{
struct filedesc *fdp;
int i;
/* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return;
FILEDESC_XLOCK(fdp);
/*
* We cannot cache fd_ofiles or fd_ofileflags since operations
* may block and rip them out from under us.
*/
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (fdp->fd_ofiles[i] != NULL &&
(fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
(fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
struct file *fp;
knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
*/
fp = fdp->fd_ofiles[i];
fdp->fd_ofiles[i] = NULL;
fdp->fd_ofileflags[i] = 0;
fdunused(fdp, i);
if (fp->f_type == DTYPE_MQUEUE)
mq_fdclose(td, i, fp);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
FILEDESC_XLOCK(fdp);
}
}
FILEDESC_XUNLOCK(fdp);
}
/*
* It is unsafe for set[ug]id processes to be started with file
* descriptors 0..2 closed, as these descriptors are given implicit
* significance in the Standard C library. fdcheckstd() will create a
* descriptor referencing /dev/null for each of stdin, stdout, and
* stderr that is not already open.
*/
int
fdcheckstd(struct thread *td)
{
struct filedesc *fdp;
register_t retval, save;
int i, error, devnull;
fdp = td->td_proc->p_fd;
if (fdp == NULL)
return (0);
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
devnull = -1;
error = 0;
for (i = 0; i < 3; i++) {
if (fdp->fd_ofiles[i] != NULL)
continue;
if (devnull < 0) {
save = td->td_retval[0];
error = kern_open(td, "/dev/null", UIO_SYSSPACE,
O_RDWR, 0);
devnull = td->td_retval[0];
td->td_retval[0] = save;
if (error)
break;
KASSERT(devnull == i, ("oof, we didn't get our fd"));
} else {
error = do_dup(td, DUP_FIXED, devnull, i, &retval);
if (error != 0)
break;
}
}
return (error);
}
/*
* Internal form of close. Decrement reference count on file structure.
* Note: td may be NULL when closing a file that was being passed in a
* message.
*
* XXXRW: Giant is not required for the caller, but often will be held; this
* makes it moderately likely the Giant will be recursed in the VFS case.
*/
int
closef(struct file *fp, struct thread *td)
{
struct vnode *vp;
struct flock lf;
struct filedesc_to_leader *fdtol;
struct filedesc *fdp;
struct file *fp_object;
/*
* POSIX record locking dictates that any close releases ALL
* locks owned by this process. This is handled by setting
* a flag in the unlock to free ONLY locks obeying POSIX
* semantics, and not to free BSD-style file locks.
* If the descriptor was in a message, POSIX-style locks
* aren't passed with the descriptor, and the thread pointer
* will be NULL. Callers should be careful only to pass a
* NULL thread pointer when there really is no owning
* context that might have locks, or the locks will be
* leaked.
*
* If this is a capability, we do lock processing under the underlying
* node, not the capability itself.
*/
(void)cap_funwrap(fp, 0, &fp_object);
if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
int vfslocked;
vp = fp_object->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
F_UNLCK, &lf, F_POSIX);
}
fdtol = td->td_proc->p_fdtol;
if (fdtol != NULL) {
/*
* Handle special case where file descriptor table is
* shared between multiple process leaders.
*/
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
for (fdtol = fdtol->fdl_next;
fdtol != td->td_proc->p_fdtol;
fdtol = fdtol->fdl_next) {
if ((fdtol->fdl_leader->p_flag &
P_ADVLOCK) == 0)
continue;
fdtol->fdl_holdcount++;
FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
vp = fp_object->f_vnode;
(void) VOP_ADVLOCK(vp,
(caddr_t)fdtol->fdl_leader,
F_UNLCK, &lf, F_POSIX);
FILEDESC_XLOCK(fdp);
fdtol->fdl_holdcount--;
if (fdtol->fdl_holdcount == 0 &&
fdtol->fdl_wakeup != 0) {
fdtol->fdl_wakeup = 0;
wakeup(fdtol);
}
}
FILEDESC_XUNLOCK(fdp);
}
VFS_UNLOCK_GIANT(vfslocked);
}
return (fdrop(fp, td));
}
/*
* Initialize the file pointer with the specified properties.
*
* The ops are set with release semantics to be certain that the flags, type,
* and data are visible when ops is. This is to prevent ops methods from being
* called with bad data.
*/
void
finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
{
fp->f_data = data;
fp->f_flag = flag;
fp->f_type = type;
atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
}
struct file *
fget_unlocked(struct filedesc *fdp, int fd)
{
struct file *fp;
u_int count;
if (fd < 0 || fd >= fdp->fd_nfiles)
return (NULL);
/*
* Fetch the descriptor locklessly. We avoid fdrop() races by never
* raising a refcount that has already dropped to 0. To accomplish this
* we have to use a cmpset loop rather than an atomic_add. The descriptor
* must be re-verified once we acquire a reference to be certain
* that the identity is still correct and we did not lose a race
* due to preemption.
*/
for (;;) {
fp = fdp->fd_ofiles[fd];
if (fp == NULL)
break;
count = fp->f_count;
if (count == 0)
continue;
/*
* Use an acquire barrier to prevent caching of fd_ofiles
* so it is refreshed for verification.
*/
if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
continue;
if (fp == fdp->fd_ofiles[fd])
break;
fdrop(fp, curthread);
}
return (fp);
}
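/*
 * A minimal sketch of how a caller uses the lockless helper above; on
 * success the caller owns a reference and must release it with fdrop():
 *
 *     fp = fget_unlocked(fdp, fd);
 *     if (fp == NULL)
 *         return (EBADF);
 *     ... use fp ...
 *     fdrop(fp, td);
 */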
/*
* Extract the file pointer associated with the specified descriptor for the
* current user process.
*
* If the descriptor doesn't exist or doesn't match 'flags', EBADF is
* returned.
*
* If the FGET_GETCAP flag is set, the capability itself will be returned.
* Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL.
* Otherwise, if the file is a capability, its rights will be checked against
* the capability rights mask, and if successful, the object will be unwrapped.
*
* If an error occurred, the non-zero error is returned and *fpp is set to
* NULL. Otherwise *fpp is held and set and zero is returned. Caller is
* responsible for fdrop().
*/
#define FGET_GETCAP 0x00000001
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags,
cap_rights_t needrights, cap_rights_t *haverightsp, u_char *maxprotp,
int fget_flags)
{
struct filedesc *fdp;
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
int error;
#endif
*fpp = NULL;
if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
return (EBADF);
if ((fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
if (fp->f_ops == &badfileops) {
fdrop(fp, td);
return (EBADF);
}
#ifdef CAPABILITIES
/*
* If this is a capability, what rights does it have?
*/
if (haverightsp != NULL) {
if (fp->f_type == DTYPE_CAPABILITY)
*haverightsp = cap_rights(fp);
else
*haverightsp = CAP_MASK_VALID;
}
/*
* If a capability has been requested, return the capability directly.
* Otherwise, check capability rights, extract the underlying object,
* and check its access flags.
*/
if (fget_flags & FGET_GETCAP) {
if (fp->f_type != DTYPE_CAPABILITY) {
fdrop(fp, td);
return (EINVAL);
}
} else {
if (maxprotp == NULL)
error = cap_funwrap(fp, needrights, &fp_fromcap);
else
error = cap_funwrap_mmap(fp, needrights, maxprotp,
&fp_fromcap);
if (error) {
fdrop(fp, td);
return (error);
}
/*
* If we've unwrapped a file, drop the original capability
* and hold the new descriptor. fp after this point refers to
* the actual (unwrapped) object, not the capability.
*/
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, td);
fp = fp_fromcap;
}
}
#else /* !CAPABILITIES */
KASSERT(fp->f_type != DTYPE_CAPABILITY,
("%s: saw capability", __func__));
if (maxprotp != NULL)
*maxprotp = VM_PROT_ALL;
#endif /* CAPABILITIES */
/*
* FREAD and FWRITE failure return EBADF as per POSIX.
*
* Only one flag, or 0, may be specified.
*/
if ((flags == FREAD && (fp->f_flag & FREAD) == 0) ||
(flags == FWRITE && (fp->f_flag & FWRITE) == 0)) {
fdrop(fp, td);
return (EBADF);
}
*fpp = fp;
return (0);
}
int
fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{
return(_fget(td, fd, fpp, 0, rights, NULL, NULL, 0));
}
int
fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp,
struct file **fpp)
{
return (_fget(td, fd, fpp, 0, rights, NULL, maxprotp, 0));
}
int
fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{
return(_fget(td, fd, fpp, FREAD, rights, NULL, NULL, 0));
}
int
fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
{
return (_fget(td, fd, fpp, FWRITE, rights, NULL, NULL, 0));
}
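/*
 * Typical caller pattern for the wrappers above, as a minimal sketch
 * (CAP_READ is illustrative; callers pass whatever capability rights the
 * operation actually requires):
 *
 *     error = fget_read(td, fd, CAP_READ, &fp);
 *     if (error != 0)
 *         return (error);
 *     ... operate on fp ...
 *     fdrop(fp, td);
 */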
/*
* Unlike the other fget() calls, which accept and check capability rights
* but never return capabilities, fgetcap() returns the capability but doesn't
* check capability rights.
*/
int
fgetcap(struct thread *td, int fd, struct file **fpp)
{
return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP));
}
/*
* Like fget() but loads the underlying vnode, or returns an error if the
* descriptor does not represent a vnode. Note that pipes use vnodes but
* never have VM objects. The returned vnode will be vref()'d.
*
* XXX: what about the unused flags ?
*/
static __inline int
_fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights,
cap_rights_t *haverightsp, struct vnode **vpp)
{
struct file *fp;
int error;
*vpp = NULL;
if ((error = _fget(td, fd, &fp, flags, needrights, haverightsp,
NULL, 0)) != 0)
return (error);
if (fp->f_vnode == NULL) {
error = EINVAL;
} else {
*vpp = fp->f_vnode;
vref(*vpp);
}
fdrop(fp, td);
return (error);
}
int
fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
{
return (_fgetvp(td, fd, 0, rights, NULL, vpp));
}
int
fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have,
struct vnode **vpp)
{
return (_fgetvp(td, fd, 0, need, have, vpp));
}
int
fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
{
return (_fgetvp(td, fd, FREAD, rights, NULL, vpp));
}
#ifdef notyet
int
fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
struct vnode **vpp)
{
return (_fgetvp(td, fd, FWRITE, rights, NULL, vpp));
}
#endif
/*
* Like fget() but loads the underlying socket, or returns an error if the
* descriptor does not represent a socket.
*
* We bump the ref count on the returned socket. XXX Also obtain the SX lock
* in the future.
*
* Note: fgetsock() and fputsock() are deprecated, as consumers should rely
* on their file descriptor reference to prevent the socket from being free'd
* during use.
*/
int
fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp,
u_int *fflagp)
{
struct file *fp;
int error;
*spp = NULL;
if (fflagp != NULL)
*fflagp = 0;
if ((error = _fget(td, fd, &fp, 0, rights, NULL, NULL, 0)) != 0)
return (error);
if (fp->f_type != DTYPE_SOCKET) {
error = ENOTSOCK;
} else {
*spp = fp->f_data;
if (fflagp)
*fflagp = fp->f_flag;
SOCK_LOCK(*spp);
soref(*spp);
SOCK_UNLOCK(*spp);
}
fdrop(fp, td);
return (error);
}
/*
* Drop the reference count on the socket and XXX release the SX lock in the
* future. The last reference closes the socket.
*
* Note: fputsock() is deprecated, see comment for fgetsock().
*/
void
fputsock(struct socket *so)
{
ACCEPT_LOCK();
SOCK_LOCK(so);
CURVNET_SET(so->so_vnet);
sorele(so);
CURVNET_RESTORE();
}
/*
* Handle the last reference to a file being closed.
*
* No special capability handling here, as the capability's fo_close will run
* instead of the object here, and perform any necessary drop on the object.
*/
int
_fdrop(struct file *fp, struct thread *td)
{
int error;
error = 0;
if (fp->f_count != 0)
panic("fdrop: count %d", fp->f_count);
if (fp->f_ops != &badfileops)
error = fo_close(fp, td);
/*
* The f_cdevpriv cannot be assigned non-NULL value while we
* are destroying the file.
*/
if (fp->f_cdevpriv != NULL)
devfs_fpdrop(fp);
atomic_subtract_int(&openfiles, 1);
crfree(fp->f_cred);
uma_zfree(file_zone, fp);
return (error);
}
/*
* Apply an advisory lock on a file descriptor.
*
* Just attempt to get a record lock of the requested type on the entire file
* (l_whence = SEEK_SET, l_start = 0, l_len = 0).
*/
#ifndef _SYS_SYSPROTO_H_
struct flock_args {
int fd;
int how;
};
#endif
/* ARGSUSED */
int
-flock(struct thread *td, struct flock_args *uap)
+sys_flock(struct thread *td, struct flock_args *uap)
{
struct file *fp;
struct vnode *vp;
struct flock lf;
int vfslocked;
int error;
if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0)
return (error);
if (fp->f_type != DTYPE_VNODE) {
fdrop(fp, td);
return (EOPNOTSUPP);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
if (uap->how & LOCK_UN) {
lf.l_type = F_UNLCK;
atomic_clear_int(&fp->f_flag, FHASLOCK);
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
goto done2;
}
if (uap->how & LOCK_EX)
lf.l_type = F_WRLCK;
else if (uap->how & LOCK_SH)
lf.l_type = F_RDLCK;
else {
error = EBADF;
goto done2;
}
atomic_set_int(&fp->f_flag, FHASLOCK);
error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
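/*
 * Illustrative userland usage, as a minimal sketch:
 *
 *     if (flock(fd, LOCK_EX | LOCK_NB) == -1)
 *         err(1, "flock");
 *     ... critical section ...
 *     flock(fd, LOCK_UN);
 *
 * Note that LOCK_UN is tested first above, so combining it with LOCK_EX
 * or LOCK_SH simply unlocks.
 */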
/*
* Duplicate the specified descriptor to a free descriptor.
*/
int
dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
{
struct file *wfp;
struct file *fp;
/*
* If the to-be-dup'd fd number is greater than the allowed number
* of file descriptors, or the fd to be dup'd has already been
* closed, then reject.
*/
FILEDESC_XLOCK(fdp);
if (dfd < 0 || dfd >= fdp->fd_nfiles ||
(wfp = fdp->fd_ofiles[dfd]) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
/*
* There are two cases of interest here.
*
* For ENODEV simply dup (dfd) to file descriptor (indx) and return.
*
* For ENXIO steal away the file structure from (dfd) and store it in
* (indx). (dfd) is effectively closed by this operation.
*
* Any other error code is just returned.
*/
switch (error) {
case ENODEV:
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
FILEDESC_XUNLOCK(fdp);
return (EACCES);
}
fp = fdp->fd_ofiles[indx];
fdp->fd_ofiles[indx] = wfp;
fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
if (fp == NULL)
fdused(fdp, indx);
fhold(wfp);
FILEDESC_XUNLOCK(fdp);
if (fp != NULL)
/*
* We now own the reference to fp that the ofiles[]
* array used to own. Release it.
*/
fdrop(fp, td);
return (0);
case ENXIO:
/*
* Steal away the file pointer from dfd and stuff it into indx.
*/
fp = fdp->fd_ofiles[indx];
fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
fdp->fd_ofiles[dfd] = NULL;
fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
fdp->fd_ofileflags[dfd] = 0;
fdunused(fdp, dfd);
if (fp == NULL)
fdused(fdp, indx);
FILEDESC_XUNLOCK(fdp);
/*
* We now own the reference to fp that the ofiles[] array
* used to own. Release it.
*/
if (fp != NULL)
fdrop(fp, td);
return (0);
default:
FILEDESC_XUNLOCK(fdp);
return (error);
}
/* NOTREACHED */
}
/*
* Scan all active processes and prisons to see if any of them have a current
* or root directory of `olddp'. If so, replace them with the new mount point.
*/
void
mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
{
struct filedesc *fdp;
struct prison *pr;
struct proc *p;
int nrele;
if (vrefcnt(olddp) == 1)
return;
nrele = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
fdp = fdhold(p);
if (fdp == NULL)
continue;
FILEDESC_XLOCK(fdp);
if (fdp->fd_cdir == olddp) {
vref(newdp);
fdp->fd_cdir = newdp;
nrele++;
}
if (fdp->fd_rdir == olddp) {
vref(newdp);
fdp->fd_rdir = newdp;
nrele++;
}
if (fdp->fd_jdir == olddp) {
vref(newdp);
fdp->fd_jdir = newdp;
nrele++;
}
FILEDESC_XUNLOCK(fdp);
fddrop(fdp);
}
sx_sunlock(&allproc_lock);
if (rootvnode == olddp) {
vref(newdp);
rootvnode = newdp;
nrele++;
}
mtx_lock(&prison0.pr_mtx);
if (prison0.pr_root == olddp) {
vref(newdp);
prison0.pr_root = newdp;
nrele++;
}
mtx_unlock(&prison0.pr_mtx);
sx_slock(&allprison_lock);
TAILQ_FOREACH(pr, &allprison, pr_list) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_root == olddp) {
vref(newdp);
pr->pr_root = newdp;
nrele++;
}
mtx_unlock(&pr->pr_mtx);
}
sx_sunlock(&allprison_lock);
while (nrele--)
vrele(olddp);
}
struct filedesc_to_leader *
filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
{
struct filedesc_to_leader *fdtol;
fdtol = malloc(sizeof(struct filedesc_to_leader),
M_FILEDESC_TO_LEADER,
M_WAITOK);
fdtol->fdl_refcount = 1;
fdtol->fdl_holdcount = 0;
fdtol->fdl_wakeup = 0;
fdtol->fdl_leader = leader;
if (old != NULL) {
FILEDESC_XLOCK(fdp);
fdtol->fdl_next = old->fdl_next;
fdtol->fdl_prev = old;
old->fdl_next = fdtol;
fdtol->fdl_next->fdl_prev = fdtol;
FILEDESC_XUNLOCK(fdp);
} else {
fdtol->fdl_next = fdtol;
fdtol->fdl_prev = fdtol;
}
return (fdtol);
}
/*
* Get file structures globally.
*/
static int
sysctl_kern_file(SYSCTL_HANDLER_ARGS)
{
struct xfile xf;
struct filedesc *fdp;
struct file *fp;
struct proc *p;
int error, n;
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
if (req->oldptr == NULL) {
n = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
fdp = fdhold(p);
if (fdp == NULL)
continue;
/* overestimates sparse tables. */
if (fdp->fd_lastfile > 0)
n += fdp->fd_lastfile;
fddrop(fdp);
}
sx_sunlock(&allproc_lock);
return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
}
error = 0;
bzero(&xf, sizeof(xf));
xf.xf_size = sizeof(xf);
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
if (p_cansee(req->td, p) != 0) {
PROC_UNLOCK(p);
continue;
}
xf.xf_pid = p->p_pid;
xf.xf_uid = p->p_ucred->cr_uid;
PROC_UNLOCK(p);
fdp = fdhold(p);
if (fdp == NULL)
continue;
FILEDESC_SLOCK(fdp);
for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
if ((fp = fdp->fd_ofiles[n]) == NULL)
continue;
xf.xf_fd = n;
xf.xf_file = fp;
xf.xf_data = fp->f_data;
xf.xf_vnode = fp->f_vnode;
xf.xf_type = fp->f_type;
xf.xf_count = fp->f_count;
xf.xf_msgcount = 0;
xf.xf_offset = fp->f_offset;
xf.xf_flag = fp->f_flag;
error = SYSCTL_OUT(req, &xf, sizeof(xf));
if (error)
break;
}
FILEDESC_SUNLOCK(fdp);
fddrop(fdp);
if (error)
break;
}
sx_sunlock(&allproc_lock);
return (error);
}
SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
#ifdef KINFO_OFILE_SIZE
CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
#endif
#ifdef COMPAT_FREEBSD7
static int
export_vnode_for_osysctl(struct vnode *vp, int type,
struct kinfo_ofile *kif, struct filedesc *fdp, struct sysctl_req *req)
{
int error;
char *fullpath, *freepath;
int vfslocked;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
vref(vp);
kif->kf_fd = type;
kif->kf_type = KF_TYPE_VNODE;
/* This function only handles directories. */
if (vp->v_type != VDIR) {
vrele(vp);
return (ENOTDIR);
}
kif->kf_vnode_type = KF_VTYPE_VDIR;
/*
* This is not a true file descriptor, so we set a bogus refcount
* and offset to indicate these fields should be ignored.
*/
kif->kf_ref_count = -1;
kif->kf_offset = -1;
freepath = NULL;
fullpath = "-";
FILEDESC_SUNLOCK(fdp);
vn_fullpath(curthread, vp, &fullpath, &freepath);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
if (freepath != NULL)
free(freepath, M_TEMP);
error = SYSCTL_OUT(req, kif, sizeof(*kif));
FILEDESC_SLOCK(fdp);
return (error);
}
/*
* Get per-process file descriptors for use by procstat(1), et al.
*/
static int
sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
{
char *fullpath, *freepath;
struct kinfo_ofile *kif;
struct filedesc *fdp;
int error, i, *name;
struct socket *so;
struct vnode *vp;
struct file *fp;
struct proc *p;
struct tty *tp;
int vfslocked;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
fdp = fdhold(p);
PROC_UNLOCK(p);
if (fdp == NULL)
return (ENOENT);
kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
FILEDESC_SLOCK(fdp);
if (fdp->fd_cdir != NULL)
export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
fdp, req);
if (fdp->fd_rdir != NULL)
export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
fdp, req);
if (fdp->fd_jdir != NULL)
export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
fdp, req);
for (i = 0; i < fdp->fd_nfiles; i++) {
if ((fp = fdp->fd_ofiles[i]) == NULL)
continue;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
vp = NULL;
so = NULL;
tp = NULL;
kif->kf_fd = i;
#ifdef CAPABILITIES
/*
* When reporting a capability, most fields will be from the
* underlying object, but do mark as a capability. With
* ofiledesc, we don't have a field to export the cap_rights_t,
* but we do with the new filedesc.
*/
if (fp->f_type == DTYPE_CAPABILITY) {
kif->kf_flags |= KF_FLAG_CAPABILITY;
(void)cap_funwrap(fp, 0, &fp);
}
#else
KASSERT(fp->f_type != DTYPE_CAPABILITY,
("sysctl_kern_proc_ofiledesc: saw capability"));
#endif
switch (fp->f_type) {
case DTYPE_VNODE:
kif->kf_type = KF_TYPE_VNODE;
vp = fp->f_vnode;
break;
case DTYPE_SOCKET:
kif->kf_type = KF_TYPE_SOCKET;
so = fp->f_data;
break;
case DTYPE_PIPE:
kif->kf_type = KF_TYPE_PIPE;
break;
case DTYPE_FIFO:
kif->kf_type = KF_TYPE_FIFO;
vp = fp->f_vnode;
break;
case DTYPE_KQUEUE:
kif->kf_type = KF_TYPE_KQUEUE;
break;
case DTYPE_CRYPTO:
kif->kf_type = KF_TYPE_CRYPTO;
break;
case DTYPE_MQUEUE:
kif->kf_type = KF_TYPE_MQUEUE;
break;
case DTYPE_SHM:
kif->kf_type = KF_TYPE_SHM;
break;
case DTYPE_SEM:
kif->kf_type = KF_TYPE_SEM;
break;
case DTYPE_PTS:
kif->kf_type = KF_TYPE_PTS;
tp = fp->f_data;
break;
#ifdef PROCDESC
case DTYPE_PROCDESC:
kif->kf_type = KF_TYPE_PROCDESC;
break;
#endif
default:
kif->kf_type = KF_TYPE_UNKNOWN;
break;
}
kif->kf_ref_count = fp->f_count;
if (fp->f_flag & FREAD)
kif->kf_flags |= KF_FLAG_READ;
if (fp->f_flag & FWRITE)
kif->kf_flags |= KF_FLAG_WRITE;
if (fp->f_flag & FAPPEND)
kif->kf_flags |= KF_FLAG_APPEND;
if (fp->f_flag & FASYNC)
kif->kf_flags |= KF_FLAG_ASYNC;
if (fp->f_flag & FFSYNC)
kif->kf_flags |= KF_FLAG_FSYNC;
if (fp->f_flag & FNONBLOCK)
kif->kf_flags |= KF_FLAG_NONBLOCK;
if (fp->f_flag & O_DIRECT)
kif->kf_flags |= KF_FLAG_DIRECT;
if (fp->f_flag & FHASLOCK)
kif->kf_flags |= KF_FLAG_HASLOCK;
kif->kf_offset = fp->f_offset;
if (vp != NULL) {
vref(vp);
switch (vp->v_type) {
case VNON:
kif->kf_vnode_type = KF_VTYPE_VNON;
break;
case VREG:
kif->kf_vnode_type = KF_VTYPE_VREG;
break;
case VDIR:
kif->kf_vnode_type = KF_VTYPE_VDIR;
break;
case VBLK:
kif->kf_vnode_type = KF_VTYPE_VBLK;
break;
case VCHR:
kif->kf_vnode_type = KF_VTYPE_VCHR;
break;
case VLNK:
kif->kf_vnode_type = KF_VTYPE_VLNK;
break;
case VSOCK:
kif->kf_vnode_type = KF_VTYPE_VSOCK;
break;
case VFIFO:
kif->kf_vnode_type = KF_VTYPE_VFIFO;
break;
case VBAD:
kif->kf_vnode_type = KF_VTYPE_VBAD;
break;
default:
kif->kf_vnode_type = KF_VTYPE_UNKNOWN;
break;
}
/*
* It is OK to drop the filedesc lock here as we will
* re-validate and re-evaluate its properties when
* the loop continues.
*/
freepath = NULL;
fullpath = "-";
FILEDESC_SUNLOCK(fdp);
vn_fullpath(curthread, vp, &fullpath, &freepath);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
strlcpy(kif->kf_path, fullpath,
sizeof(kif->kf_path));
if (freepath != NULL)
free(freepath, M_TEMP);
FILEDESC_SLOCK(fdp);
}
if (so != NULL) {
struct sockaddr *sa;
if (so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa)
== 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
bcopy(sa, &kif->kf_sa_local, sa->sa_len);
free(sa, M_SONAME);
}
if (so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa)
== 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
free(sa, M_SONAME);
}
kif->kf_sock_domain =
so->so_proto->pr_domain->dom_family;
kif->kf_sock_type = so->so_type;
kif->kf_sock_protocol = so->so_proto->pr_protocol;
}
if (tp != NULL) {
strlcpy(kif->kf_path, tty_devname(tp),
sizeof(kif->kf_path));
}
error = SYSCTL_OUT(req, kif, sizeof(*kif));
if (error)
break;
}
FILEDESC_SUNLOCK(fdp);
fddrop(fdp);
free(kif, M_TEMP);
return (0);
}
static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
#endif /* COMPAT_FREEBSD7 */
#ifdef KINFO_FILE_SIZE
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
static int
export_fd_for_sysctl(void *data, int type, int fd, int fflags, int refcnt,
int64_t offset, struct kinfo_file *kif, struct sysctl_req *req)
{
struct {
int fflag;
int kf_fflag;
} fflags_table[] = {
{ FAPPEND, KF_FLAG_APPEND },
{ FASYNC, KF_FLAG_ASYNC },
{ FFSYNC, KF_FLAG_FSYNC },
{ FHASLOCK, KF_FLAG_HASLOCK },
{ FNONBLOCK, KF_FLAG_NONBLOCK },
{ FREAD, KF_FLAG_READ },
{ FWRITE, KF_FLAG_WRITE },
{ O_CREAT, KF_FLAG_CREAT },
{ O_DIRECT, KF_FLAG_DIRECT },
{ O_EXCL, KF_FLAG_EXCL },
{ O_EXEC, KF_FLAG_EXEC },
{ O_EXLOCK, KF_FLAG_EXLOCK },
{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
{ O_SHLOCK, KF_FLAG_SHLOCK },
{ O_TRUNC, KF_FLAG_TRUNC }
};
#define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table))
struct vnode *vp;
int error, vfslocked;
unsigned int i;
bzero(kif, sizeof(*kif));
switch (type) {
case KF_TYPE_FIFO:
case KF_TYPE_VNODE:
vp = (struct vnode *)data;
error = fill_vnode_info(vp, kif);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
break;
case KF_TYPE_SOCKET:
error = fill_socket_info((struct socket *)data, kif);
break;
case KF_TYPE_PIPE:
error = fill_pipe_info((struct pipe *)data, kif);
break;
case KF_TYPE_PTS:
error = fill_pts_info((struct tty *)data, kif);
break;
case KF_TYPE_PROCDESC:
error = fill_procdesc_info((struct procdesc *)data, kif);
break;
default:
error = 0;
}
if (error == 0)
kif->kf_status |= KF_ATTR_VALID;
/*
* Translate file access flags.
*/
for (i = 0; i < NFFLAGS; i++)
if (fflags & fflags_table[i].fflag)
kif->kf_flags |= fflags_table[i].kf_fflag;
kif->kf_fd = fd;
kif->kf_type = type;
kif->kf_ref_count = refcnt;
kif->kf_offset = offset;
/* Pack record size down */
kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
strlen(kif->kf_path) + 1;
kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
error = SYSCTL_OUT(req, kif, kif->kf_structsize);
return (error);
}
/*
* Get per-process file descriptors for use by procstat(1), et al.
*/
static int
sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
{
struct file *fp;
struct filedesc *fdp;
struct kinfo_file *kif;
struct proc *p;
struct vnode *cttyvp, *textvp, *tracevp;
size_t oldidx;
int64_t offset;
void *data;
int error, i, *name;
int type, refcnt, fflags;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
/* ktrace vnode */
tracevp = p->p_tracevp;
if (tracevp != NULL)
vref(tracevp);
/* text vnode */
textvp = p->p_textvp;
if (textvp != NULL)
vref(textvp);
/* Controlling tty. */
cttyvp = NULL;
if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
cttyvp = p->p_pgrp->pg_session->s_ttyvp;
if (cttyvp != NULL)
vref(cttyvp);
}
fdp = fdhold(p);
PROC_UNLOCK(p);
kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
if (tracevp != NULL)
export_fd_for_sysctl(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
FREAD | FWRITE, -1, -1, kif, req);
if (textvp != NULL)
export_fd_for_sysctl(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
FREAD, -1, -1, kif, req);
if (cttyvp != NULL)
export_fd_for_sysctl(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
FREAD | FWRITE, -1, -1, kif, req);
if (fdp == NULL)
goto fail;
FILEDESC_SLOCK(fdp);
/* working directory */
if (fdp->fd_cdir != NULL) {
vref(fdp->fd_cdir);
data = fdp->fd_cdir;
FILEDESC_SUNLOCK(fdp);
export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
FREAD, -1, -1, kif, req);
FILEDESC_SLOCK(fdp);
}
/* root directory */
if (fdp->fd_rdir != NULL) {
vref(fdp->fd_rdir);
data = fdp->fd_rdir;
FILEDESC_SUNLOCK(fdp);
export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
FREAD, -1, -1, kif, req);
FILEDESC_SLOCK(fdp);
}
/* jail directory */
if (fdp->fd_jdir != NULL) {
vref(fdp->fd_jdir);
data = fdp->fd_jdir;
FILEDESC_SUNLOCK(fdp);
export_fd_for_sysctl(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
FREAD, -1, -1, kif, req);
FILEDESC_SLOCK(fdp);
}
for (i = 0; i < fdp->fd_nfiles; i++) {
if ((fp = fdp->fd_ofiles[i]) == NULL)
continue;
data = NULL;
#ifdef CAPABILITIES
/*
* When reporting a capability, most fields will be from the
* underlying object, but do mark as a capability and export
* the capability rights mask.
*/
if (fp->f_type == DTYPE_CAPABILITY) {
kif->kf_flags |= KF_FLAG_CAPABILITY;
kif->kf_cap_rights = cap_rights(fp);
(void)cap_funwrap(fp, 0, &fp);
}
#else /* !CAPABILITIES */
KASSERT(fp->f_type != DTYPE_CAPABILITY,
("sysctl_kern_proc_filedesc: saw capability"));
#endif
switch (fp->f_type) {
case DTYPE_VNODE:
type = KF_TYPE_VNODE;
vref(fp->f_vnode);
data = fp->f_vnode;
break;
case DTYPE_SOCKET:
type = KF_TYPE_SOCKET;
data = fp->f_data;
break;
case DTYPE_PIPE:
type = KF_TYPE_PIPE;
data = fp->f_data;
break;
case DTYPE_FIFO:
type = KF_TYPE_FIFO;
vref(fp->f_vnode);
data = fp->f_vnode;
break;
case DTYPE_KQUEUE:
type = KF_TYPE_KQUEUE;
break;
case DTYPE_CRYPTO:
type = KF_TYPE_CRYPTO;
break;
case DTYPE_MQUEUE:
type = KF_TYPE_MQUEUE;
break;
case DTYPE_SHM:
type = KF_TYPE_SHM;
break;
case DTYPE_SEM:
type = KF_TYPE_SEM;
break;
case DTYPE_PTS:
type = KF_TYPE_PTS;
data = fp->f_data;
break;
#ifdef PROCDESC
case DTYPE_PROCDESC:
type = KF_TYPE_PROCDESC;
data = fp->f_data;
break;
#endif
default:
type = KF_TYPE_UNKNOWN;
break;
}
refcnt = fp->f_count;
fflags = fp->f_flag;
offset = fp->f_offset;
/*
* Create sysctl entry.
* It is OK to drop the filedesc lock here as we will
* re-validate and re-evaluate its properties when
* the loop continues.
*/
oldidx = req->oldidx;
if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
FILEDESC_SUNLOCK(fdp);
error = export_fd_for_sysctl(data, type, i,
fflags, refcnt, offset, kif, req);
if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
FILEDESC_SLOCK(fdp);
if (error) {
if (error == ENOMEM) {
/*
* The hack to keep the ABI of sysctl
* kern.proc.filedesc intact, but not
* to account a partially copied
* kinfo_file into the oldidx.
*/
req->oldidx = oldidx;
error = 0;
}
break;
}
}
FILEDESC_SUNLOCK(fdp);
fail:
if (fdp != NULL)
fddrop(fdp);
free(kif, M_TEMP);
return (error);
}
int
vntype_to_kinfo(int vtype)
{
struct {
int vtype;
int kf_vtype;
} vtypes_table[] = {
{ VBAD, KF_VTYPE_VBAD },
{ VBLK, KF_VTYPE_VBLK },
{ VCHR, KF_VTYPE_VCHR },
{ VDIR, KF_VTYPE_VDIR },
{ VFIFO, KF_VTYPE_VFIFO },
{ VLNK, KF_VTYPE_VLNK },
{ VNON, KF_VTYPE_VNON },
{ VREG, KF_VTYPE_VREG },
{ VSOCK, KF_VTYPE_VSOCK }
};
#define NVTYPES (sizeof(vtypes_table) / sizeof(*vtypes_table))
unsigned int i;
/*
* Perform vtype translation.
*/
for (i = 0; i < NVTYPES; i++)
if (vtypes_table[i].vtype == vtype)
break;
if (i < NVTYPES)
return (vtypes_table[i].kf_vtype);
return (KF_VTYPE_UNKNOWN);
}
static int
fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
{
struct vattr va;
char *fullpath, *freepath;
int error, vfslocked;
if (vp == NULL)
return (1);
kif->kf_vnode_type = vntype_to_kinfo(vp->v_type);
freepath = NULL;
fullpath = "-";
error = vn_fullpath(curthread, vp, &fullpath, &freepath);
if (error == 0) {
strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
}
if (freepath != NULL)
free(freepath, M_TEMP);
/*
* Retrieve vnode attributes.
*/
va.va_fsid = VNOVAL;
va.va_rdev = NODEV;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, curthread->td_ucred);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
if (error != 0)
return (error);
if (va.va_fsid != VNOVAL)
kif->kf_un.kf_file.kf_file_fsid = va.va_fsid;
else
kif->kf_un.kf_file.kf_file_fsid =
vp->v_mount->mnt_stat.f_fsid.val[0];
kif->kf_un.kf_file.kf_file_fileid = va.va_fileid;
kif->kf_un.kf_file.kf_file_mode = MAKEIMODE(va.va_type, va.va_mode);
kif->kf_un.kf_file.kf_file_size = va.va_size;
kif->kf_un.kf_file.kf_file_rdev = va.va_rdev;
return (0);
}
static int
fill_socket_info(struct socket *so, struct kinfo_file *kif)
{
struct sockaddr *sa;
struct inpcb *inpcb;
struct unpcb *unpcb;
int error;
if (so == NULL)
return (1);
kif->kf_sock_domain = so->so_proto->pr_domain->dom_family;
kif->kf_sock_type = so->so_type;
kif->kf_sock_protocol = so->so_proto->pr_protocol;
kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
switch(kif->kf_sock_domain) {
case AF_INET:
case AF_INET6:
if (kif->kf_sock_protocol == IPPROTO_TCP) {
if (so->so_pcb != NULL) {
inpcb = (struct inpcb *)(so->so_pcb);
kif->kf_un.kf_sock.kf_sock_inpcb =
(uintptr_t)inpcb->inp_ppcb;
}
}
break;
case AF_UNIX:
if (so->so_pcb != NULL) {
unpcb = (struct unpcb *)(so->so_pcb);
if (unpcb->unp_conn) {
kif->kf_un.kf_sock.kf_sock_unpconn =
(uintptr_t)unpcb->unp_conn;
kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
so->so_rcv.sb_state;
kif->kf_un.kf_sock.kf_sock_snd_sb_state =
so->so_snd.sb_state;
}
}
break;
}
error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_local)) {
bcopy(sa, &kif->kf_sa_local, sa->sa_len);
free(sa, M_SONAME);
}
error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
if (error == 0 && sa->sa_len <= sizeof(kif->kf_sa_peer)) {
bcopy(sa, &kif->kf_sa_peer, sa->sa_len);
free(sa, M_SONAME);
}
strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
sizeof(kif->kf_path));
return (0);
}
static int
fill_pts_info(struct tty *tp, struct kinfo_file *kif)
{
if (tp == NULL)
return (1);
kif->kf_un.kf_pts.kf_pts_dev = tty_udev(tp);
strlcpy(kif->kf_path, tty_devname(tp), sizeof(kif->kf_path));
return (0);
}
static int
fill_pipe_info(struct pipe *pi, struct kinfo_file *kif)
{
if (pi == NULL)
return (1);
kif->kf_un.kf_pipe.kf_pipe_addr = (uintptr_t)pi;
kif->kf_un.kf_pipe.kf_pipe_peer = (uintptr_t)pi->pipe_peer;
kif->kf_un.kf_pipe.kf_pipe_buffer_cnt = pi->pipe_buffer.cnt;
return (0);
}
static int
fill_procdesc_info(struct procdesc *pdp, struct kinfo_file *kif)
{
if (pdp == NULL)
return (1);
kif->kf_un.kf_proc.kf_pid = pdp->pd_pid;
return (0);
}
static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
sysctl_kern_proc_filedesc, "Process filedesc entries");
#ifdef DDB
/*
* For the purposes of debugging, generate a human-readable string for the
* file type.
*/
static const char *
file_type_to_name(short type)
{
switch (type) {
case 0:
return ("zero");
case DTYPE_VNODE:
return ("vnod");
case DTYPE_SOCKET:
return ("sock");
case DTYPE_PIPE:
return ("pipe");
case DTYPE_FIFO:
return ("fifo");
case DTYPE_KQUEUE:
return ("kque");
case DTYPE_CRYPTO:
return ("crpt");
case DTYPE_MQUEUE:
return ("mque");
case DTYPE_SHM:
return ("shm");
case DTYPE_SEM:
return ("ksem");
default:
return ("unkn");
}
}
/*
* For the purposes of debugging, identify a process (if any, perhaps one of
* many) that references the passed file in its file descriptor array. Return
* NULL if none.
*/
static struct proc *
file_to_first_proc(struct file *fp)
{
struct filedesc *fdp;
struct proc *p;
int n;
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
fdp = p->p_fd;
if (fdp == NULL)
continue;
for (n = 0; n < fdp->fd_nfiles; n++) {
if (fp == fdp->fd_ofiles[n])
return (p);
}
}
return (NULL);
}
static void
db_print_file(struct file *fp, int header)
{
struct proc *p;
if (header)
db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
"File", "Type", "Data", "Flag", "GCFl", "Count",
"MCount", "Vnode", "FPID", "FCmd");
p = file_to_first_proc(fp);
db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
0, fp->f_count, 0, fp->f_vnode,
p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
}
DB_SHOW_COMMAND(file, db_show_file)
{
struct file *fp;
if (!have_addr) {
db_printf("usage: show file <addr>\n");
return;
}
fp = (struct file *)addr;
db_print_file(fp, 1);
}
DB_SHOW_COMMAND(files, db_show_files)
{
struct filedesc *fdp;
struct file *fp;
struct proc *p;
int header;
int n;
header = 1;
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_state == PRS_NEW)
continue;
if ((fdp = p->p_fd) == NULL)
continue;
for (n = 0; n < fdp->fd_nfiles; ++n) {
if ((fp = fdp->fd_ofiles[n]) == NULL)
continue;
db_print_file(fp, header);
header = 0;
}
}
}
#endif
SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
&maxfilesperproc, 0, "Maximum files allowed open per process");
SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
&maxfiles, 0, "Maximum number of files");
SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
__DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
/* ARGSUSED*/
static void
filelistinit(void *dummy)
{
file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
mtx_init(&fdesc_mtx, "fdesc", NULL, MTX_DEF);
}
SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
/*-------------------------------------------------------------------*/
static int
badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags, struct thread *td)
{
return (EBADF);
}
static int
badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred, struct thread *td)
{
return (EINVAL);
}
static int
badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred, struct thread *td)
{
return (EBADF);
}
static int
badfo_poll(struct file *fp, int events, struct ucred *active_cred, struct thread *td)
{
return (0);
}
static int
badfo_kqfilter(struct file *fp, struct knote *kn)
{
return (EBADF);
}
static int
badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred, struct thread *td)
{
return (EBADF);
}
static int
badfo_close(struct file *fp, struct thread *td)
{
return (EBADF);
}
static int
badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
return (EBADF);
}
static int
badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
return (EBADF);
}
struct fileops badfileops = {
.fo_read = badfo_readwrite,
.fo_write = badfo_readwrite,
.fo_truncate = badfo_truncate,
.fo_ioctl = badfo_ioctl,
.fo_poll = badfo_poll,
.fo_kqfilter = badfo_kqfilter,
.fo_stat = badfo_stat,
.fo_close = badfo_close,
.fo_chmod = badfo_chmod,
.fo_chown = badfo_chown,
};
int
invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
int
invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
/*-------------------------------------------------------------------*/
/*
* File Descriptor pseudo-device driver (/dev/fd/).
*
* Opening minor device N dup()s the file (if any) connected to file
* descriptor N belonging to the calling process. Note that this driver
* consists of only the ``open()'' routine, because all subsequent
* references to this file will be direct to the other driver.
*
* XXX: we could give this one a cloning event handler if necessary.
*/
/* ARGSUSED */
static int
fdopen(struct cdev *dev, int mode, int type, struct thread *td)
{
/*
* XXX Kludge: set curthread->td_dupfd to contain the value of the
* file descriptor being sought for duplication. The error
* return ensures that the vnode for this device will be released
* by vn_open. Open will detect this special error and take the
* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
* will simply report the error.
*/
td->td_dupfd = dev2unit(dev);
return (ENODEV);
}
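/*
 * Illustrative userland behaviour, as a minimal sketch:
 *
 *     fd = open("/dev/fd/0", O_RDONLY);
 *
 * returns a new descriptor referencing the same file as descriptor 0 of
 * the calling process; the special ENODEV return above is detected in the
 * open(2) path and handled by dupfdopen().
 */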
static struct cdevsw fildesc_cdevsw = {
.d_version = D_VERSION,
.d_open = fdopen,
.d_name = "FD",
};
static void
fildesc_drvinit(void *unused)
{
struct cdev *dev;
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
UID_ROOT, GID_WHEEL, 0666, "fd/0");
make_dev_alias(dev, "stdin");
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
UID_ROOT, GID_WHEEL, 0666, "fd/1");
make_dev_alias(dev, "stdout");
dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
UID_ROOT, GID_WHEEL, 0666, "fd/2");
make_dev_alias(dev, "stderr");
}
SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
Index: head/sys/kern/kern_environment.c
===================================================================
--- head/sys/kern/kern_environment.c (revision 225616)
+++ head/sys/kern/kern_environment.c (revision 225617)
@@ -1,623 +1,623 @@
/*-
* Copyright (c) 1998 Michael Smith
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* The unified bootloader passes us a pointer to a preserved copy of
* bootstrap/kernel environment variables. We convert them to a
* dynamic array of strings later when the VM subsystem is up.
*
* We make these available through the kenv(2) syscall for userland
* and through getenv()/freeenv(), setenv(), unsetenv() and testenv() for
* the kernel.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/libkern.h>
#include <sys/kenv.h>
#include <security/mac/mac_framework.h>
static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
#define KENV_SIZE 512 /* Maximum number of environment strings */
/* pointer to the static environment */
char *kern_envp;
static int env_len;
static int env_pos;
static char *kernenv_next(char *);
/* dynamic environment variables */
char **kenvp;
struct mtx kenv_lock;
/*
* No need to protect this with a mutex since SYSINITS are single threaded.
*/
int dynamic_kenv = 0;
#define KENV_CHECK if (!dynamic_kenv) \
panic("%s: called before SI_SUB_KMEM", __func__)
int
-kenv(td, uap)
+sys_kenv(td, uap)
struct thread *td;
struct kenv_args /* {
int what;
const char *name;
char *value;
int len;
} */ *uap;
{
char *name, *value, *buffer = NULL;
size_t len, done, needed, buflen;
int error, i;
KASSERT(dynamic_kenv, ("kenv: dynamic_kenv = 0"));
error = 0;
if (uap->what == KENV_DUMP) {
#ifdef MAC
error = mac_kenv_check_dump(td->td_ucred);
if (error)
return (error);
#endif
done = needed = 0;
buflen = uap->len;
if (buflen > KENV_SIZE * (KENV_MNAMELEN + KENV_MVALLEN + 2))
buflen = KENV_SIZE * (KENV_MNAMELEN +
KENV_MVALLEN + 2);
if (uap->len > 0 && uap->value != NULL)
buffer = malloc(buflen, M_TEMP, M_WAITOK|M_ZERO);
mtx_lock(&kenv_lock);
for (i = 0; kenvp[i] != NULL; i++) {
len = strlen(kenvp[i]) + 1;
needed += len;
len = min(len, buflen - done);
/*
* If called with a NULL or insufficiently large
* buffer, just keep computing the required size.
*/
if (uap->value != NULL && buffer != NULL && len > 0) {
bcopy(kenvp[i], buffer + done, len);
done += len;
}
}
mtx_unlock(&kenv_lock);
if (buffer != NULL) {
error = copyout(buffer, uap->value, done);
free(buffer, M_TEMP);
}
td->td_retval[0] = ((done == needed) ? 0 : needed);
return (error);
}
switch (uap->what) {
case KENV_SET:
error = priv_check(td, PRIV_KENV_SET);
if (error)
return (error);
break;
case KENV_UNSET:
error = priv_check(td, PRIV_KENV_UNSET);
if (error)
return (error);
break;
}
name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->name, name, KENV_MNAMELEN, NULL);
if (error)
goto done;
switch (uap->what) {
case KENV_GET:
#ifdef MAC
error = mac_kenv_check_get(td->td_ucred, name);
if (error)
goto done;
#endif
value = getenv(name);
if (value == NULL) {
error = ENOENT;
goto done;
}
len = strlen(value) + 1;
if (len > uap->len)
len = uap->len;
error = copyout(value, uap->value, len);
freeenv(value);
if (error)
goto done;
td->td_retval[0] = len;
break;
case KENV_SET:
len = uap->len;
if (len < 1) {
error = EINVAL;
goto done;
}
if (len > KENV_MVALLEN)
len = KENV_MVALLEN;
value = malloc(len, M_TEMP, M_WAITOK);
error = copyinstr(uap->value, value, len, NULL);
if (error) {
free(value, M_TEMP);
goto done;
}
#ifdef MAC
error = mac_kenv_check_set(td->td_ucred, name, value);
if (error == 0)
#endif
setenv(name, value);
free(value, M_TEMP);
break;
case KENV_UNSET:
#ifdef MAC
error = mac_kenv_check_unset(td->td_ucred, name);
if (error)
goto done;
#endif
error = unsetenv(name);
if (error)
error = ENOENT;
break;
default:
error = EINVAL;
break;
}
done:
free(name, M_TEMP);
return (error);
}
void
init_static_kenv(char *buf, size_t len)
{
kern_envp = buf;
env_len = len;
env_pos = 0;
}
/*
* Set up the dynamic kernel environment.
*/
static void
init_dynamic_kenv(void *data __unused)
{
char *cp;
size_t len;
int i;
kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV,
M_WAITOK | M_ZERO);
i = 0;
for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
len = strlen(cp) + 1;
if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
printf("WARNING: too long kenv string, ignoring %s\n",
cp);
continue;
}
if (i < KENV_SIZE) {
kenvp[i] = malloc(len, M_KENV, M_WAITOK);
strcpy(kenvp[i++], cp);
} else
printf(
"WARNING: too many kenv strings, ignoring %s\n",
cp);
}
kenvp[i] = NULL;
mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF);
dynamic_kenv = 1;
}
SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
void
freeenv(char *env)
{
if (dynamic_kenv)
free(env, M_KENV);
}
/*
* Internal functions for string lookup.
*/
static char *
_getenv_dynamic(const char *name, int *idx)
{
char *cp;
int len, i;
mtx_assert(&kenv_lock, MA_OWNED);
len = strlen(name);
for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
if ((strncmp(cp, name, len) == 0) &&
(cp[len] == '=')) {
if (idx != NULL)
*idx = i;
return (cp + len + 1);
}
}
return (NULL);
}
static char *
_getenv_static(const char *name)
{
char *cp, *ep;
int len;
for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
for (ep = cp; (*ep != '=') && (*ep != 0); ep++)
;
if (*ep != '=')
continue;
len = ep - cp;
ep++;
if (!strncmp(name, cp, len) && name[len] == 0)
return (ep);
}
return (NULL);
}
/*
* Look up an environment variable by name.
* Return a pointer to the string if found.
* The pointer has to be freed with freeenv()
* after use.
*/
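/*
 * Typical in-kernel usage sketch (the variable name is only an example):
 *
 *	char *val;
 *
 *	val = getenv("hw.physmem");
 *	if (val != NULL) {
 *		... parse val ...
 *		freeenv(val);
 *	}
 */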
char *
getenv(const char *name)
{
char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
char *ret, *cp;
int len;
if (dynamic_kenv) {
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, NULL);
if (cp != NULL) {
strcpy(buf, cp);
mtx_unlock(&kenv_lock);
len = strlen(buf) + 1;
ret = malloc(len, M_KENV, M_WAITOK);
strcpy(ret, buf);
} else {
mtx_unlock(&kenv_lock);
ret = NULL;
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"getenv");
}
} else
ret = _getenv_static(name);
return (ret);
}
/*
* Test if an environment variable is defined.
*/
int
testenv(const char *name)
{
char *cp;
if (dynamic_kenv) {
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, NULL);
mtx_unlock(&kenv_lock);
} else
cp = _getenv_static(name);
if (cp != NULL)
return (1);
return (0);
}
static int
setenv_static(const char *name, const char *value)
{
int len;
if (env_pos >= env_len)
return (-1);
/* Check space for x=y and two nuls */
len = strlen(name) + strlen(value);
if (len + 3 < env_len - env_pos) {
len = sprintf(&kern_envp[env_pos], "%s=%s", name, value);
env_pos += len+1;
kern_envp[env_pos] = '\0';
return (0);
} else
return (-1);
}
/*
* Set an environment variable by name.
*/
int
setenv(const char *name, const char *value)
{
char *buf, *cp, *oldenv;
int namelen, vallen, i;
if (dynamic_kenv == 0 && env_len > 0)
return (setenv_static(name, value));
KENV_CHECK;
namelen = strlen(name) + 1;
if (namelen > KENV_MNAMELEN)
return (-1);
vallen = strlen(value) + 1;
if (vallen > KENV_MVALLEN)
return (-1);
buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
sprintf(buf, "%s=%s", name, value);
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, &i);
if (cp != NULL) {
oldenv = kenvp[i];
kenvp[i] = buf;
mtx_unlock(&kenv_lock);
free(oldenv, M_KENV);
} else {
/* We add the option if it wasn't found */
for (i = 0; (cp = kenvp[i]) != NULL; i++)
;
/* Bounds checking */
if (i < 0 || i >= KENV_SIZE) {
free(buf, M_KENV);
mtx_unlock(&kenv_lock);
return (-1);
}
kenvp[i] = buf;
kenvp[i + 1] = NULL;
mtx_unlock(&kenv_lock);
}
return (0);
}
/*
* Unset an environment variable string.
*/
int
unsetenv(const char *name)
{
char *cp, *oldenv;
int i, j;
KENV_CHECK;
mtx_lock(&kenv_lock);
cp = _getenv_dynamic(name, &i);
if (cp != NULL) {
oldenv = kenvp[i];
for (j = i + 1; kenvp[j] != NULL; j++)
kenvp[i++] = kenvp[j];
kenvp[i] = NULL;
mtx_unlock(&kenv_lock);
free(oldenv, M_KENV);
return (0);
}
mtx_unlock(&kenv_lock);
return (-1);
}
/*
* Return a string value from an environment variable.
*/
int
getenv_string(const char *name, char *data, int size)
{
char *tmp;
tmp = getenv(name);
if (tmp != NULL) {
strlcpy(data, tmp, size);
freeenv(tmp);
return (1);
} else
return (0);
}
/*
* Return an integer value from an environment variable.
*/
int
getenv_int(const char *name, int *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (int) tmp;
return (rval);
}
/*
* Return an unsigned integer value from an environment variable.
*/
int
getenv_uint(const char *name, unsigned int *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (unsigned int) tmp;
return (rval);
}
/*
* Return a long value from an environment variable.
*/
int
getenv_long(const char *name, long *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (long) tmp;
return (rval);
}
/*
* Return an unsigned long value from an environment variable.
*/
int
getenv_ulong(const char *name, unsigned long *data)
{
quad_t tmp;
int rval;
rval = getenv_quad(name, &tmp);
if (rval)
*data = (unsigned long) tmp;
return (rval);
}
/*
* Return a quad_t value from an environment variable.
*/
int
getenv_quad(const char *name, quad_t *data)
{
char *value;
char *vtp;
quad_t iv;
value = getenv(name);
if (value == NULL)
return (0);
iv = strtoq(value, &vtp, 0);
if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) {
freeenv(value);
return (0);
}
switch (vtp[0]) {
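/*
 * The cases below fall through intentionally: each recognized suffix
 * accumulates one more factor of 1024, so 'k' scales by 2^10, 'm' by
 * 2^20, 'g' by 2^30 and 't' by 2^40.
 */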
case 't': case 'T':
iv *= 1024;
case 'g': case 'G':
iv *= 1024;
case 'm': case 'M':
iv *= 1024;
case 'k': case 'K':
iv *= 1024;
case '\0':
break;
default:
freeenv(value);
return (0);
}
*data = iv;
freeenv(value);
return (1);
}
/*
* Find the next entry after the one that (cp) falls within; return a
* pointer to its start, or NULL if there are no more.
*/
static char *
kernenv_next(char *cp)
{
if (cp != NULL) {
while (*cp != 0)
cp++;
cp++;
if (*cp == 0)
cp = NULL;
}
return (cp);
}
void
tunable_int_init(void *data)
{
struct tunable_int *d = (struct tunable_int *)data;
TUNABLE_INT_FETCH(d->path, d->var);
}
void
tunable_long_init(void *data)
{
struct tunable_long *d = (struct tunable_long *)data;
TUNABLE_LONG_FETCH(d->path, d->var);
}
void
tunable_ulong_init(void *data)
{
struct tunable_ulong *d = (struct tunable_ulong *)data;
TUNABLE_ULONG_FETCH(d->path, d->var);
}
void
tunable_quad_init(void *data)
{
struct tunable_quad *d = (struct tunable_quad *)data;
TUNABLE_QUAD_FETCH(d->path, d->var);
}
void
tunable_str_init(void *data)
{
struct tunable_str *d = (struct tunable_str *)data;
TUNABLE_STR_FETCH(d->path, d->var, d->size);
}
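/*
 * Illustrative sketch of how a driver typically consumes these helpers
 * (the tunable name and variable are hypothetical):
 *
 *	static int mydrv_debug = 0;
 *	TUNABLE_INT("hw.mydrv.debug", &mydrv_debug);
 *
 * The TUNABLE_INT() macro arranges for tunable_int_init() to run early in
 * boot, fetching "hw.mydrv.debug" from the kernel environment (for example
 * a loader.conf setting) into mydrv_debug.
 */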
Index: head/sys/kern/kern_event.c
===================================================================
--- head/sys/kern/kern_event.c (revision 225616)
+++ head/sys/kern/kern_event.c (revision 225617)
@@ -1,2201 +1,2201 @@
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
* Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
* Copyright (c) 2009 Apple, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/taskqueue.h>
#include <sys/uio.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/uma.h>
static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
/*
* This lock is used if multiple kq locks are required. This possibly
* should be made into a per proc lock.
*/
static struct mtx kq_global;
MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
#define KQ_GLOBAL_LOCK(lck, haslck) do { \
if (!haslck) \
mtx_lock(lck); \
haslck = 1; \
} while (0)
#define KQ_GLOBAL_UNLOCK(lck, haslck) do { \
if (haslck) \
mtx_unlock(lck); \
haslck = 0; \
} while (0)
TASKQUEUE_DEFINE_THREAD(kqueue);
static int kevent_copyout(void *arg, struct kevent *kevp, int count);
static int kevent_copyin(void *arg, struct kevent *kevp, int count);
static int kqueue_register(struct kqueue *kq, struct kevent *kev,
struct thread *td, int waitok);
static int kqueue_acquire(struct file *fp, struct kqueue **kqp);
static void kqueue_release(struct kqueue *kq, int locked);
static int kqueue_expand(struct kqueue *kq, struct filterops *fops,
uintptr_t ident, int waitok);
static void kqueue_task(void *arg, int pending);
static int kqueue_scan(struct kqueue *kq, int maxevents,
struct kevent_copyops *k_ops,
const struct timespec *timeout,
struct kevent *keva, struct thread *td);
static void kqueue_wakeup(struct kqueue *kq);
static struct filterops *kqueue_fo_find(int filt);
static void kqueue_fo_release(int filt);
static fo_rdwr_t kqueue_read;
static fo_rdwr_t kqueue_write;
static fo_truncate_t kqueue_truncate;
static fo_ioctl_t kqueue_ioctl;
static fo_poll_t kqueue_poll;
static fo_kqfilter_t kqueue_kqfilter;
static fo_stat_t kqueue_stat;
static fo_close_t kqueue_close;
static struct fileops kqueueops = {
.fo_read = kqueue_read,
.fo_write = kqueue_write,
.fo_truncate = kqueue_truncate,
.fo_ioctl = kqueue_ioctl,
.fo_poll = kqueue_poll,
.fo_kqfilter = kqueue_kqfilter,
.fo_stat = kqueue_stat,
.fo_close = kqueue_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
};
static int knote_attach(struct knote *kn, struct kqueue *kq);
static void knote_drop(struct knote *kn, struct thread *td);
static void knote_enqueue(struct knote *kn);
static void knote_dequeue(struct knote *kn);
static void knote_init(void);
static struct knote *knote_alloc(int waitok);
static void knote_free(struct knote *kn);
static void filt_kqdetach(struct knote *kn);
static int filt_kqueue(struct knote *kn, long hint);
static int filt_procattach(struct knote *kn);
static void filt_procdetach(struct knote *kn);
static int filt_proc(struct knote *kn, long hint);
static int filt_fileattach(struct knote *kn);
static void filt_timerexpire(void *knx);
static int filt_timerattach(struct knote *kn);
static void filt_timerdetach(struct knote *kn);
static int filt_timer(struct knote *kn, long hint);
static int filt_userattach(struct knote *kn);
static void filt_userdetach(struct knote *kn);
static int filt_user(struct knote *kn, long hint);
static void filt_usertouch(struct knote *kn, struct kevent *kev,
u_long type);
static struct filterops file_filtops = {
.f_isfd = 1,
.f_attach = filt_fileattach,
};
static struct filterops kqread_filtops = {
.f_isfd = 1,
.f_detach = filt_kqdetach,
.f_event = filt_kqueue,
};
/* XXX - move to kern_proc.c? */
static struct filterops proc_filtops = {
.f_isfd = 0,
.f_attach = filt_procattach,
.f_detach = filt_procdetach,
.f_event = filt_proc,
};
static struct filterops timer_filtops = {
.f_isfd = 0,
.f_attach = filt_timerattach,
.f_detach = filt_timerdetach,
.f_event = filt_timer,
};
static struct filterops user_filtops = {
.f_attach = filt_userattach,
.f_detach = filt_userdetach,
.f_event = filt_user,
.f_touch = filt_usertouch,
};
static uma_zone_t knote_zone;
static int kq_ncallouts = 0;
static int kq_calloutmax = (4 * 1024);
SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
/* XXX - ensure not KN_INFLUX?? */
#define KNOTE_ACTIVATE(kn, islock) do { \
if ((islock)) \
mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED); \
else \
KQ_LOCK((kn)->kn_kq); \
(kn)->kn_status |= KN_ACTIVE; \
if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) \
knote_enqueue((kn)); \
if (!(islock)) \
KQ_UNLOCK((kn)->kn_kq); \
} while(0)
#define KQ_LOCK(kq) do { \
mtx_lock(&(kq)->kq_lock); \
} while (0)
#define KQ_FLUX_WAKEUP(kq) do { \
if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) { \
(kq)->kq_state &= ~KQ_FLUXWAIT; \
wakeup((kq)); \
} \
} while (0)
#define KQ_UNLOCK_FLUX(kq) do { \
KQ_FLUX_WAKEUP(kq); \
mtx_unlock(&(kq)->kq_lock); \
} while (0)
#define KQ_UNLOCK(kq) do { \
mtx_unlock(&(kq)->kq_lock); \
} while (0)
#define KQ_OWNED(kq) do { \
mtx_assert(&(kq)->kq_lock, MA_OWNED); \
} while (0)
#define KQ_NOTOWNED(kq) do { \
mtx_assert(&(kq)->kq_lock, MA_NOTOWNED); \
} while (0)
#define KN_LIST_LOCK(kn) do { \
if (kn->kn_knlist != NULL) \
kn->kn_knlist->kl_lock(kn->kn_knlist->kl_lockarg); \
} while (0)
#define KN_LIST_UNLOCK(kn) do { \
if (kn->kn_knlist != NULL) \
kn->kn_knlist->kl_unlock(kn->kn_knlist->kl_lockarg); \
} while (0)
#define KNL_ASSERT_LOCK(knl, islocked) do { \
if (islocked) \
KNL_ASSERT_LOCKED(knl); \
else \
KNL_ASSERT_UNLOCKED(knl); \
} while (0)
#ifdef INVARIANTS
#define KNL_ASSERT_LOCKED(knl) do { \
knl->kl_assert_locked((knl)->kl_lockarg); \
} while (0)
#define KNL_ASSERT_UNLOCKED(knl) do { \
knl->kl_assert_unlocked((knl)->kl_lockarg); \
} while (0)
#else /* !INVARIANTS */
#define KNL_ASSERT_LOCKED(knl) do {} while(0)
#define KNL_ASSERT_UNLOCKED(knl) do {} while (0)
#endif /* INVARIANTS */
#define KN_HASHSIZE 64 /* XXX should be tunable */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
static int
filt_nullattach(struct knote *kn)
{
return (ENXIO);
};
struct filterops null_filtops = {
.f_isfd = 0,
.f_attach = filt_nullattach,
};
/* XXX - make SYSINIT to add these, and move into respective modules. */
extern struct filterops sig_filtops;
extern struct filterops fs_filtops;
/*
* Table for all system-defined filters.
*/
static struct mtx filterops_lock;
MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
MTX_DEF);
static struct {
struct filterops *for_fop;
int for_refcnt;
} sysfilt_ops[EVFILT_SYSCOUNT] = {
{ &file_filtops }, /* EVFILT_READ */
{ &file_filtops }, /* EVFILT_WRITE */
{ &null_filtops }, /* EVFILT_AIO */
{ &file_filtops }, /* EVFILT_VNODE */
{ &proc_filtops }, /* EVFILT_PROC */
{ &sig_filtops }, /* EVFILT_SIGNAL */
{ &timer_filtops }, /* EVFILT_TIMER */
{ &null_filtops }, /* former EVFILT_NETDEV */
{ &fs_filtops }, /* EVFILT_FS */
{ &null_filtops }, /* EVFILT_LIO */
{ &user_filtops }, /* EVFILT_USER */
};
/*
* Simple redirection for all cdevsw style objects to call their fo_kqfilter
* method.
*/
static int
filt_fileattach(struct knote *kn)
{
return (fo_kqfilter(kn->kn_fp, kn));
}
/*ARGSUSED*/
static int
kqueue_kqfilter(struct file *fp, struct knote *kn)
{
struct kqueue *kq = kn->kn_fp->f_data;
if (kn->kn_filter != EVFILT_READ)
return (EINVAL);
kn->kn_status |= KN_KQUEUE;
kn->kn_fop = &kqread_filtops;
knlist_add(&kq->kq_sel.si_note, kn, 0);
return (0);
}
static void
filt_kqdetach(struct knote *kn)
{
struct kqueue *kq = kn->kn_fp->f_data;
knlist_remove(&kq->kq_sel.si_note, kn, 0);
}
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_fp->f_data;
kn->kn_data = kq->kq_count;
return (kn->kn_data > 0);
}
/* XXX - move to kern_proc.c? */
static int
filt_procattach(struct knote *kn)
{
struct proc *p;
int immediate;
int error;
immediate = 0;
p = pfind(kn->kn_id);
if (p == NULL && (kn->kn_sfflags & NOTE_EXIT)) {
p = zpfind(kn->kn_id);
immediate = 1;
} else if (p != NULL && (p->p_flag & P_WEXIT)) {
immediate = 1;
}
if (p == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
kn->kn_ptr.p_proc = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
/*
* internal flag indicating registration done by kernel
*/
if (kn->kn_flags & EV_FLAG1) {
kn->kn_data = kn->kn_sdata; /* ppid */
kn->kn_fflags = NOTE_CHILD;
kn->kn_flags &= ~EV_FLAG1;
}
if (immediate == 0)
knlist_add(&p->p_klist, kn, 1);
/*
* Immediately activate any exit notes if the target process is a
* zombie. This is necessary to handle the case where the target
* process, e.g. a child, dies before the kevent is registered.
*/
if (immediate && filt_proc(kn, NOTE_EXIT))
KNOTE_ACTIVATE(kn, 0);
PROC_UNLOCK(p);
return (0);
}
/*
* The knote may be attached to a different process, which may exit,
* leaving nothing for the knote to be attached to. So when the process
* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
* it will be deleted when read out. However, as part of the knote deletion,
* this routine is called, so a check is needed to avoid actually performing
* a detach, because the original process does not exist any more.
*/
/* XXX - move to kern_proc.c? */
static void
filt_procdetach(struct knote *kn)
{
struct proc *p;
p = kn->kn_ptr.p_proc;
knlist_remove(&p->p_klist, kn, 0);
kn->kn_ptr.p_proc = NULL;
}
/* XXX - move to kern_proc.c? */
static int
filt_proc(struct knote *kn, long hint)
{
struct proc *p = kn->kn_ptr.p_proc;
u_int event;
/*
* mask off extra data
*/
event = (u_int)hint & NOTE_PCTRLMASK;
/*
* if the user is interested in this event, record it.
*/
if (kn->kn_sfflags & event)
kn->kn_fflags |= event;
/*
* process is gone, so flag the event as finished.
*/
if (event == NOTE_EXIT) {
if (!(kn->kn_status & KN_DETACHED))
knlist_remove_inevent(&p->p_klist, kn);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = p->p_xstat;
kn->kn_ptr.p_proc = NULL;
return (1);
}
return (kn->kn_fflags != 0);
}
/*
* Called when a process forks. It does mostly the same as knote(),
* activating all knotes registered to be activated when the process
* forks. Additionally, for each knote attached to the parent, check
* whether the user wants to track the new process. If so, attach a new
* knote to the child and immediately report an event with the child's
* pid.
*/
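/*
 * Illustrative userland sketch of NOTE_TRACK (error handling omitted; kq
 * and pid are assumed to exist):
 *
 *	#include <sys/types.h>
 *	#include <sys/event.h>
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_TRACK, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * When "pid" forks, the parent's knote reports NOTE_FORK, and a new knote
 * is registered on the child; that knote's event is identified by the
 * child's pid and carries NOTE_CHILD, with the parent's pid in data.
 */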
void
knote_fork(struct knlist *list, int pid)
{
struct kqueue *kq;
struct knote *kn;
struct kevent kev;
int error;
if (list == NULL)
return;
list->kl_lock(list->kl_lockarg);
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX)
continue;
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_UNLOCK(kq);
continue;
}
/*
* The same as knote(), activate the event.
*/
if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
kn->kn_status |= KN_HASKQLOCK;
if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
KNOTE_ACTIVATE(kn, 1);
kn->kn_status &= ~KN_HASKQLOCK;
KQ_UNLOCK(kq);
continue;
}
/*
* The NOTE_TRACK case. In addition to the activation
* of the event, we need to register new event to
* track the child. Drop the locks in preparation for
* the call to kqueue_register().
*/
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
list->kl_unlock(list->kl_lockarg);
/*
* Activate existing knote and register a knote with
* new process.
*/
kev.ident = pid;
kev.filter = kn->kn_filter;
kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
kev.fflags = kn->kn_sfflags;
kev.data = kn->kn_id; /* parent */
kev.udata = kn->kn_kevent.udata;/* preserve udata */
error = kqueue_register(kq, &kev, NULL, 0);
if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
KNOTE_ACTIVATE(kn, 0);
if (error)
kn->kn_fflags |= NOTE_TRACKERR;
KQ_LOCK(kq);
kn->kn_status &= ~KN_INFLUX;
KQ_UNLOCK_FLUX(kq);
list->kl_lock(list->kl_lockarg);
}
list->kl_unlock(list->kl_lockarg);
}
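/*
 * Convert a timer period expressed in milliseconds (the knote's kn_sdata)
 * into callout ticks.
 */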
static int
timertoticks(intptr_t data)
{
struct timeval tv;
int tticks;
tv.tv_sec = data / 1000;
tv.tv_usec = (data % 1000) * 1000;
tticks = tvtohz(&tv);
return tticks;
}
/* XXX - move to kern_timeout.c? */
static void
filt_timerexpire(void *knx)
{
struct knote *kn = knx;
struct callout *calloutp;
kn->kn_data++;
KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
calloutp = (struct callout *)kn->kn_hook;
callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
filt_timerexpire, kn);
}
}
/*
* data contains the amount of time to sleep, in milliseconds
*/
/* XXX - move to kern_timeout.c? */
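/*
 * Illustrative userland sketch (the ident and period are arbitrary):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, 0, 500, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * arms a periodic timer firing roughly every 500 ms; each expiry bumps
 * kn_data, and since EV_CLEAR is set automatically the accumulated count
 * is returned and reset when the event is harvested.
 */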
static int
filt_timerattach(struct knote *kn)
{
struct callout *calloutp;
atomic_add_int(&kq_ncallouts, 1);
if (kq_ncallouts >= kq_calloutmax) {
atomic_add_int(&kq_ncallouts, -1);
return (ENOMEM);
}
kn->kn_flags |= EV_CLEAR; /* automatically set */
kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */
calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
callout_init(calloutp, CALLOUT_MPSAFE);
kn->kn_hook = calloutp;
callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
filt_timerexpire, kn);
return (0);
}
/* XXX - move to kern_timeout.c? */
static void
filt_timerdetach(struct knote *kn)
{
struct callout *calloutp;
calloutp = (struct callout *)kn->kn_hook;
callout_drain(calloutp);
free(calloutp, M_KQUEUE);
atomic_add_int(&kq_ncallouts, -1);
kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */
}
/* XXX - move to kern_timeout.c? */
static int
filt_timer(struct knote *kn, long hint)
{
return (kn->kn_data != 0);
}
static int
filt_userattach(struct knote *kn)
{
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
kn->kn_hook = NULL;
if (kn->kn_fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
else
kn->kn_hookid = 0;
return (0);
}
static void
filt_userdetach(__unused struct knote *kn)
{
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
}
static int
filt_user(struct knote *kn, __unused long hint)
{
return (kn->kn_hookid);
}
static void
filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
{
u_int ffctrl;
switch (type) {
case EVENT_REGISTER:
if (kev->fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
ffctrl = kev->fflags & NOTE_FFCTRLMASK;
kev->fflags &= NOTE_FFLAGSMASK;
switch (ffctrl) {
case NOTE_FFNOP:
break;
case NOTE_FFAND:
kn->kn_sfflags &= kev->fflags;
break;
case NOTE_FFOR:
kn->kn_sfflags |= kev->fflags;
break;
case NOTE_FFCOPY:
kn->kn_sfflags = kev->fflags;
break;
default:
/* XXX Return error? */
break;
}
kn->kn_sdata = kev->data;
if (kev->flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
kev->fflags = kn->kn_sfflags;
kev->data = kn->kn_sdata;
if (kn->kn_flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
default:
panic("filt_usertouch() - invalid type (%ld)", type);
break;
}
}
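/*
 * Illustrative userland sketch of EVFILT_USER (the ident is arbitrary;
 * error handling omitted):
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 *	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * The second change sets kn_hookid through filt_usertouch(), so the next
 * kevent() wait returns the event; EV_CLEAR rearms it afterwards.
 */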
int
-kqueue(struct thread *td, struct kqueue_args *uap)
+sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
struct filedesc *fdp;
struct kqueue *kq;
struct file *fp;
int fd, error;
fdp = td->td_proc->p_fd;
error = falloc(td, &fp, &fd, 0);
if (error)
goto done2;
/* An extra reference on `nfp' has been held for us by falloc(). */
kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
TAILQ_INIT(&kq->kq_head);
kq->kq_fdp = fdp;
knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
FILEDESC_XLOCK(fdp);
SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
FILEDESC_XUNLOCK(fdp);
finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
fdrop(fp, td);
td->td_retval[0] = fd;
done2:
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct kevent_args {
int fd;
const struct kevent *changelist;
int nchanges;
struct kevent *eventlist;
int nevents;
const struct timespec *timeout;
};
#endif
int
-kevent(struct thread *td, struct kevent_args *uap)
+sys_kevent(struct thread *td, struct kevent_args *uap)
{
struct timespec ts, *tsp;
struct kevent_copyops k_ops = { uap,
kevent_copyout,
kevent_copyin};
int error;
#ifdef KTRACE
struct uio ktruio;
struct iovec ktriov;
struct uio *ktruioin = NULL;
struct uio *ktruioout = NULL;
#endif
if (uap->timeout != NULL) {
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO)) {
ktriov.iov_base = uap->changelist;
ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
.uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
.uio_td = td };
ktruioin = cloneuio(&ktruio);
ktriov.iov_base = uap->eventlist;
ktriov.iov_len = uap->nevents * sizeof(struct kevent);
ktruioout = cloneuio(&ktruio);
}
#endif
error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
&k_ops, tsp);
#ifdef KTRACE
if (ktruioin != NULL) {
ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
ktrgenio(uap->fd, UIO_READ, ktruioout, error);
}
#endif
return (error);
}
/*
* Copy 'count' items into the destination list pointed to by uap->eventlist.
*/
static int
kevent_copyout(void *arg, struct kevent *kevp, int count)
{
struct kevent_args *uap;
int error;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct kevent_args *)arg;
error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
if (error == 0)
uap->eventlist += count;
return (error);
}
/*
* Copy 'count' items from the list pointed to by uap->changelist.
*/
static int
kevent_copyin(void *arg, struct kevent *kevp, int count)
{
struct kevent_args *uap;
int error;
KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
uap = (struct kevent_args *)arg;
error = copyin(uap->changelist, kevp, count * sizeof *kevp);
if (error == 0)
uap->changelist += count;
return (error);
}
int
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout)
{
struct kevent keva[KQ_NEVENTS];
struct kevent *kevp, *changes;
struct kqueue *kq;
struct file *fp;
int i, n, nerrors, error;
if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
return (error);
if ((error = kqueue_acquire(fp, &kq)) != 0)
goto done_norel;
nerrors = 0;
while (nchanges > 0) {
n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
error = k_ops->k_copyin(k_ops->arg, keva, n);
if (error)
goto done;
changes = keva;
for (i = 0; i < n; i++) {
kevp = &changes[i];
if (!kevp->filter)
continue;
kevp->flags &= ~EV_SYSFLAGS;
error = kqueue_register(kq, kevp, td, 1);
if (error || (kevp->flags & EV_RECEIPT)) {
if (nevents != 0) {
kevp->flags = EV_ERROR;
kevp->data = error;
(void) k_ops->k_copyout(k_ops->arg,
kevp, 1);
nevents--;
nerrors++;
} else {
goto done;
}
}
}
nchanges -= n;
}
if (nerrors) {
td->td_retval[0] = nerrors;
error = 0;
goto done;
}
error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
done:
kqueue_release(kq, 0);
done_norel:
fdrop(fp, td);
return (error);
}
int
kqueue_add_filteropts(int filt, struct filterops *filtops)
{
int error;
error = 0;
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0) {
printf(
"trying to add a filterop that is out of range: %d is beyond %d\n",
~filt, EVFILT_SYSCOUNT);
return EINVAL;
}
mtx_lock(&filterops_lock);
if (sysfilt_ops[~filt].for_fop != &null_filtops &&
sysfilt_ops[~filt].for_fop != NULL)
error = EEXIST;
else {
sysfilt_ops[~filt].for_fop = filtops;
sysfilt_ops[~filt].for_refcnt = 0;
}
mtx_unlock(&filterops_lock);
return (error);
}
int
kqueue_del_filteropts(int filt)
{
int error;
error = 0;
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return EINVAL;
mtx_lock(&filterops_lock);
if (sysfilt_ops[~filt].for_fop == &null_filtops ||
sysfilt_ops[~filt].for_fop == NULL)
error = EINVAL;
else if (sysfilt_ops[~filt].for_refcnt != 0)
error = EBUSY;
else {
sysfilt_ops[~filt].for_fop = &null_filtops;
sysfilt_ops[~filt].for_refcnt = 0;
}
mtx_unlock(&filterops_lock);
return error;
}
static struct filterops *
kqueue_fo_find(int filt)
{
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return NULL;
mtx_lock(&filterops_lock);
sysfilt_ops[~filt].for_refcnt++;
if (sysfilt_ops[~filt].for_fop == NULL)
sysfilt_ops[~filt].for_fop = &null_filtops;
mtx_unlock(&filterops_lock);
return sysfilt_ops[~filt].for_fop;
}
static void
kqueue_fo_release(int filt)
{
if (filt > 0 || filt + EVFILT_SYSCOUNT < 0)
return;
mtx_lock(&filterops_lock);
KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
("filter object refcount not valid on release"));
sysfilt_ops[~filt].for_refcnt--;
mtx_unlock(&filterops_lock);
}
/*
* A ref to kq (obtained via kqueue_acquire) must be held. waitok will
* influence whether memory allocation should wait. Make sure it is 0 if you
* hold any mutexes.
*/
static int
kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
{
struct filterops *fops;
struct file *fp;
struct knote *kn, *tkn;
int error, filt, event;
int haskqglobal;
fp = NULL;
kn = NULL;
error = 0;
haskqglobal = 0;
filt = kev->filter;
fops = kqueue_fo_find(filt);
if (fops == NULL)
return EINVAL;
tkn = knote_alloc(waitok); /* prevent waiting with locks */
findkn:
if (fops->f_isfd) {
KASSERT(td != NULL, ("td is NULL"));
error = fget(td, kev->ident, CAP_POLL_EVENT, &fp);
if (error)
goto done;
if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
kev->ident, 0) != 0) {
/* try again */
fdrop(fp, td);
fp = NULL;
error = kqueue_expand(kq, fops, kev->ident, waitok);
if (error)
goto done;
goto findkn;
}
if (fp->f_type == DTYPE_KQUEUE) {
/*
* if we add some intelligence about what we are doing,
* we should be able to support events on ourselves.
* We need to know when we are doing this to prevent
* getting both the knlist lock and the kq lock since
* they are the same thing.
*/
if (fp->f_data == kq) {
error = EINVAL;
goto done;
}
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
}
KQ_LOCK(kq);
if (kev->ident < kq->kq_knlistsize) {
SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
if (kev->filter == kn->kn_filter)
break;
}
} else {
if ((kev->flags & EV_ADD) == EV_ADD)
kqueue_expand(kq, fops, kev->ident, waitok);
KQ_LOCK(kq);
if (kq->kq_knhashmask != 0) {
struct klist *list;
list = &kq->kq_knhash[
KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
SLIST_FOREACH(kn, list, kn_link)
if (kev->ident == kn->kn_id &&
kev->filter == kn->kn_filter)
break;
}
}
/* knote is in the process of changing, wait for it to stabilize. */
if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
if (fp != NULL) {
fdrop(fp, td);
fp = NULL;
}
goto findkn;
}
/*
* kn now contains the matching knote, or NULL if no match
*/
if (kn == NULL) {
if (kev->flags & EV_ADD) {
kn = tkn;
tkn = NULL;
if (kn == NULL) {
KQ_UNLOCK(kq);
error = ENOMEM;
goto done;
}
kn->kn_fp = fp;
kn->kn_kq = kq;
kn->kn_fop = fops;
/*
* apply reference counts to knote structure, and
* do not release it at the end of this routine.
*/
fops = NULL;
fp = NULL;
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
kev->fflags = 0;
kev->data = 0;
kn->kn_kevent = *kev;
kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
EV_ENABLE | EV_DISABLE);
kn->kn_status = KN_INFLUX|KN_DETACHED;
error = knote_attach(kn, kq);
KQ_UNLOCK(kq);
if (error != 0) {
tkn = kn;
goto done;
}
if ((error = kn->kn_fop->f_attach(kn)) != 0) {
knote_drop(kn, td);
goto done;
}
KN_LIST_LOCK(kn);
goto done_ev_add;
} else {
/* No matching knote and the EV_ADD flag is not set. */
KQ_UNLOCK(kq);
error = ENOENT;
goto done;
}
}
if (kev->flags & EV_DELETE) {
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
goto done;
}
/*
* The user may change some filter values after the initial EV_ADD,
* but doing so will not reset any filter which has already been
* triggered.
*/
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
KN_LIST_LOCK(kn);
kn->kn_kevent.udata = kev->udata;
if (!fops->f_isfd && fops->f_touch != NULL) {
fops->f_touch(kn, kev, EVENT_REGISTER);
} else {
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
}
/*
* We can get here with kn->kn_knlist == NULL. This can happen when
* the initial attach event decides that the event is "completed"
* already. i.e. filt_procattach is called on a zombie process. It
* will call filt_proc which will remove it from the list, and NULL
* kn_knlist.
*/
done_ev_add:
event = kn->kn_fop->f_event(kn, 0);
KQ_LOCK(kq);
if (event)
KNOTE_ACTIVATE(kn, 1);
kn->kn_status &= ~KN_INFLUX;
KN_LIST_UNLOCK(kn);
if ((kev->flags & EV_DISABLE) &&
((kn->kn_status & KN_DISABLED) == 0)) {
kn->kn_status |= KN_DISABLED;
}
if ((kev->flags & EV_ENABLE) && (kn->kn_status & KN_DISABLED)) {
kn->kn_status &= ~KN_DISABLED;
if ((kn->kn_status & KN_ACTIVE) &&
((kn->kn_status & KN_QUEUED) == 0))
knote_enqueue(kn);
}
KQ_UNLOCK_FLUX(kq);
done:
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
if (fp != NULL)
fdrop(fp, td);
if (tkn != NULL)
knote_free(tkn);
if (fops != NULL)
kqueue_fo_release(filt);
return (error);
}
static int
kqueue_acquire(struct file *fp, struct kqueue **kqp)
{
int error;
struct kqueue *kq;
error = 0;
kq = fp->f_data;
if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
return (EBADF);
*kqp = kq;
KQ_LOCK(kq);
if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
KQ_UNLOCK(kq);
return (EBADF);
}
kq->kq_refcnt++;
KQ_UNLOCK(kq);
return error;
}
static void
kqueue_release(struct kqueue *kq, int locked)
{
if (locked)
KQ_OWNED(kq);
else
KQ_LOCK(kq);
kq->kq_refcnt--;
if (kq->kq_refcnt == 1)
wakeup(&kq->kq_refcnt);
if (!locked)
KQ_UNLOCK(kq);
}
static void
kqueue_schedtask(struct kqueue *kq)
{
KQ_OWNED(kq);
KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
("scheduling kqueue task while draining"));
if ((kq->kq_state & KQ_TASKSCHED) != KQ_TASKSCHED) {
taskqueue_enqueue(taskqueue_kqueue, &kq->kq_task);
kq->kq_state |= KQ_TASKSCHED;
}
}
/*
* Expand the kq to make sure we have storage for fops/ident pair.
*
* Return 0 on success (or no work necessary), return errno on failure.
*
* Not calling hashinit w/ waitok (proper malloc flag) should be safe.
* If kqueue_register is called from a non-fd context, there usually
* should be no locks held.
*/
static int
kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
int waitok)
{
struct klist *list, *tmp_knhash, *to_free;
u_long tmp_knhashmask;
int size;
int fd;
int mflag = waitok ? M_WAITOK : M_NOWAIT;
KQ_NOTOWNED(kq);
to_free = NULL;
if (fops->f_isfd) {
fd = ident;
if (kq->kq_knlistsize <= fd) {
size = kq->kq_knlistsize;
while (size <= fd)
size += KQEXTENT;
list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
if (list == NULL)
return ENOMEM;
KQ_LOCK(kq);
if (kq->kq_knlistsize > fd) {
to_free = list;
list = NULL;
} else {
if (kq->kq_knlist != NULL) {
bcopy(kq->kq_knlist, list,
kq->kq_knlistsize * sizeof(*list));
to_free = kq->kq_knlist;
kq->kq_knlist = NULL;
}
bzero((caddr_t)list +
kq->kq_knlistsize * sizeof(*list),
(size - kq->kq_knlistsize) * sizeof(*list));
kq->kq_knlistsize = size;
kq->kq_knlist = list;
}
KQ_UNLOCK(kq);
}
} else {
if (kq->kq_knhashmask == 0) {
tmp_knhash = hashinit(KN_HASHSIZE, M_KQUEUE,
&tmp_knhashmask);
if (tmp_knhash == NULL)
return ENOMEM;
KQ_LOCK(kq);
if (kq->kq_knhashmask == 0) {
kq->kq_knhash = tmp_knhash;
kq->kq_knhashmask = tmp_knhashmask;
} else {
to_free = tmp_knhash;
}
KQ_UNLOCK(kq);
}
}
free(to_free, M_KQUEUE);
KQ_NOTOWNED(kq);
return 0;
}
static void
kqueue_task(void *arg, int pending)
{
struct kqueue *kq;
int haskqglobal;
haskqglobal = 0;
kq = arg;
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
KQ_LOCK(kq);
KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
kq->kq_state &= ~KQ_TASKSCHED;
if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
wakeup(&kq->kq_state);
}
KQ_UNLOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
}
/*
* Scan, update kn_data (if not ONESHOT), and copyout triggered events.
* We treat KN_MARKER knotes as if they are INFLUX.
*/
static int
kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
struct kevent *kevp;
struct timeval atv, rtv, ttv;
struct knote *kn, *marker;
int count, timeout, nkev, error, influx;
int haskqglobal, touch;
count = maxevents;
nkev = 0;
error = 0;
haskqglobal = 0;
if (maxevents == 0)
goto done_nl;
if (tsp != NULL) {
TIMESPEC_TO_TIMEVAL(&atv, tsp);
if (itimerfix(&atv)) {
error = EINVAL;
goto done_nl;
}
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
timeout = -1;
else
timeout = atv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&atv);
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
timeout = 0;
}
marker = knote_alloc(1);
if (marker == NULL) {
error = ENOMEM;
goto done_nl;
}
marker->kn_status = KN_MARKER;
KQ_LOCK(kq);
goto start;
retry:
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
goto done;
ttv = atv;
timevalsub(&ttv, &rtv);
timeout = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
start:
kevp = keva;
if (kq->kq_count == 0) {
if (timeout < 0) {
error = EWOULDBLOCK;
} else {
kq->kq_state |= KQ_SLEEP;
error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
"kqread", timeout);
}
if (error == 0)
goto retry;
/* don't restart after signals... */
if (error == ERESTART)
error = EINTR;
else if (error == EWOULDBLOCK)
error = 0;
goto done;
}
TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
influx = 0;
while (count) {
KQ_OWNED(kq);
kn = TAILQ_FIRST(&kq->kq_head);
if ((kn->kn_status == KN_MARKER && kn != marker) ||
(kn->kn_status & KN_INFLUX) == KN_INFLUX) {
if (influx) {
influx = 0;
KQ_FLUX_WAKEUP(kq);
}
kq->kq_state |= KQ_FLUXWAIT;
error = msleep(kq, &kq->kq_lock, PSOCK,
"kqflxwt", 0);
continue;
}
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
continue;
}
if (kn == marker) {
KQ_FLUX_WAKEUP(kq);
if (count == maxevents)
goto retry;
goto done;
}
KASSERT((kn->kn_status & KN_INFLUX) == 0,
("KN_INFLUX set when not suppose to be"));
if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_INFLUX;
kq->kq_count--;
KQ_UNLOCK(kq);
/*
* We don't need to lock the list since we've marked
* it _INFLUX.
*/
*kevp = kn->kn_kevent;
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
KQ_LOCK(kq);
kn = NULL;
} else {
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
KN_LIST_LOCK(kn);
if (kn->kn_fop->f_event(kn, 0) == 0) {
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
kn->kn_status &=
~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
kq->kq_count--;
KN_LIST_UNLOCK(kn);
influx = 1;
continue;
}
touch = (!kn->kn_fop->f_isfd &&
kn->kn_fop->f_touch != NULL);
if (touch)
kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
else
*kevp = kn->kn_kevent;
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
/*
* Manually clear knotes that weren't
* 'touch'ed.
*/
if (touch == 0 && kn->kn_flags & EV_CLEAR) {
kn->kn_data = 0;
kn->kn_fflags = 0;
}
if (kn->kn_flags & EV_DISPATCH)
kn->kn_status |= KN_DISABLED;
kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
kq->kq_count--;
} else
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~(KN_INFLUX);
KN_LIST_UNLOCK(kn);
influx = 1;
}
/* we are returning a copy to the user */
kevp++;
nkev++;
count--;
if (nkev == KQ_NEVENTS) {
influx = 0;
KQ_UNLOCK_FLUX(kq);
error = k_ops->k_copyout(k_ops->arg, keva, nkev);
nkev = 0;
kevp = keva;
KQ_LOCK(kq);
if (error)
break;
}
}
TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
done:
KQ_OWNED(kq);
KQ_UNLOCK_FLUX(kq);
knote_free(marker);
done_nl:
KQ_NOTOWNED(kq);
if (nkev != 0)
error = k_ops->k_copyout(k_ops->arg, keva, nkev);
td->td_retval[0] = maxevents - count;
return (error);
}
/*
* XXX
* This could be expanded to call kqueue_scan, if desired.
*/
/*ARGSUSED*/
static int
kqueue_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (ENXIO);
}
/*ARGSUSED*/
static int
kqueue_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (ENXIO);
}
/*ARGSUSED*/
static int
kqueue_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
/*ARGSUSED*/
static int
kqueue_ioctl(struct file *fp, u_long cmd, void *data,
struct ucred *active_cred, struct thread *td)
{
/*
* Enabling sigio causes two major problems:
* 1) infinite recursion:
* Synopsis: kevent is being used to track signals and has FIOASYNC
* set. On receipt of a signal this will cause a kqueue to recurse
* into itself over and over. Sending the sigio causes the kqueue
* to become ready, which in turn posts sigio again, forever.
* Solution: this can be solved by setting a flag in the kqueue that
* we have a SIGIO in progress.
* 2) locking problems:
* Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
* us above the proc and pgrp locks.
* Solution: Post a signal using an async mechanism, being sure to
* record a generation count in the delivery so that we do not deliver
* a signal to the wrong process.
*
* Note, these two mechanisms are somewhat mutually exclusive!
*/
#if 0
struct kqueue *kq;
kq = fp->f_data;
switch (cmd) {
case FIOASYNC:
if (*(int *)data) {
kq->kq_state |= KQ_ASYNC;
} else {
kq->kq_state &= ~KQ_ASYNC;
}
return (0);
case FIOSETOWN:
return (fsetown(*(int *)data, &kq->kq_sigio));
case FIOGETOWN:
*(int *)data = fgetown(&kq->kq_sigio);
return (0);
}
#endif
return (ENOTTY);
}
/*ARGSUSED*/
static int
kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct kqueue *kq;
int revents = 0;
int error;
if ((error = kqueue_acquire(fp, &kq)))
return POLLERR;
KQ_LOCK(kq);
if (events & (POLLIN | POLLRDNORM)) {
if (kq->kq_count) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
selrecord(td, &kq->kq_sel);
if (SEL_WAITING(&kq->kq_sel))
kq->kq_state |= KQ_SEL;
}
}
kqueue_release(kq, 1);
KQ_UNLOCK(kq);
return (revents);
}
/*ARGSUSED*/
static int
kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
struct thread *td)
{
bzero((void *)st, sizeof *st);
/*
* We no longer return kq_count because the unlocked value is useless.
* If you spent all this time getting the count, why not spend your
* syscall better by calling kevent?
*
* XXX - This is needed for libc_r.
*/
st->st_mode = S_IFIFO;
return (0);
}
/*ARGSUSED*/
static int
kqueue_close(struct file *fp, struct thread *td)
{
struct kqueue *kq = fp->f_data;
struct filedesc *fdp;
struct knote *kn;
int i;
int error;
if ((error = kqueue_acquire(fp, &kq)))
return error;
KQ_LOCK(kq);
KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
("kqueue already closing"));
kq->kq_state |= KQ_CLOSING;
if (kq->kq_refcnt > 1)
msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
fdp = kq->kq_fdp;
KASSERT(knlist_empty(&kq->kq_sel.si_note),
("kqueue's knlist not empty"));
for (i = 0; i < kq->kq_knlistsize; i++) {
while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
continue;
}
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
KQ_LOCK(kq);
}
}
if (kq->kq_knhashmask != 0) {
for (i = 0; i <= kq->kq_knhashmask; i++) {
while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK,
"kqclo2", 0);
continue;
}
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
KQ_LOCK(kq);
}
}
}
if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
kq->kq_state |= KQ_TASKDRAIN;
msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
}
if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
selwakeuppri(&kq->kq_sel, PSOCK);
if (!SEL_WAITING(&kq->kq_sel))
kq->kq_state &= ~KQ_SEL;
}
KQ_UNLOCK(kq);
FILEDESC_XLOCK(fdp);
SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
FILEDESC_XUNLOCK(fdp);
seldrain(&kq->kq_sel);
knlist_destroy(&kq->kq_sel.si_note);
mtx_destroy(&kq->kq_lock);
kq->kq_fdp = NULL;
if (kq->kq_knhash != NULL)
free(kq->kq_knhash, M_KQUEUE);
if (kq->kq_knlist != NULL)
free(kq->kq_knlist, M_KQUEUE);
funsetown(&kq->kq_sigio);
free(kq, M_KQUEUE);
fp->f_data = NULL;
return (0);
}
static void
kqueue_wakeup(struct kqueue *kq)
{
KQ_OWNED(kq);
if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
kq->kq_state &= ~KQ_SLEEP;
wakeup(kq);
}
if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
selwakeuppri(&kq->kq_sel, PSOCK);
if (!SEL_WAITING(&kq->kq_sel))
kq->kq_state &= ~KQ_SEL;
}
if (!knlist_empty(&kq->kq_sel.si_note))
kqueue_schedtask(kq);
if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC) {
pgsigio(&kq->kq_sigio, SIGIO, 0);
}
}
/*
* Walk down a list of knotes, activating them if their event has triggered.
*
* There is a possibility to optimize in the case of one kq watching another.
* Instead of scheduling a task to wake it up, you could pass enough state
* down the chain to make up the parent kqueue. Make this code functional
* first.
*/
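/*
 * Illustrative producer-side sketch (the softc, its selinfo member and the
 * mutex are hypothetical driver state):
 *
 *	mtx_lock(&sc->sc_mtx);
 *	... new data becomes available ...
 *	KNOTE_LOCKED(&sc->sc_rsel.si_note, 0);
 *	mtx_unlock(&sc->sc_mtx);
 *
 * KNOTE_LOCKED() and KNOTE_UNLOCKED() are thin wrappers that call knote()
 * with KNF_LISTLOCKED or 0, respectively.
 */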
void
knote(struct knlist *list, long hint, int lockflags)
{
struct kqueue *kq;
struct knote *kn;
int error;
if (list == NULL)
return;
KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
if ((lockflags & KNF_LISTLOCKED) == 0)
list->kl_lock(list->kl_lockarg);
/*
* If we unlock the list lock (and set KN_INFLUX), we can eliminate
* the kqueue scheduling, but this will introduce four
* lock/unlock's for each knote to test. If we do, continue to use
* SLIST_FOREACH; SLIST_FOREACH_SAFE is not safe in our case, as it is
* only safe if you want to remove the current item, which we are
* not doing.
*/
SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
kq = kn->kn_kq;
if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_UNLOCK(kq);
} else if ((lockflags & KNF_NOKQLOCK) != 0) {
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
error = kn->kn_fop->f_event(kn, hint);
KQ_LOCK(kq);
kn->kn_status &= ~KN_INFLUX;
if (error)
KNOTE_ACTIVATE(kn, 1);
KQ_UNLOCK_FLUX(kq);
} else {
kn->kn_status |= KN_HASKQLOCK;
if (kn->kn_fop->f_event(kn, hint))
KNOTE_ACTIVATE(kn, 1);
kn->kn_status &= ~KN_HASKQLOCK;
KQ_UNLOCK(kq);
}
}
kq = NULL;
}
if ((lockflags & KNF_LISTLOCKED) == 0)
list->kl_unlock(list->kl_lockarg);
}
/*
* add a knote to a knlist
*/
void
knlist_add(struct knlist *knl, struct knote *kn, int islocked)
{
KNL_ASSERT_LOCK(knl, islocked);
KQ_NOTOWNED(kn->kn_kq);
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) ==
(KN_INFLUX|KN_DETACHED), ("knote not KN_INFLUX and KN_DETACHED"));
if (!islocked)
knl->kl_lock(knl->kl_lockarg);
SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
if (!islocked)
knl->kl_unlock(knl->kl_lockarg);
KQ_LOCK(kn->kn_kq);
kn->kn_knlist = knl;
kn->kn_status &= ~KN_DETACHED;
KQ_UNLOCK(kn->kn_kq);
}
static void
knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked, int kqislocked)
{
KASSERT(!(!!kqislocked && !knlislocked), ("kq locked w/o knl locked"));
KNL_ASSERT_LOCK(knl, knlislocked);
mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
if (!kqislocked)
KASSERT((kn->kn_status & (KN_INFLUX|KN_DETACHED)) == KN_INFLUX,
("knlist_remove called w/o knote being KN_INFLUX or already removed"));
if (!knlislocked)
knl->kl_lock(knl->kl_lockarg);
SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
kn->kn_knlist = NULL;
if (!knlislocked)
knl->kl_unlock(knl->kl_lockarg);
if (!kqislocked)
KQ_LOCK(kn->kn_kq);
kn->kn_status |= KN_DETACHED;
if (!kqislocked)
KQ_UNLOCK(kn->kn_kq);
}
/*
* remove all knotes from a specified klist
*/
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
{
knlist_remove_kq(knl, kn, islocked, 0);
}
/*
* remove knote from a specified klist while in f_event handler.
*/
void
knlist_remove_inevent(struct knlist *knl, struct knote *kn)
{
knlist_remove_kq(knl, kn, 1,
(kn->kn_status & KN_HASKQLOCK) == KN_HASKQLOCK);
}
int
knlist_empty(struct knlist *knl)
{
KNL_ASSERT_LOCKED(knl);
return SLIST_EMPTY(&knl->kl_list);
}
static struct mtx knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);
static void
knlist_mtx_lock(void *arg)
{
mtx_lock((struct mtx *)arg);
}
static void
knlist_mtx_unlock(void *arg)
{
mtx_unlock((struct mtx *)arg);
}
static void
knlist_mtx_assert_locked(void *arg)
{
mtx_assert((struct mtx *)arg, MA_OWNED);
}
static void
knlist_mtx_assert_unlocked(void *arg)
{
mtx_assert((struct mtx *)arg, MA_NOTOWNED);
}
void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
void (*kl_unlock)(void *),
void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
{
if (lock == NULL)
knl->kl_lockarg = &knlist_lock;
else
knl->kl_lockarg = lock;
if (kl_lock == NULL)
knl->kl_lock = knlist_mtx_lock;
else
knl->kl_lock = kl_lock;
if (kl_unlock == NULL)
knl->kl_unlock = knlist_mtx_unlock;
else
knl->kl_unlock = kl_unlock;
if (kl_assert_locked == NULL)
knl->kl_assert_locked = knlist_mtx_assert_locked;
else
knl->kl_assert_locked = kl_assert_locked;
if (kl_assert_unlocked == NULL)
knl->kl_assert_unlocked = knlist_mtx_assert_unlocked;
else
knl->kl_assert_unlocked = kl_assert_unlocked;
SLIST_INIT(&knl->kl_list);
}
void
knlist_init_mtx(struct knlist *knl, struct mtx *lock)
{
knlist_init(knl, lock, NULL, NULL, NULL, NULL);
}
void
knlist_destroy(struct knlist *knl)
{
#ifdef INVARIANTS
/*
* if we run across this error, we need to find the offending
* driver and have it call knlist_clear.
*/
if (!SLIST_EMPTY(&knl->kl_list))
printf("WARNING: destroying knlist w/ knotes on it!\n");
#endif
knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
SLIST_INIT(&knl->kl_list);
}
/*
* Even if we are locked, we may need to drop the lock to allow any influx
* knotes time to "settle".
*/
void
knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
{
struct knote *kn, *kn2;
struct kqueue *kq;
if (islocked)
KNL_ASSERT_LOCKED(knl);
else {
KNL_ASSERT_UNLOCKED(knl);
again: /* need to reacquire lock since we have dropped it */
knl->kl_lock(knl->kl_lockarg);
}
SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
kq = kn->kn_kq;
KQ_LOCK(kq);
if ((kn->kn_status & KN_INFLUX)) {
KQ_UNLOCK(kq);
continue;
}
knlist_remove_kq(knl, kn, 1, 1);
if (killkn) {
kn->kn_status |= KN_INFLUX | KN_DETACHED;
KQ_UNLOCK(kq);
knote_drop(kn, td);
} else {
/* Make sure cleared knotes disappear soon */
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
KQ_UNLOCK(kq);
}
kq = NULL;
}
if (!SLIST_EMPTY(&knl->kl_list)) {
/* there are still KN_INFLUX remaining */
kn = SLIST_FIRST(&knl->kl_list);
kq = kn->kn_kq;
KQ_LOCK(kq);
KASSERT(kn->kn_status & KN_INFLUX,
("knote removed w/o list lock"));
knl->kl_unlock(knl->kl_lockarg);
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
kq = NULL;
goto again;
}
if (islocked)
KNL_ASSERT_LOCKED(knl);
else {
knl->kl_unlock(knl->kl_lockarg);
KNL_ASSERT_UNLOCKED(knl);
}
}
/*
* Remove all knotes referencing a specified fd. This must be called with
* the FILEDESC lock held, which prevents a race where a new fd comes along
* and occupies the entry and we attach a knote to the fd.
*/
void
knote_fdclose(struct thread *td, int fd)
{
struct filedesc *fdp = td->td_proc->p_fd;
struct kqueue *kq;
struct knote *kn;
int influx;
FILEDESC_XLOCK_ASSERT(fdp);
/*
* We shouldn't have to worry about new kevents appearing on fd
* since filedesc is locked.
*/
SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
KQ_LOCK(kq);
again:
influx = 0;
while (kq->kq_knlistsize > fd &&
(kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
if (kn->kn_status & KN_INFLUX) {
/* someone else might be waiting on our knote */
if (influx)
wakeup(kq);
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
goto again;
}
kn->kn_status |= KN_INFLUX;
KQ_UNLOCK(kq);
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
knote_drop(kn, td);
influx = 1;
KQ_LOCK(kq);
}
KQ_UNLOCK_FLUX(kq);
}
}
static int
knote_attach(struct knote *kn, struct kqueue *kq)
{
struct klist *list;
KASSERT(kn->kn_status & KN_INFLUX, ("knote not marked INFLUX"));
KQ_OWNED(kq);
if (kn->kn_fop->f_isfd) {
if (kn->kn_id >= kq->kq_knlistsize)
return ENOMEM;
list = &kq->kq_knlist[kn->kn_id];
} else {
if (kq->kq_knhash == NULL)
return ENOMEM;
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
}
SLIST_INSERT_HEAD(list, kn, kn_link);
return 0;
}
/*
* knote must already have been detached using the f_detach method.
* no lock needs to be held; it is assumed that the KN_INFLUX flag is set
* to prevent other removal.
*/
static void
knote_drop(struct knote *kn, struct thread *td)
{
struct kqueue *kq;
struct klist *list;
kq = kn->kn_kq;
KQ_NOTOWNED(kq);
KASSERT((kn->kn_status & KN_INFLUX) == KN_INFLUX,
("knote_drop called without KN_INFLUX set in kn_status"));
KQ_LOCK(kq);
if (kn->kn_fop->f_isfd)
list = &kq->kq_knlist[kn->kn_id];
else
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
if (!SLIST_EMPTY(list))
SLIST_REMOVE(list, kn, knote, kn_link);
if (kn->kn_status & KN_QUEUED)
knote_dequeue(kn);
KQ_UNLOCK_FLUX(kq);
if (kn->kn_fop->f_isfd) {
fdrop(kn->kn_fp, td);
kn->kn_fp = NULL;
}
kqueue_fo_release(kn->kn_kevent.filter);
kn->kn_fop = NULL;
knote_free(kn);
}
static void
knote_enqueue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
KQ_OWNED(kn->kn_kq);
KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kn->kn_status |= KN_QUEUED;
kq->kq_count++;
kqueue_wakeup(kq);
}
static void
knote_dequeue(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
KQ_OWNED(kn->kn_kq);
KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq->kq_count--;
}
static void
knote_init(void)
{
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
static struct knote *
knote_alloc(int waitok)
{
return ((struct knote *)uma_zalloc(knote_zone,
(waitok ? M_WAITOK : M_NOWAIT)|M_ZERO));
}
static void
knote_free(struct knote *kn)
{
if (kn != NULL)
uma_zfree(knote_zone, kn);
}
/*
* Register the kev w/ the kq specified by fd.
*/
int
kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
{
struct kqueue *kq;
struct file *fp;
int error;
if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
return (error);
if ((error = kqueue_acquire(fp, &kq)) != 0)
goto noacquire;
error = kqueue_register(kq, kev, td, waitok);
kqueue_release(kq, 0);
noacquire:
fdrop(fp, td);
return error;
}
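
For context (illustrative only, not part of this diff): kqfd_register() above is the in-kernel counterpart of what userland reaches through kqueue(2) and kevent(2). A minimal consumer that waits for a descriptor to become readable might look roughly like this sketch:

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent change, event;
    int kq;

    if ((kq = kqueue()) == -1)
        return (1);
    /* EV_ADD installs a knote for stdin on the read filter. */
    EV_SET(&change, STDIN_FILENO, EVFILT_READ, EV_ADD | EV_ENABLE,
        0, 0, NULL);
    if (kevent(kq, &change, 1, NULL, 0, NULL) == -1)
        return (1);
    /* Block until the filter fires, then report the pending byte count. */
    if (kevent(kq, NULL, 0, &event, 1, NULL) == 1)
        printf("fd %d readable, %jd bytes pending\n",
            (int)event.ident, (intmax_t)event.data);
    close(kq);
    return (0);
}
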
Index: head/sys/kern/kern_exec.c
===================================================================
--- head/sys/kern/kern_exec.c (revision 225616)
+++ head/sys/kern/kern_exec.c (revision 225617)
@@ -1,1577 +1,1577 @@
/*-
* Copyright (c) 1993, David Greenman
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_hwpmc_hooks.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/filedesc.h>
#include <sys/fcntl.h>
#include <sys/acct.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/imgact_elf.h>
#include <sys/wait.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sf_buf.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/shm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#include <machine/reg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_execexit_func_t dtrace_fasttrap_exec;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , exec, exec);
SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *");
SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int");
SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success);
SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *");
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
static int do_execve(struct thread *td, struct image_args *args,
struct mac *mac_p);
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
NULL, 0, sysctl_kern_ps_strings, "LU", "");
/* XXX This should be vm_size_t. */
SYSCTL_PROC(_kern, KERN_USRSTACK, usrstack, CTLTYPE_ULONG|CTLFLAG_RD|
CTLFLAG_CAPRD, NULL, 0, sysctl_kern_usrstack, "LU", "");
SYSCTL_PROC(_kern, OID_AUTO, stackprot, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, sysctl_kern_stackprot, "I", "");
u_long ps_arg_cache_limit = PAGE_SIZE / 16;
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
&ps_arg_cache_limit, 0, "");
static int map_at_zero = 0;
TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
"Permit processes to map an object at virtual address 0.");
static int
sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
int error;
p = curproc;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
unsigned int val;
val = (unsigned int)p->p_sysent->sv_psstrings;
error = SYSCTL_OUT(req, &val, sizeof(val));
} else
#endif
error = SYSCTL_OUT(req, &p->p_sysent->sv_psstrings,
sizeof(p->p_sysent->sv_psstrings));
return error;
}
static int
sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
int error;
p = curproc;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
unsigned int val;
val = (unsigned int)p->p_sysent->sv_usrstack;
error = SYSCTL_OUT(req, &val, sizeof(val));
} else
#endif
error = SYSCTL_OUT(req, &p->p_sysent->sv_usrstack,
sizeof(p->p_sysent->sv_usrstack));
return error;
}
static int
sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
p = curproc;
return (SYSCTL_OUT(req, &p->p_sysent->sv_stackprot,
sizeof(p->p_sysent->sv_stackprot)));
}
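
These three handlers just export per-ABI constants from the current process's sysentvec. Purely for illustration (not from the diff), a userland reader of kern.ps_strings and kern.usrstack could be:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    unsigned long psstrings, usrstack;
    size_t len;
    int mib[2];

    mib[0] = CTL_KERN;
    mib[1] = KERN_PS_STRINGS;
    len = sizeof(psstrings);
    if (sysctl(mib, 2, &psstrings, &len, NULL, 0) == -1)
        return (1);
    mib[1] = KERN_USRSTACK;
    len = sizeof(usrstack);
    if (sysctl(mib, 2, &usrstack, &len, NULL, 0) == -1)
        return (1);
    printf("ps_strings at %#lx, user stack top at %#lx\n",
        psstrings, usrstack);
    return (0);
}
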
/*
* Each of the items is a pointer to a `const struct execsw', hence the
* double pointer here.
*/
static const struct execsw **execsw;
#ifndef _SYS_SYSPROTO_H_
struct execve_args {
char *fname;
char **argv;
char **envv;
};
#endif
int
-execve(td, uap)
+sys_execve(td, uap)
struct thread *td;
struct execve_args /* {
char *fname;
char **argv;
char **envv;
} */ *uap;
{
int error;
struct image_args args;
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, NULL);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct fexecve_args {
int fd;
char **argv;
char **envv;
};
#endif
int
-fexecve(struct thread *td, struct fexecve_args *uap)
+sys_fexecve(struct thread *td, struct fexecve_args *uap)
{
int error;
struct image_args args;
error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
uap->argv, uap->envv);
if (error == 0) {
args.fd = uap->fd;
error = kern_execve(td, &args, NULL);
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct __mac_execve_args {
char *fname;
char **argv;
char **envv;
struct mac *mac_p;
};
#endif
int
-__mac_execve(td, uap)
+sys___mac_execve(td, uap)
struct thread *td;
struct __mac_execve_args /* {
char *fname;
char **argv;
char **envv;
struct mac *mac_p;
} */ *uap;
{
#ifdef MAC
int error;
struct image_args args;
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, uap->mac_p);
return (error);
#else
return (ENOSYS);
#endif
}
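
The three wrappers above differ only in how the image and credentials are named; all of the work happens in kern_execve(). From userland the corresponding entry points are execve(2) and fexecve(2); a rough, illustrative sketch (not part of the change):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    char *argv[] = { "ls", "-l", "/", NULL };
    char *envp[] = { "PATH=/bin:/usr/bin", NULL };
    int fd;

    /* Descriptor-based exec: the kernel path fills args->fd, not fname. */
    if ((fd = open("/bin/ls", O_RDONLY)) != -1) {
        fexecve(fd, argv, envp);
        perror("fexecve");      /* reached only on failure */
        close(fd);
    }
    /* Path-based exec, the common case resolved via namei() below. */
    execve("/bin/ls", argv, envp);
    perror("execve");
    return (1);
}
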
/*
* XXX: kern_execve has the astonishing property of not always returning to
* the caller. If sufficiently bad things happen during the call to
* do_execve(), it can end up calling exit1(); as a result, callers must
* avoid doing anything which they might need to undo (e.g., allocating
* memory).
*/
int
kern_execve(td, args, mac_p)
struct thread *td;
struct image_args *args;
struct mac *mac_p;
{
struct proc *p = td->td_proc;
int error;
AUDIT_ARG_ARGV(args->begin_argv, args->argc,
args->begin_envv - args->begin_argv);
AUDIT_ARG_ENVV(args->begin_envv, args->envc,
args->endp - args->begin_envv);
if (p->p_flag & P_HADTHREADS) {
PROC_LOCK(p);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p);
exec_free_args(args);
return (ERESTART); /* Try again later. */
}
PROC_UNLOCK(p);
}
error = do_execve(td, args, mac_p);
if (p->p_flag & P_HADTHREADS) {
PROC_LOCK(p);
/*
* If success, we upgrade to SINGLE_EXIT state to
* force other threads to suicide.
*/
if (error == 0)
thread_single(SINGLE_EXIT);
else
thread_single_end();
PROC_UNLOCK(p);
}
return (error);
}
/*
* In-kernel implementation of execve(). All arguments are assumed to be
* userspace pointers from the passed thread.
*/
static int
do_execve(td, args, mac_p)
struct thread *td;
struct image_args *args;
struct mac *mac_p;
{
struct proc *p = td->td_proc;
struct nameidata nd;
struct ucred *newcred = NULL, *oldcred;
struct uidinfo *euip;
register_t *stack_base;
int error, i;
struct image_params image_params, *imgp;
struct vattr attr;
int (*img_first)(struct image_params *);
struct pargs *oldargs = NULL, *newargs = NULL;
struct sigacts *oldsigacts, *newsigacts;
#ifdef KTRACE
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
#endif
struct vnode *textvp = NULL, *binvp = NULL;
int credential_changing;
int vfslocked;
int textset;
#ifdef MAC
struct label *interpvplabel = NULL;
int will_transition;
#endif
#ifdef HWPMC_HOOKS
struct pmckern_procexec pe;
#endif
static const char fexecv_proc_title[] = "(fexecv)";
vfslocked = 0;
imgp = &image_params;
/*
* Lock the process and set the P_INEXEC flag to indicate that
* it should be left alone until we're done here. This is
* necessary to avoid race conditions - e.g. in ptrace() -
* that might allow a local user to illicitly obtain elevated
* privileges.
*/
PROC_LOCK(p);
KASSERT((p->p_flag & P_INEXEC) == 0,
("%s(): process already has P_INEXEC flag", __func__));
p->p_flag |= P_INEXEC;
PROC_UNLOCK(p);
/*
* Initialize part of the common data
*/
imgp->proc = p;
imgp->execlabel = NULL;
imgp->attr = &attr;
imgp->entry_addr = 0;
imgp->reloc_base = 0;
imgp->vmspace_destroyed = 0;
imgp->interpreted = 0;
imgp->opened = 0;
imgp->interpreter_name = NULL;
imgp->auxargs = NULL;
imgp->vp = NULL;
imgp->object = NULL;
imgp->firstpage = NULL;
imgp->ps_strings = 0;
imgp->auxarg_size = 0;
imgp->args = args;
imgp->execpath = imgp->freepath = NULL;
imgp->execpathp = 0;
imgp->canary = 0;
imgp->canarylen = 0;
imgp->pagesizes = 0;
imgp->pagesizeslen = 0;
imgp->stack_prot = 0;
#ifdef MAC
error = mac_execve_enter(imgp, mac_p);
if (error)
goto exec_fail;
#endif
imgp->image_header = NULL;
/*
* Translate the file name. namei() returns a vnode pointer
* in ni_vp among other things.
*
* XXXAUDIT: It would be desirable to also audit the name of the
* interpreter if this is an interpreted binary.
*/
if (args->fname != NULL) {
NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
| MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
}
SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
interpret:
if (args->fname != NULL) {
#ifdef CAPABILITY_MODE
/*
* While capability mode can't reach this point via direct
* path arguments to execve(), we also don't allow
* interpreters to be used in capability mode (for now).
* Catch indirect lookups and return a permissions error.
*/
if (IN_CAPABILITY_MODE(td)) {
error = ECAPMODE;
goto exec_fail;
}
#endif
error = namei(&nd);
if (error)
goto exec_fail;
vfslocked = NDHASGIANT(&nd);
binvp = nd.ni_vp;
imgp->vp = binvp;
} else {
AUDIT_ARG_FD(args->fd);
/*
* Some might argue that CAP_READ and/or CAP_MMAP should also
* be required here; such arguments will be entertained.
*/
error = fgetvp_read(td, args->fd, CAP_FEXECVE, &binvp);
if (error)
goto exec_fail;
vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
AUDIT_ARG_VNODE1(binvp);
imgp->vp = binvp;
}
/*
* Check file permissions (also 'opens' file)
*/
error = exec_check_permissions(imgp);
if (error)
goto exec_fail_dealloc;
imgp->object = imgp->vp->v_object;
if (imgp->object != NULL)
vm_object_reference(imgp->object);
/*
* Set VV_TEXT now so no one can write to the executable while we're
* activating it.
*
* Remember if this was set before and unset it in case this is not
* actually an executable image.
*/
textset = imgp->vp->v_vflag & VV_TEXT;
imgp->vp->v_vflag |= VV_TEXT;
error = exec_map_first_page(imgp);
if (error)
goto exec_fail_dealloc;
imgp->proc->p_osrel = 0;
/*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
* scripts differently.
*/
error = -1;
if ((img_first = imgp->proc->p_sysent->sv_imgact_try) != NULL)
error = img_first(imgp);
/*
* Loop through the list of image activators, calling each one.
* An activator returns -1 if there is no match, 0 on success,
* and an error otherwise.
*/
for (i = 0; error == -1 && execsw[i]; ++i) {
if (execsw[i]->ex_imgact == NULL ||
execsw[i]->ex_imgact == img_first) {
continue;
}
error = (*execsw[i]->ex_imgact)(imgp);
}
if (error) {
if (error == -1) {
if (textset == 0)
imgp->vp->v_vflag &= ~VV_TEXT;
error = ENOEXEC;
}
goto exec_fail_dealloc;
}
/*
* Special interpreter operation: clean up and loop back to try to
* activate the interpreter.
*/
if (imgp->interpreted) {
exec_unmap_first_page(imgp);
/*
* VV_TEXT needs to be unset for scripts. There is a short
* period before we determine that something is a script where
* VV_TEXT will be set. The vnode lock is held over this
* entire period so nothing should illegitimately be blocked.
*/
imgp->vp->v_vflag &= ~VV_TEXT;
/* free name buffer and old vnode */
if (args->fname != NULL)
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
mac_execve_interpreter_enter(binvp, &interpvplabel);
#endif
if (imgp->opened) {
VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
imgp->opened = 0;
}
vput(binvp);
vm_object_deallocate(imgp->object);
imgp->object = NULL;
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = 0;
/* set new name to that of the interpreter */
NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
UIO_SYSSPACE, imgp->interpreter_name, td);
args->fname = imgp->interpreter_name;
goto interpret;
}
/*
* NB: We unlock the vnode here because it is believed that none
* of the sv_copyout_strings/sv_fixup operations require the vnode.
*/
VOP_UNLOCK(imgp->vp, 0);
/*
* Do our best to calculate the full path to the image file.
*/
if (imgp->auxargs != NULL &&
((args->fname != NULL && args->fname[0] == '/') ||
vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
imgp->execpath = args->fname;
/*
* Copy out strings (args and env) and initialize stack base
*/
if (p->p_sysent->sv_copyout_strings)
stack_base = (*p->p_sysent->sv_copyout_strings)(imgp);
else
stack_base = exec_copyout_strings(imgp);
/*
* If a custom stack fixup routine is present for this process,
* let it do the stack setup.
* Otherwise, store the argument count as the first item on the stack.
*/
if (p->p_sysent->sv_fixup != NULL)
(*p->p_sysent->sv_fixup)(&stack_base, imgp);
else
suword(--stack_base, imgp->args->argc);
/*
* For security and other reasons, the file descriptor table cannot
* be shared after an exec.
*/
fdunshare(p, td);
/*
* Malloc things before we need locks.
*/
newcred = crget();
euip = uifind(attr.va_uid);
i = imgp->args->begin_envv - imgp->args->begin_argv;
/* Cache arguments if they fit inside our allowance */
if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
newargs = pargs_alloc(i);
bcopy(imgp->args->begin_argv, newargs->ar_args, i);
}
/* close files on exec */
fdcloseexec(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
/* Get a reference to the vnode prior to locking the proc */
VREF(binvp);
/*
* For security and other reasons, signal handlers cannot
* be shared after an exec. The new process gets a copy of the old
* handlers. In execsigs(), the new process will have its signals
* reset.
*/
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
if (sigacts_shared(p->p_sigacts)) {
oldsigacts = p->p_sigacts;
PROC_UNLOCK(p);
newsigacts = sigacts_alloc();
sigacts_copy(newsigacts, oldsigacts);
PROC_LOCK(p);
p->p_sigacts = newsigacts;
} else
oldsigacts = NULL;
/* Stop profiling */
stopprofclock(p);
/* reset caught signals */
execsigs(p);
/* name this process - nameiexec(p, ndp) */
bzero(p->p_comm, sizeof(p->p_comm));
if (args->fname)
bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
/*
* mark as execed, wakeup the process that vforked (if any) and tell
* it that it now has its own resources back
*/
p->p_flag |= P_EXEC;
if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
p->p_flag &= ~P_PPWAIT;
cv_broadcast(&p->p_pwait);
}
/*
* Implement image setuid/setgid.
*
* Don't honor setuid/setgid if the filesystem prohibits it or if
* the process is being traced.
*
* We disable setuid/setgid/etc in compatibility mode on the basis
* that most setugid applications are not written with that
* environment in mind, and will therefore almost certainly operate
* incorrectly. In principle there's no reason that setugid
* applications might not be useful in capability mode, so we may want
* to reconsider this conservative design choice in the future.
*
* XXXMAC: For the time being, use NOSUID to also prohibit
* transitions on the file system.
*/
credential_changing = 0;
credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
attr.va_uid;
credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
attr.va_gid;
#ifdef MAC
will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
interpvplabel, imgp);
credential_changing |= will_transition;
#endif
if (credential_changing &&
#ifdef CAPABILITY_MODE
((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
#endif
(imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
(p->p_flag & P_TRACED) == 0) {
/*
* Turn off syscall tracing for set-id programs, except for
* root. Record any set-id flags first to make sure that
* we do not regain any tracing during a possible block.
*/
setsugid(p);
#ifdef KTRACE
if (priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0))
ktrprocexec(p, &tracecred, &tracevp);
#endif
/*
* Close any file descriptors 0..2 that reference procfs,
* then make sure file descriptors 0..2 are in use.
*
* setugidsafety() may call closef() and then pfind()
* which may grab the process lock.
* fdcheckstd() may call falloc() which may block to
* allocate memory, so temporarily drop the process lock.
*/
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
setugidsafety(td);
error = fdcheckstd(td);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
if (error != 0)
goto done1;
PROC_LOCK(p);
/*
* Set the new credentials.
*/
if (attr.va_mode & S_ISUID)
change_euid(newcred, euip);
if (attr.va_mode & S_ISGID)
change_egid(newcred, attr.va_gid);
#ifdef MAC
if (will_transition) {
mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
interpvplabel, imgp);
}
#endif
/*
* Implement correct POSIX saved-id behavior.
*
* XXXMAC: Note that the current logic will save the
* uid and gid if a MAC domain transition occurs, even
* though maybe it shouldn't.
*/
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
} else {
if (oldcred->cr_uid == oldcred->cr_ruid &&
oldcred->cr_gid == oldcred->cr_rgid)
p->p_flag &= ~P_SUGID;
/*
* Implement correct POSIX saved-id behavior.
*
* XXX: It's not clear that the existing behavior is
* POSIX-compliant. A number of sources indicate that the
* saved uid/gid should only be updated if the new ruid is
* not equal to the old ruid, or the new euid is not equal
* to the old euid and the new euid is not equal to the old
* ruid. The FreeBSD code always updates the saved uid/gid.
* Also, this code uses the new (replaced) euid and egid as
* the source, which may or may not be the right ones to use.
*/
if (oldcred->cr_svuid != oldcred->cr_uid ||
oldcred->cr_svgid != oldcred->cr_gid) {
change_svuid(newcred, newcred->cr_uid);
change_svgid(newcred, newcred->cr_gid);
p->p_ucred = newcred;
newcred = NULL;
}
}
/*
* Store the vp for use in procfs. This vnode was referenced prior
* to locking the proc lock.
*/
textvp = p->p_textvp;
p->p_textvp = binvp;
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the exec if it
* has declared an interest.
*/
if (dtrace_fasttrap_exec)
dtrace_fasttrap_exec(p);
#endif
/*
* Notify others that we exec'd, and clear the P_INEXEC flag
* as we're now a bona fide freshly-execed process.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXEC);
p->p_flag &= ~P_INEXEC;
/*
* If tracing the process, trap to the debugger so that
* breakpoints can be set before the program executes. We
* have to use tdsignal() to deliver the signal to the current
* thread since any other threads in this process will exit if
* execve() succeeds.
*/
if (p->p_flag & P_TRACED)
tdsignal(td, SIGTRAP);
/* clear "fork but no exec" flag, as we _are_ execing */
p->p_acflag &= ~AFORK;
/*
* Free any previous argument cache and replace it with
* the new argument cache, if any.
*/
oldargs = p->p_args;
p->p_args = newargs;
newargs = NULL;
#ifdef HWPMC_HOOKS
/*
* Check if system-wide sampling is in effect or if the
* current process is using PMCs. If so, do exec() time
* processing. This processing needs to happen AFTER the
* P_INEXEC flag is cleared.
*
* The proc lock needs to be released before taking the PMC
* SX.
*/
if (PMC_SYSTEM_SAMPLING_ACTIVE() || PMC_PROC_IS_USING_PMCS(p)) {
PROC_UNLOCK(p);
VOP_UNLOCK(imgp->vp, 0);
pe.pm_credentialschanged = credential_changing;
pe.pm_entryaddr = imgp->entry_addr;
PMC_CALL_HOOK_X(td, PMC_FN_PROCESS_EXEC, (void *) &pe);
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
} else
PROC_UNLOCK(p);
#else /* !HWPMC_HOOKS */
PROC_UNLOCK(p);
#endif
/* Set values passed into the program in registers. */
if (p->p_sysent->sv_setregs)
(*p->p_sysent->sv_setregs)(td, imgp,
(u_long)(uintptr_t)stack_base);
else
exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
vfs_mark_atime(imgp->vp, td->td_ucred);
SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
done1:
/*
* Free any resources malloc'd earlier that we didn't use.
*/
uifree(euip);
if (newcred == NULL)
crfree(oldcred);
else
crfree(newcred);
VOP_UNLOCK(imgp->vp, 0);
/*
* Handle deferred decrement of ref counts.
*/
if (textvp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
vrele(textvp);
VFS_UNLOCK_GIANT(tvfslocked);
}
if (binvp && error != 0)
vrele(binvp);
#ifdef KTRACE
if (tracevp != NULL) {
int tvfslocked;
tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(tvfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
#endif
vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
pargs_drop(oldargs);
pargs_drop(newargs);
if (oldsigacts != NULL)
sigacts_free(oldsigacts);
exec_fail_dealloc:
/*
* free various allocated resources
*/
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
if (imgp->vp != NULL) {
if (args->fname)
NDFREE(&nd, NDF_ONLY_PNBUF);
if (imgp->opened)
VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
vput(imgp->vp);
}
if (imgp->object != NULL)
vm_object_deallocate(imgp->object);
free(imgp->freepath, M_TEMP);
if (error == 0) {
PROC_LOCK(p);
td->td_dbgflags |= TDB_EXEC;
PROC_UNLOCK(p);
/*
* Stop the process here if its stop event mask has
* the S_EXEC bit set.
*/
STOPEVENT(p, S_EXEC, 0);
goto done2;
}
exec_fail:
/* we're done here, clear P_INEXEC */
PROC_LOCK(p);
p->p_flag &= ~P_INEXEC;
PROC_UNLOCK(p);
SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
done2:
#ifdef MAC
mac_execve_exit(imgp);
mac_execve_interpreter_exit(interpvplabel);
#endif
VFS_UNLOCK_GIANT(vfslocked);
exec_free_args(args);
if (error && imgp->vmspace_destroyed) {
/* sorry, no more process anymore. exit gracefully */
exit1(td, W_EXITCODE(0, SIGABRT));
/* NOT REACHED */
}
#ifdef KTRACE
if (error == 0)
ktrprocctor(p);
#endif
return (error);
}
int
exec_map_first_page(imgp)
struct image_params *imgp;
{
int rv, i;
int initial_pagein;
vm_page_t ma[VM_INITIAL_PAGEIN];
vm_object_t object;
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
object = imgp->vp->v_object;
if (object == NULL)
return (EACCES);
VM_OBJECT_LOCK(object);
#if VM_NRESERVLEVEL > 0
if ((object->flags & OBJ_COLORED) == 0) {
object->flags |= OBJ_COLORED;
object->pg_color = 0;
}
#endif
ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
if (ma[0]->valid != VM_PAGE_BITS_ALL) {
initial_pagein = VM_INITIAL_PAGEIN;
if (initial_pagein > object->size)
initial_pagein = object->size;
for (i = 1; i < initial_pagein; i++) {
if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
if (ma[i]->valid)
break;
if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
break;
vm_page_busy(ma[i]);
} else {
ma[i] = vm_page_alloc(object, i,
VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
if (ma[i] == NULL)
break;
}
}
initial_pagein = i;
rv = vm_pager_get_pages(object, ma, initial_pagein, 0);
ma[0] = vm_page_lookup(object, 0);
if ((rv != VM_PAGER_OK) || (ma[0] == NULL)) {
if (ma[0] != NULL) {
vm_page_lock(ma[0]);
vm_page_free(ma[0]);
vm_page_unlock(ma[0]);
}
VM_OBJECT_UNLOCK(object);
return (EIO);
}
}
vm_page_lock(ma[0]);
vm_page_hold(ma[0]);
vm_page_unlock(ma[0]);
vm_page_wakeup(ma[0]);
VM_OBJECT_UNLOCK(object);
imgp->firstpage = sf_buf_alloc(ma[0], 0);
imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
return (0);
}
void
exec_unmap_first_page(imgp)
struct image_params *imgp;
{
vm_page_t m;
if (imgp->firstpage != NULL) {
m = sf_buf_page(imgp->firstpage);
sf_buf_free(imgp->firstpage);
imgp->firstpage = NULL;
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
}
}
/*
* Destroy the old address space, and allocate a new stack.
* The new stack is only SGROWSIZ large because it is grown
* automatically in trap.c.
*/
int
exec_new_vmspace(imgp, sv)
struct image_params *imgp;
struct sysentvec *sv;
{
int error;
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
vm_object_t obj;
vm_offset_t sv_minuser, stack_addr;
vm_map_t map;
u_long ssiz;
imgp->vmspace_destroyed = 1;
imgp->sysent = sv;
/* May be called with Giant held */
EVENTHANDLER_INVOKE(process_exec, p, imgp);
/*
* Blow away the entire process VM if the address space is not
* shared; otherwise, create a new VM space so that other threads
* are not disrupted.
*/
map = &vmspace->vm_map;
if (map_at_zero)
sv_minuser = sv->sv_minuser;
else
sv_minuser = MAX(sv->sv_minuser, PAGE_SIZE);
if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv_minuser &&
vm_map_max(map) == sv->sv_maxuser) {
shmexit(vmspace);
pmap_remove_pages(vmspace_pmap(vmspace));
vm_map_remove(map, vm_map_min(map), vm_map_max(map));
} else {
error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
if (error)
return (error);
vmspace = p->p_vmspace;
map = &vmspace->vm_map;
}
/* Map a shared page */
obj = sv->sv_shared_page_obj;
if (obj != NULL) {
vm_object_reference(obj);
error = vm_map_fixed(map, obj, 0,
sv->sv_shared_page_base, sv->sv_shared_page_len,
VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
MAP_COPY_ON_WRITE | MAP_ACC_NO_CHARGE);
if (error) {
vm_object_deallocate(obj);
return (error);
}
}
/* Allocate a new stack */
if (sv->sv_maxssiz != NULL)
ssiz = *sv->sv_maxssiz;
else
ssiz = maxssiz;
stack_addr = sv->sv_usrstack - ssiz;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
sv->sv_stackprot,
VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
if (error)
return (error);
#ifdef __ia64__
/* Allocate a new register stack */
stack_addr = IA64_BACKINGSTORE;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
if (error)
return (error);
#endif
/* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the
* VM_STACK case, but they are still used to monitor the size of the
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
return (0);
}
/*
* Copy out argument and environment strings from the old process address
* space into the temporary string buffer.
*/
int
exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, char **argv, char **envv)
{
char *argp, *envp;
int error;
size_t length;
bzero(args, sizeof(*args));
if (argv == NULL)
return (EFAULT);
/*
* Allocate demand-paged memory for the file name, argument, and
* environment strings.
*/
error = exec_alloc_args(args);
if (error != 0)
return (error);
/*
* Copy the file name.
*/
if (fname != NULL) {
args->fname = args->buf;
error = (segflg == UIO_SYSSPACE) ?
copystr(fname, args->fname, PATH_MAX, &length) :
copyinstr(fname, args->fname, PATH_MAX, &length);
if (error != 0)
goto err_exit;
} else
length = 0;
args->begin_argv = args->buf + length;
args->endp = args->begin_argv;
args->stringspace = ARG_MAX;
/*
* extract arguments first
*/
while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
if (argp == (caddr_t) -1) {
error = EFAULT;
goto err_exit;
}
if ((error = copyinstr(argp, args->endp,
args->stringspace, &length))) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->argc++;
}
args->begin_envv = args->endp;
/*
* extract environment strings
*/
if (envv) {
while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
if (envp == (caddr_t)-1) {
error = EFAULT;
goto err_exit;
}
if ((error = copyinstr(envp, args->endp,
args->stringspace, &length))) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
}
args->stringspace -= length;
args->endp += length;
args->envc++;
}
}
return (0);
err_exit:
exec_free_args(args);
return (error);
}
/*
* Allocate temporary demand-paged, zero-filled memory for the file name,
* argument, and environment strings. Returns zero if the allocation succeeds
* and ENOMEM otherwise.
*/
int
exec_alloc_args(struct image_args *args)
{
args->buf = (char *)kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
return (args->buf != NULL ? 0 : ENOMEM);
}
void
exec_free_args(struct image_args *args)
{
if (args->buf != NULL) {
kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
PATH_MAX + ARG_MAX);
args->buf = NULL;
}
if (args->fname_buf != NULL) {
free(args->fname_buf, M_TEMP);
args->fname_buf = NULL;
}
}
/*
* Copy strings out to the new process address space, constructing new arg
* and env vector tables. Return a pointer to the base so that it can be used
* as the initial stack pointer.
*/
register_t *
exec_copyout_strings(imgp)
struct image_params *imgp;
{
int argc, envc;
char **vectp;
char *stringp, *destp;
register_t *stack_base;
struct ps_strings *arginfo;
struct proc *p;
size_t execpath_len;
int szsigcode, szps;
char canary[sizeof(long) * 8];
szps = sizeof(pagesizes[0]) * MAXPAGESIZES;
/*
* Calculate string base and vector table pointers.
* Also deal with signal trampoline code for this exec type.
*/
if (imgp->execpath != NULL && imgp->auxargs != NULL)
execpath_len = strlen(imgp->execpath) + 1;
else
execpath_len = 0;
p = imgp->proc;
szsigcode = 0;
arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
if (p->p_sysent->sv_sigcode_base == 0) {
if (p->p_sysent->sv_szsigcode != NULL)
szsigcode = *(p->p_sysent->sv_szsigcode);
}
destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
roundup(execpath_len, sizeof(char *)) -
roundup(sizeof(canary), sizeof(char *)) -
roundup(szps, sizeof(char *)) -
roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
/*
* install sigcode
*/
if (szsigcode != 0)
copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
szsigcode), szsigcode);
/*
* Copy the image path for the rtld.
*/
if (execpath_len != 0) {
imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
copyout(imgp->execpath, (void *)imgp->execpathp,
execpath_len);
}
/*
* Prepare the canary for SSP.
*/
arc4rand(canary, sizeof(canary), 0);
imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
sizeof(canary);
copyout(canary, (void *)imgp->canary, sizeof(canary));
imgp->canarylen = sizeof(canary);
/*
* Prepare the pagesizes array.
*/
imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
roundup(sizeof(canary), sizeof(char *)) - szps;
copyout(pagesizes, (void *)imgp->pagesizes, szps);
imgp->pagesizeslen = szps;
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
*/
if (imgp->auxargs) {
/*
* 'AT_COUNT*2' is the size of the ELF Auxargs data. This is kept
* for backward compatibility.
*/
imgp->auxarg_size = (imgp->auxarg_size) ? imgp->auxarg_size :
(AT_COUNT * 2);
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets, and imgp->auxarg_size is room
* for the arguments of the runtime loader.
*/
vectp = (char **)(destp - (imgp->args->argc +
imgp->args->envc + 2 + imgp->auxarg_size)
* sizeof(char *));
} else {
/*
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
sizeof(char *));
}
/*
* vectp also becomes our initial stack base
*/
stack_base = (register_t *)vectp;
stringp = imgp->args->begin_argv;
argc = imgp->args->argc;
envc = imgp->args->envc;
/*
* Copy out strings - arguments and environment.
*/
copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
*/
suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp);
suword32(&arginfo->ps_nargvstr, argc);
/*
* Fill in argument portion of vector table.
*/
for (; argc > 0; --argc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* a null vector table pointer separates the argp's from the envp's */
suword(vectp++, 0);
suword(&arginfo->ps_envstr, (long)(intptr_t)vectp);
suword32(&arginfo->ps_nenvstr, envc);
/*
* Fill in environment portion of vector table.
*/
for (; envc > 0; --envc) {
suword(vectp++, (long)(intptr_t)destp);
while (*stringp++ != 0)
destp++;
destp++;
}
/* end of vector table is a null pointer */
suword(vectp, 0);
return (stack_base);
}
/*
* Check permissions of file to execute.
* Called with imgp->vp locked.
* Return 0 for success or error code on failure.
*/
int
exec_check_permissions(imgp)
struct image_params *imgp;
{
struct vnode *vp = imgp->vp;
struct vattr *attr = imgp->attr;
struct thread *td;
int error;
td = curthread;
/* Get file attributes */
error = VOP_GETATTR(vp, attr, td->td_ucred);
if (error)
return (error);
#ifdef MAC
error = mac_vnode_check_exec(td->td_ucred, imgp->vp, imgp);
if (error)
return (error);
#endif
/*
* 1) Check if file execution is disabled for the filesystem that
* this file resides on.
* 2) Ensure that at least one execute bit is on. Otherwise, a
* privileged user will always succeed, and we don't want this
* to happen unless the file really is executable.
* 3) Ensure that the file is a regular file.
*/
if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
(attr->va_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0 ||
(attr->va_type != VREG))
return (EACCES);
/*
* Zero length files can't be exec'd
*/
if (attr->va_size == 0)
return (ENOEXEC);
/*
* Check for execute permission to file based on current credentials.
*/
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
if (error)
return (error);
/*
* Check number of open-for-writes on the file and deny execution
* if there are any.
*/
if (vp->v_writecount)
return (ETXTBSY);
/*
* Call filesystem specific open routine (which does nothing in the
* general case).
*/
error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
if (error == 0)
imgp->opened = 1;
return (error);
}
/*
* Exec handler registration
*/
int
exec_register(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 2; /* New slot and trailing NULL */
if (execsw)
for (es = execsw; *es; es++)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
if (execsw)
for (es = execsw; *es; es++)
*xs++ = *es;
*xs++ = execsw_arg;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
int
exec_unregister(execsw_arg)
const struct execsw *execsw_arg;
{
const struct execsw **es, **xs, **newexecsw;
int count = 1;
if (execsw == NULL)
panic("unregister with no handlers left?\n");
for (es = execsw; *es; es++) {
if (*es == execsw_arg)
break;
}
if (*es == NULL)
return (ENOENT);
for (es = execsw; *es; es++)
if (*es != execsw_arg)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
if (newexecsw == NULL)
return (ENOMEM);
xs = newexecsw;
for (es = execsw; *es; es++)
if (*es != execsw_arg)
*xs++ = *es;
*xs = NULL;
if (execsw)
free(execsw, M_TEMP);
execsw = newexecsw;
return (0);
}
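
exec_register() and exec_unregister() maintain the NULL-terminated execsw vector that do_execve() walks. Image activators normally do not call them directly; they use the EXEC_SET() helper from <sys/imgact.h>. A hedged sketch with a made-up activator name (struct layout assumed from <sys/imgact.h>; the logic is illustrative only):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/imgact.h>

/*
 * Example activator: recognize nothing, so the next execsw entry gets a
 * chance.  A real activator inspects imgp->image_header and returns 0 on
 * success or an errno value on a hard failure.
 */
static int
exec_example_imgact(struct image_params *imgp)
{

    return (-1);
}

static struct execsw example_execsw = {
    .ex_imgact = exec_example_imgact,
    .ex_name = "example"
};
EXEC_SET(example, example_execsw);
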
static vm_object_t shared_page_obj;
static int shared_page_free;
int
shared_page_fill(int size, int align, const char *data)
{
vm_page_t m;
struct sf_buf *s;
vm_offset_t sk;
int res;
VM_OBJECT_LOCK(shared_page_obj);
m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY);
res = roundup(shared_page_free, align);
if (res + size >= IDX_TO_OFF(shared_page_obj->size))
res = -1;
else {
VM_OBJECT_UNLOCK(shared_page_obj);
s = sf_buf_alloc(m, SFB_DEFAULT);
sk = sf_buf_kva(s);
bcopy(data, (void *)(sk + res), size);
shared_page_free = res + size;
sf_buf_free(s);
VM_OBJECT_LOCK(shared_page_obj);
}
vm_page_wakeup(m);
VM_OBJECT_UNLOCK(shared_page_obj);
return (res);
}
static void
shared_page_init(void *dummy __unused)
{
vm_page_t m;
shared_page_obj = vm_pager_allocate(OBJT_PHYS, 0, PAGE_SIZE,
VM_PROT_DEFAULT, 0, NULL);
VM_OBJECT_LOCK(shared_page_obj);
m = vm_page_grab(shared_page_obj, 0, VM_ALLOC_RETRY | VM_ALLOC_NOBUSY |
VM_ALLOC_ZERO);
m->valid = VM_PAGE_BITS_ALL;
VM_OBJECT_UNLOCK(shared_page_obj);
}
SYSINIT(shp, SI_SUB_EXEC, SI_ORDER_FIRST, (sysinit_cfunc_t)shared_page_init,
NULL);
void
exec_sysvec_init(void *param)
{
struct sysentvec *sv;
sv = (struct sysentvec *)param;
if ((sv->sv_flags & SV_SHP) == 0)
return;
sv->sv_shared_page_obj = shared_page_obj;
sv->sv_sigcode_base = sv->sv_shared_page_base +
shared_page_fill(*(sv->sv_szsigcode), 16, sv->sv_sigcode);
}
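
A hedged note on usage (not part of this diff): an ABI opts into the shared page by setting SV_SHP in its sysentvec's sv_flags and arranging for exec_sysvec_init() to run once the page object exists, e.g. via a SYSINIT ordered after shared_page_init() above. The sysentvec name below is hypothetical:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sysent.h>

/* Defined by the ABI module, with SV_SHP set in sv_flags (hypothetical). */
extern struct sysentvec example_freebsd_sysvec;

SYSINIT(example_shp, SI_SUB_EXEC, SI_ORDER_ANY,
    (sysinit_cfunc_t)exec_sysvec_init, &example_freebsd_sysvec);
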
Index: head/sys/kern/kern_exit.c
===================================================================
--- head/sys/kern/kern_exit.c (revision 225616)
+++ head/sys/kern/kern_exit.c (revision 225617)
@@ -1,956 +1,956 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/pioctl.h>
#include <sys/jail.h>
#include <sys/tty.h>
#include <sys/wait.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sbuf.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/syslog.h>
#include <sys/ptrace.h>
#include <sys/acct.h> /* for acct_process() function prototype */
#include <sys/filedesc.h>
#include <sys/sdt.h>
#include <sys/shm.h>
#include <sys/sem.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_execexit_func_t dtrace_fasttrap_exit;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , exit, exit);
SDT_PROBE_ARGTYPE(proc, kernel, , exit, 0, "int");
/* Hook for NFS teardown procedure. */
void (*nlminfo_release_p)(struct proc *p);
/*
* exit -- death of process.
*/
void
-sys_exit(struct thread *td, struct sys_exit_args *uap)
+sys_sys_exit(struct thread *td, struct sys_exit_args *uap)
{
exit1(td, W_EXITCODE(uap->rval, 0));
/* NOTREACHED */
}
/*
* Exit: deallocate address space and other resources, change proc state to
* zombie, and unlink proc from allproc and parent's lists. Save exit status
* and rusage for wait(). Check for child processes and orphan them.
*/
void
exit1(struct thread *td, int rv)
{
struct proc *p, *nq, *q;
struct vnode *vtmp;
struct vnode *ttyvp = NULL;
struct plimit *plim;
int locked;
mtx_assert(&Giant, MA_NOTOWNED);
p = td->td_proc;
/*
* XXX in case we're rebooting we just let init die in order to
* work around an unsolved stack overflow seen very late during
* shutdown on sparc64 when the gmirror worker process exits.
*/
if (p == initproc && rebooting == 0) {
printf("init died (signal %d, exit %d)\n",
WTERMSIG(rv), WEXITSTATUS(rv));
panic("Going nowhere without my init!");
}
/*
* MUST abort all other threads before proceeding past here.
*/
PROC_LOCK(p);
while (p->p_flag & P_HADTHREADS) {
/*
* First check if some other thread got here before us;
* if so, act appropriately (exit or suspend).
*/
thread_suspend_check(0);
/*
* Kill off the other threads. This requires
* some co-operation from other parts of the kernel
* so it may not be instantaneous. With this state set
* any thread entering the kernel from userspace will
* thread_exit() in trap(). Any thread attempting to
* sleep will return immediately with EINTR or EWOULDBLOCK
* which will hopefully force them to back out to userland
* freeing resources as they go. Any thread attempting
* to return to userland will thread_exit() from userret().
* thread_exit() will unsuspend us when the last of the
* other threads exits.
* If there is already a thread singler after resumption,
* calling thread_single() will fail; in that case, we just
* re-check all suspension requests: the thread should
* either be suspended there or exit.
*/
if (! thread_single(SINGLE_EXIT))
break;
/*
* All other activity in this process is now stopped.
* Threading support has been turned off.
*/
}
KASSERT(p->p_numthreads == 1,
("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
racct_sub(p, RACCT_NTHR, 1);
/*
* Wakeup anyone in procfs' PIOCWAIT. They should have a hold
* on our vmspace, so we should block below until they have
* released their reference to us. Note that if they have
* requested S_EXIT stops we will block here until they ack
* via PIOCCONT.
*/
_STOPEVENT(p, S_EXIT, rv);
/*
* Note that we are exiting and do another wakeup of anyone in
* PIOCWAIT in case they aren't listening for S_EXIT stops or
* decided to wait again after we told them we are exiting.
*/
p->p_flag |= P_WEXIT;
wakeup(&p->p_stype);
/*
* Wait for any processes that have a hold on our vmspace to
* release their reference.
*/
while (p->p_lock > 0)
msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
p->p_xstat = rv; /* Let event handler change exit status */
PROC_UNLOCK(p);
/* Drain the limit callout while we don't have the proc locked */
callout_drain(&p->p_limco);
#ifdef AUDIT
/*
* The Sun BSM exit token contains two components: an exit status as
* passed to exit(), and a return value to indicate what sort of exit
* it was. The exit status is WEXITSTATUS(rv), but it's not clear
* what the return value is.
*/
AUDIT_ARG_EXIT(WEXITSTATUS(rv), 0);
AUDIT_SYSCALL_EXIT(0, td);
#endif
/* Are we a task leader? */
if (p == p->p_leader) {
mtx_lock(&ppeers_lock);
q = p->p_peers;
while (q != NULL) {
PROC_LOCK(q);
- psignal(q, SIGKILL);
+ kern_psignal(q, SIGKILL);
PROC_UNLOCK(q);
q = q->p_peers;
}
while (p->p_peers != NULL)
msleep(p, &ppeers_lock, PWAIT, "exit1", 0);
mtx_unlock(&ppeers_lock);
}
/*
* Check if any loadable modules need anything done at process exit.
* E.g. SYSV IPC stuff
* XXX what if one of these generates an error?
*/
EVENTHANDLER_INVOKE(process_exit, p);
/*
* If parent is waiting for us to exit or exec,
* P_PPWAIT is set; we will wakeup the parent below.
*/
PROC_LOCK(p);
rv = p->p_xstat; /* Event handler could change exit status */
stopprofclock(p);
p->p_flag &= ~(P_TRACED | P_PPWAIT);
/*
* Stop the real interval timer. If the handler is currently
* executing, prevent it from rearming itself and let it finish.
*/
if (timevalisset(&p->p_realtimer.it_value) &&
callout_stop(&p->p_itcallout) == 0) {
timevalclear(&p->p_realtimer.it_interval);
msleep(&p->p_itcallout, &p->p_mtx, PWAIT, "ritwait", 0);
KASSERT(!timevalisset(&p->p_realtimer.it_value),
("realtime timer is still armed"));
}
PROC_UNLOCK(p);
/*
* Reset any sigio structures pointing to us as a result of
* F_SETOWN with our pid.
*/
funsetownlst(&p->p_sigiolst);
/*
* If this process has an nlminfo data area (for lockd), release it
*/
if (nlminfo_release_p != NULL && p->p_nlminfo != NULL)
(*nlminfo_release_p)(p);
/*
* Close open files and release open-file table.
* This may block!
*/
fdfree(td);
/*
* If this thread tickled GEOM, we need to wait for the giggling to
* stop before we return to userland
*/
if (td->td_pflags & TDP_GEOM)
g_waitidle();
/*
* Remove ourself from our leader's peer list and wake our leader.
*/
mtx_lock(&ppeers_lock);
if (p->p_leader->p_peers) {
q = p->p_leader;
while (q->p_peers != p)
q = q->p_peers;
q->p_peers = p->p_peers;
wakeup(p->p_leader);
}
mtx_unlock(&ppeers_lock);
vmspace_exit(td);
sx_xlock(&proctree_lock);
if (SESS_LEADER(p)) {
struct session *sp = p->p_session;
struct tty *tp;
/*
* s_ttyp is not zero'd; we use this to indicate that
* the session once had a controlling terminal. (for
* logging and informational purposes)
*/
SESS_LOCK(sp);
ttyvp = sp->s_ttyvp;
tp = sp->s_ttyp;
sp->s_ttyvp = NULL;
sp->s_ttydp = NULL;
sp->s_leader = NULL;
SESS_UNLOCK(sp);
/*
* Signal foreground pgrp and revoke access to
* controlling terminal if it has not been revoked
* already.
*
* Because the TTY may have been revoked in the mean
* time and could already have a new session associated
* with it, make sure we don't send a SIGHUP to a
* foreground process group that does not belong to this
* session.
*/
if (tp != NULL) {
tty_lock(tp);
if (tp->t_session == sp)
tty_signal_pgrp(tp, SIGHUP);
tty_unlock(tp);
}
if (ttyvp != NULL) {
sx_xunlock(&proctree_lock);
if (vn_lock(ttyvp, LK_EXCLUSIVE) == 0) {
VOP_REVOKE(ttyvp, REVOKEALL);
VOP_UNLOCK(ttyvp, 0);
}
sx_xlock(&proctree_lock);
}
}
fixjobc(p, p->p_pgrp, 0);
sx_xunlock(&proctree_lock);
(void)acct_process(td);
/* Release the TTY now we've unlocked everything. */
if (ttyvp != NULL)
vrele(ttyvp);
#ifdef KTRACE
ktrprocexit(td);
#endif
/*
* Release reference to text vnode
*/
if ((vtmp = p->p_textvp) != NULL) {
p->p_textvp = NULL;
locked = VFS_LOCK_GIANT(vtmp->v_mount);
vrele(vtmp);
VFS_UNLOCK_GIANT(locked);
}
/*
* Release our limits structure.
*/
PROC_LOCK(p);
plim = p->p_limit;
p->p_limit = NULL;
PROC_UNLOCK(p);
lim_free(plim);
tidhash_remove(td);
/*
* Remove proc from allproc queue and pidhash chain.
* Place onto zombproc. Unlink from parent's child list.
*/
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list);
LIST_INSERT_HEAD(&zombproc, p, p_list);
LIST_REMOVE(p, p_hash);
sx_xunlock(&allproc_lock);
/*
* Call machine-dependent code to release any
* machine-dependent resources other than the address space.
* The address space is released by "vmspace_exitfree(p)" in
* vm_waitproc().
*/
cpu_exit(td);
WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
/*
* Reparent all of our children to init.
*/
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
if (q != NULL) /* only need this if any child is S_ZOMB */
wakeup(initproc);
for (; q != NULL; q = nq) {
nq = LIST_NEXT(q, p_sibling);
PROC_LOCK(q);
proc_reparent(q, initproc);
q->p_sigparent = SIGCHLD;
/*
* Traced processes are killed
* since their existence means someone is screwing up.
*/
if (q->p_flag & P_TRACED) {
struct thread *temp;
q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
FOREACH_THREAD_IN_PROC(q, temp)
temp->td_dbgflags &= ~TDB_SUSPEND;
- psignal(q, SIGKILL);
+ kern_psignal(q, SIGKILL);
}
PROC_UNLOCK(q);
}
/* Save exit status. */
PROC_LOCK(p);
p->p_xthread = td;
/* Tell the prison that we are gone. */
prison_proc_free(p->p_ucred->cr_prison);
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the exit if it
* has declared an interest.
*/
if (dtrace_fasttrap_exit)
dtrace_fasttrap_exit(p);
#endif
/*
* Notify interested parties of our demise.
*/
KNOTE_LOCKED(&p->p_klist, NOTE_EXIT);
#ifdef KDTRACE_HOOKS
int reason = CLD_EXITED;
if (WCOREDUMP(rv))
reason = CLD_DUMPED;
else if (WIFSIGNALED(rv))
reason = CLD_KILLED;
SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0);
#endif
/*
* Just delete all entries in the p_klist. At this point we won't
* report any more events, and there are nasty race conditions that
* can beat us if we don't.
*/
knlist_clear(&p->p_klist, 1);
/*
* If this is a process with a descriptor, we may not need to deliver
* a signal to the parent. proctree_lock is held over
* procdesc_exit() to serialize concurrent calls to close() and
* exit().
*/
#ifdef PROCDESC
if (p->p_procdesc == NULL || procdesc_exit(p)) {
#endif
/*
* Notify parent that we're gone. If parent has the
* PS_NOCLDWAIT flag set, or if the handler is set to SIG_IGN,
* notify process 1 instead (and hope it will handle this
* situation).
*/
PROC_LOCK(p->p_pptr);
mtx_lock(&p->p_pptr->p_sigacts->ps_mtx);
if (p->p_pptr->p_sigacts->ps_flag &
(PS_NOCLDWAIT | PS_CLDSIGIGN)) {
struct proc *pp;
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
pp = p->p_pptr;
PROC_UNLOCK(pp);
proc_reparent(p, initproc);
p->p_sigparent = SIGCHLD;
PROC_LOCK(p->p_pptr);
/*
* Notify parent, so in case he was wait(2)ing or
* executing waitpid(2) with our pid, he will
* continue.
*/
wakeup(pp);
} else
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
if (p->p_pptr == initproc)
- psignal(p->p_pptr, SIGCHLD);
+ kern_psignal(p->p_pptr, SIGCHLD);
else if (p->p_sigparent != 0) {
if (p->p_sigparent == SIGCHLD)
childproc_exited(p);
else /* LINUX thread */
- psignal(p->p_pptr, p->p_sigparent);
+ kern_psignal(p->p_pptr, p->p_sigparent);
}
#ifdef PROCDESC
} else
PROC_LOCK(p->p_pptr);
#endif
sx_xunlock(&proctree_lock);
/*
* The state PRS_ZOMBIE prevents other processes from sending
* signals to the process. To avoid a memory leak, we free the
* memory for the signal queue at the time the state is set.
*/
sigqueue_flush(&p->p_sigqueue);
sigqueue_flush(&td->td_sigqueue);
/*
* We have to wait until after acquiring all locks before
* changing p_state. We need to avoid all possible context
* switches (including ones from blocking on a mutex) while
* marked as a zombie. We also have to set the zombie state
* before we release the parent process' proc lock to avoid
* a lost wakeup. So, we first call wakeup, then we grab the
* sched lock, update the state, and release the parent process'
* proc lock.
*/
wakeup(p->p_pptr);
cv_broadcast(&p->p_pwait);
sched_exit(p->p_pptr, td);
PROC_SLOCK(p);
p->p_state = PRS_ZOMBIE;
PROC_UNLOCK(p->p_pptr);
/*
* Hopefully no one will try to deliver a signal to the process this
* late in the game.
*/
knlist_destroy(&p->p_klist);
/*
* Save our children's rusage information in our exit rusage.
*/
ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
/*
* Make sure the scheduler takes this thread out of its tables etc.
* This will also release this thread's reference to the ucred.
* Other thread parts to release include pcb bits and such.
*/
thread_exit();
}
#ifndef _SYS_SYSPROTO_H_
struct abort2_args {
char *why;
int nargs;
void **args;
};
#endif
int
-abort2(struct thread *td, struct abort2_args *uap)
+sys_abort2(struct thread *td, struct abort2_args *uap)
{
struct proc *p = td->td_proc;
struct sbuf *sb;
void *uargs[16];
int error, i, sig;
/*
* Do it right now so we can log either a proper call of abort2() or
* note that an invalid argument was passed. 512 is big enough to
* handle 16 arguments' descriptions with additional comments.
*/
sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
sbuf_clear(sb);
sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
p->p_comm, p->p_pid, td->td_ucred->cr_uid);
/*
* Since we can't return from abort2(), send SIGKILL in cases where
* abort2() was called improperly.
*/
sig = SIGKILL;
/* Prevent DoS from user-space. */
if (uap->nargs < 0 || uap->nargs > 16)
goto out;
if (uap->nargs > 0) {
if (uap->args == NULL)
goto out;
error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
if (error != 0)
goto out;
}
/*
* Limit size of 'reason' string to 128. Will fit even when
* maximal number of arguments was chosen to be logged.
*/
if (uap->why != NULL) {
error = sbuf_copyin(sb, uap->why, 128);
if (error < 0)
goto out;
} else {
sbuf_printf(sb, "(null)");
}
if (uap->nargs > 0) {
sbuf_printf(sb, "(");
for (i = 0; i < uap->nargs; i++)
sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
sbuf_printf(sb, ")");
}
/*
* Final stage: arguments were proper, the string has been
* successfully copied from userspace, and copying pointers
* from user-space succeeded.
*/
sig = SIGABRT;
out:
if (sig == SIGKILL) {
sbuf_trim(sb);
sbuf_printf(sb, " (Reason text inaccessible)");
}
sbuf_cat(sb, "\n");
sbuf_finish(sb);
log(LOG_INFO, "%s", sbuf_data(sb));
sbuf_delete(sb);
exit1(td, W_EXITCODE(0, sig));
return (0);
}
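
Userland reaches this through abort2(2). A small illustrative caller (not part of the change) that logs a reason plus two pointer-sized arguments before dying with SIGABRT:

#include <stdint.h>
#include <stdlib.h>

int
main(void)
{
    void *args[2];

    args[0] = (void *)(intptr_t)42;
    args[1] = (void *)(uintptr_t)0xdeadbeef;
    /*
     * Shows up in the system log roughly as
     * "a.out(pid N uid U) aborted: invariant violated(0x2a, 0xdeadbeef)"
     * and the process terminates with SIGABRT.
     */
    abort2("invariant violated", 2, args);
    /* NOTREACHED */
    return (0);
}
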
#ifdef COMPAT_43
/*
* The dirty work is handled by kern_wait().
*/
int
owait(struct thread *td, struct owait_args *uap __unused)
{
int error, status;
error = kern_wait(td, WAIT_ANY, &status, 0, NULL);
if (error == 0)
td->td_retval[1] = status;
return (error);
}
#endif /* COMPAT_43 */
/*
* The dirty work is handled by kern_wait().
*/
int
-wait4(struct thread *td, struct wait_args *uap)
+sys_wait4(struct thread *td, struct wait_args *uap)
{
struct rusage ru, *rup;
int error, status;
if (uap->rusage != NULL)
rup = &ru;
else
rup = NULL;
error = kern_wait(td, uap->pid, &status, uap->options, rup);
if (uap->status != NULL && error == 0)
error = copyout(&status, uap->status, sizeof(status));
if (uap->rusage != NULL && error == 0)
error = copyout(&ru, uap->rusage, sizeof(struct rusage));
return (error);
}
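
kern_wait() does the heavy lifting; the wrapper above only copies status and rusage back to userspace. For illustration only, a minimal userland pairing of fork(2) with wait4(2):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct rusage ru;
    pid_t pid;
    int status;

    if ((pid = fork()) == 0)
        _exit(7);               /* child */
    if (wait4(pid, &status, 0, &ru) == pid && WIFEXITED(status))
        printf("child %d exited %d, user time %ld.%06ld s\n",
            (int)pid, WEXITSTATUS(status),
            (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
    return (0);
}
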
/*
* Reap the remains of a zombie process and optionally return status and
* rusage. Asserts and will release both the proctree_lock and the process
* lock as part of its work.
*/
void
proc_reap(struct thread *td, struct proc *p, int *status, int options,
struct rusage *rusage)
{
struct proc *q, *t;
sx_assert(&proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
KASSERT(p->p_state == PRS_ZOMBIE, ("proc_reap: !PRS_ZOMBIE"));
q = td->td_proc;
if (rusage) {
*rusage = p->p_ru;
calcru(p, &rusage->ru_utime, &rusage->ru_stime);
}
PROC_SUNLOCK(p);
td->td_retval[0] = p->p_pid;
if (status)
*status = p->p_xstat; /* convert to int */
if (options & WNOWAIT) {
/*
* Only poll, returning the status. Caller does not wish to
* release the proc struct just yet.
*/
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
return;
}
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
PROC_UNLOCK(p);
/*
* If we got the child via a ptrace 'attach', we need to give it back
* to the old parent.
*/
if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
PROC_LOCK(p);
proc_reparent(p, t);
p->p_pptr->p_dbg_child--;
p->p_oppid = 0;
PROC_UNLOCK(p);
pksignal(t, SIGCHLD, p->p_ksi);
wakeup(t);
cv_broadcast(&p->p_pwait);
PROC_UNLOCK(t);
sx_xunlock(&proctree_lock);
return;
}
/*
* Remove other references to this process to ensure we have an
* exclusive reference.
*/
sx_xlock(&allproc_lock);
LIST_REMOVE(p, p_list); /* off zombproc */
sx_xunlock(&allproc_lock);
LIST_REMOVE(p, p_sibling);
leavepgrp(p);
#ifdef PROCDESC
if (p->p_procdesc != NULL)
procdesc_reap(p);
#endif
sx_xunlock(&proctree_lock);
/*
* As a side effect of this lock, we know that all other writes to
* this proc are visible now, so no more locking is needed for p.
*/
PROC_LOCK(p);
p->p_xstat = 0; /* XXX: why? */
PROC_UNLOCK(p);
PROC_LOCK(q);
ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru, &p->p_rux);
PROC_UNLOCK(q);
/*
* Decrement the count of procs running with this uid.
*/
(void)chgproccnt(p->p_ucred->cr_ruidinfo, -1, 0);
/*
* Destroy resource accounting information associated with the process.
*/
racct_proc_exit(p);
#ifdef RACCT
PROC_LOCK(p->p_pptr);
racct_sub(p->p_pptr, RACCT_NPROC, 1);
PROC_UNLOCK(p->p_pptr);
#endif
/*
* Free credentials, arguments, and sigacts.
*/
crfree(p->p_ucred);
p->p_ucred = NULL;
pargs_drop(p->p_args);
p->p_args = NULL;
sigacts_free(p->p_sigacts);
p->p_sigacts = NULL;
/*
* Do any thread-system specific cleanups.
*/
thread_wait(p);
/*
* Give vm and machine-dependent layer a chance to free anything that
* cpu_exit couldn't release while still running in process context.
*/
vm_waitproc(p);
#ifdef MAC
mac_proc_destroy(p);
#endif
KASSERT(FIRST_THREAD_IN_PROC(p),
("proc_reap: no residual thread!"));
uma_zfree(proc_zone, p);
sx_xlock(&allproc_lock);
nprocs--;
sx_xunlock(&allproc_lock);
}
int
kern_wait(struct thread *td, pid_t pid, int *status, int options,
struct rusage *rusage)
{
struct proc *p, *q;
int error, nfound;
AUDIT_ARG_PID(pid);
AUDIT_ARG_VALUE(options);
q = td->td_proc;
if (pid == 0) {
PROC_LOCK(q);
pid = -q->p_pgid;
PROC_UNLOCK(q);
}
/* If we don't know the option, just return. */
if (options & ~(WUNTRACED|WNOHANG|WCONTINUED|WNOWAIT|WLINUXCLONE))
return (EINVAL);
loop:
if (q->p_flag & P_STATCHILD) {
PROC_LOCK(q);
q->p_flag &= ~P_STATCHILD;
PROC_UNLOCK(q);
}
nfound = 0;
sx_xlock(&proctree_lock);
LIST_FOREACH(p, &q->p_children, p_sibling) {
PROC_LOCK(p);
if (pid != WAIT_ANY &&
p->p_pid != pid && p->p_pgid != -pid) {
PROC_UNLOCK(p);
continue;
}
if (p_canwait(td, p)) {
PROC_UNLOCK(p);
continue;
}
/*
* This special case handles a kthread spawned by linux_clone
* (see linux_misc.c). The linux_wait4 and linux_waitpid
* functions need to be able to distinguish between waiting
* on a process and waiting on a thread. It is a thread if
* p_sigparent is not SIGCHLD, and the WLINUXCLONE option
* signifies we want to wait for threads and not processes.
*/
if ((p->p_sigparent != SIGCHLD) ^
((options & WLINUXCLONE) != 0)) {
PROC_UNLOCK(p);
continue;
}
nfound++;
PROC_SLOCK(p);
if (p->p_state == PRS_ZOMBIE) {
proc_reap(td, p, status, options, rusage);
return (0);
}
if ((p->p_flag & P_STOPPED_SIG) &&
(p->p_suspcount == p->p_numthreads) &&
(p->p_flag & P_WAITED) == 0 &&
(p->p_flag & P_TRACED || options & WUNTRACED)) {
PROC_SUNLOCK(p);
p->p_flag |= P_WAITED;
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
if (status)
*status = W_STOPCODE(p->p_xstat);
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
PROC_UNLOCK(p);
return (0);
}
PROC_SUNLOCK(p);
if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = p->p_pid;
p->p_flag &= ~P_CONTINUED;
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
PROC_UNLOCK(p);
if (status)
*status = SIGCONT;
return (0);
}
PROC_UNLOCK(p);
}
if (nfound == 0) {
sx_xunlock(&proctree_lock);
if (td->td_proc->p_dbg_child)
return (0);
else
return (ECHILD);
}
if (options & WNOHANG) {
sx_xunlock(&proctree_lock);
td->td_retval[0] = 0;
return (0);
}
PROC_LOCK(q);
sx_xunlock(&proctree_lock);
if (q->p_flag & P_STATCHILD) {
q->p_flag &= ~P_STATCHILD;
error = 0;
} else
error = msleep(q, &q->p_mtx, PWAIT | PCATCH, "wait", 0);
PROC_UNLOCK(q);
if (error)
return (error);
goto loop;
}
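/*
 * Example: a userland sketch of the option handling above.  WUNTRACED
 * reports a stopped child, WCONTINUED reports one resumed by SIGCONT,
 * and WNOHANG (not shown) would return 0 instead of sleeping in the
 * msleep() above.  Standard interfaces only; nothing here is specific
 * to this change.
 */
#include <sys/types.h>
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        pid_t pid;
        int status;

        if ((pid = fork()) == 0) {
                for (;;)
                        pause();                /* child: wait for signals */
        }
        kill(pid, SIGSTOP);
        if (waitpid(pid, &status, WUNTRACED) == pid && WIFSTOPPED(status))
                printf("stopped by signal %d\n", WSTOPSIG(status));
        kill(pid, SIGCONT);
        if (waitpid(pid, &status, WCONTINUED) == pid && WIFCONTINUED(status))
                printf("continued\n");
        kill(pid, SIGKILL);
        waitpid(pid, &status, 0);
        return (0);
}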
/*
* Make process 'parent' the new parent of process 'child'.
* Must be called with an exclusive hold of proctree lock.
*/
void
proc_reparent(struct proc *child, struct proc *parent)
{
#ifdef RACCT
int locked;
#endif
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(child, MA_OWNED);
if (child->p_pptr == parent)
return;
#ifdef RACCT
locked = PROC_LOCKED(parent);
if (!locked)
PROC_LOCK(parent);
racct_add_force(parent, RACCT_NPROC, 1);
if (!locked)
PROC_UNLOCK(parent);
#endif
PROC_LOCK(child->p_pptr);
racct_sub(child->p_pptr, RACCT_NPROC, 1);
sigqueue_take(child->p_ksi);
PROC_UNLOCK(child->p_pptr);
LIST_REMOVE(child, p_sibling);
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
Index: head/sys/kern/kern_fork.c
===================================================================
--- head/sys/kern/kern_fork.c (revision 225616)
+++ head/sys/kern/kern_fork.c (revision 225617)
@@ -1,1087 +1,1087 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/sysctl.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/unistd.h>
#include <sys/sdt.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/signalvar.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
dtrace_fork_func_t dtrace_fasttrap_fork;
#endif
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , create, create);
SDT_PROBE_ARGTYPE(proc, kernel, , create, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , create, 2, "int");
#ifndef _SYS_SYSPROTO_H_
struct fork_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-fork(struct thread *td, struct fork_args *uap)
+sys_fork(struct thread *td, struct fork_args *uap)
{
int error;
struct proc *p2;
error = fork1(td, RFFDG | RFPROC, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
}
return (error);
}
/* ARGSUSED */
int
-pdfork(td, uap)
+sys_pdfork(td, uap)
struct thread *td;
struct pdfork_args *uap;
{
#ifdef PROCDESC
int error, fd;
struct proc *p2;
/*
* It is necessary to return fd by reference because 0 is a valid file
* descriptor number, and the child needs to be able to distinguish
* itself from the parent using the return value.
*/
error = fork1(td, RFFDG | RFPROC | RFPROCDESC, 0, &p2,
&fd, uap->flags);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
error = copyout(&fd, uap->fdp, sizeof(fd));
}
return (error);
#else
return (ENOSYS);
#endif
}
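/*
 * Example: a userland sketch of pdfork(2), assuming the FreeBSD prototype
 * pid_t pdfork(int *fdp, int flags) from <sys/procdesc.h> and a kernel
 * built with "options PROCDESC".  The parent receives a process
 * descriptor rather than relying on the pid; poll(2) reports POLLHUP
 * when the child exits, and closing the descriptor drops the last
 * reference.
 */
#include <sys/types.h>
#include <sys/procdesc.h>
#include <poll.h>
#include <unistd.h>

int
main(void)
{
        struct pollfd pfd;
        pid_t pid;
        int fd;

        pid = pdfork(&fd, 0);
        if (pid == 0)
                _exit(0);                       /* child: pid is 0 here */
        pfd.fd = fd;                            /* parent: fd names the child */
        pfd.events = POLLHUP;
        poll(&pfd, 1, -1);
        close(fd);
        return (0);
}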
/* ARGSUSED */
int
-vfork(struct thread *td, struct vfork_args *uap)
+sys_vfork(struct thread *td, struct vfork_args *uap)
{
int error, flags;
struct proc *p2;
#ifdef XEN
flags = RFFDG | RFPROC; /* validate that this is still an issue */
#else
flags = RFFDG | RFPROC | RFPPWAIT | RFMEM;
#endif
error = fork1(td, flags, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2->p_pid;
td->td_retval[1] = 0;
}
return (error);
}
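/*
 * Example: a userland sketch of the vfork(2) semantics set up above
 * (RFMEM shares the address space, RFPPWAIT suspends the parent until
 * the child execs or exits), using only standard interfaces.
 */
#include <sys/wait.h>
#include <unistd.h>

int
main(void)
{
        pid_t pid;
        int status;

        pid = vfork();
        if (pid == 0) {
                /* Only execve() or _exit() are safe in the vfork child. */
                execl("/bin/echo", "echo", "hello from vfork", (char *)NULL);
                _exit(127);
        }
        waitpid(pid, &status, 0);
        return (0);
}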
int
-rfork(struct thread *td, struct rfork_args *uap)
+sys_rfork(struct thread *td, struct rfork_args *uap)
{
struct proc *p2;
int error;
/* Don't allow kernel-only flags. */
if ((uap->flags & RFKERNELONLY) != 0)
return (EINVAL);
AUDIT_ARG_FFLAGS(uap->flags);
error = fork1(td, uap->flags, 0, &p2, NULL, 0);
if (error == 0) {
td->td_retval[0] = p2 ? p2->p_pid : 0;
td->td_retval[1] = 0;
}
return (error);
}
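/*
 * Example: a userland sketch of rfork(2) using a flag combination that
 * behaves like plain fork(): RFPROC creates a new process and RFFDG
 * copies (rather than shares) the file descriptor table.  Standard
 * <unistd.h> interfaces only.
 */
#include <sys/wait.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
        pid_t pid;

        pid = rfork(RFPROC | RFFDG);
        if (pid == 0) {
                printf("child %d\n", (int)getpid());
                _exit(0);
        }
        wait(NULL);
        return (0);
}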
int nprocs = 1; /* process 0 */
int lastpid = 0;
SYSCTL_INT(_kern, OID_AUTO, lastpid, CTLFLAG_RD, &lastpid, 0,
"Last used PID");
/*
* Random component to lastpid generation. We mix in a random factor to make
* it a little harder to predict. We sanity check the modulus value to avoid
* doing it in critical paths. Don't let it be too small or we pointlessly
* waste entropy, and don't let it be impossibly large. Using a
* modulus that is too big causes a LOT more process table scans and slows
* down fork processing as the pidchecked caching is defeated.
*/
static int randompid = 0;
static int
sysctl_kern_randompid(SYSCTL_HANDLER_ARGS)
{
int error, pid;
error = sysctl_wire_old_buffer(req, sizeof(int));
if (error != 0)
return(error);
sx_xlock(&allproc_lock);
pid = randompid;
error = sysctl_handle_int(oidp, &pid, 0, req);
if (error == 0 && req->newptr != NULL) {
if (pid < 0 || pid > PID_MAX - 100) /* out of range */
pid = PID_MAX - 100;
else if (pid < 2) /* NOP */
pid = 0;
else if (pid < 100) /* Make it reasonable */
pid = 100;
randompid = pid;
}
sx_xunlock(&allproc_lock);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, randompid, CTLTYPE_INT|CTLFLAG_RW,
0, 0, sysctl_kern_randompid, "I", "Random PID modulus");
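/*
 * Example: a sketch of enabling the PID randomization handled above,
 * equivalent to "sysctl kern.randompid=1000" and requiring root.  The
 * handler clamps out-of-range values as shown in sysctl_kern_randompid().
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
        int modulus = 1000;     /* lastpid += arc4random() % 1000 per fork */

        if (sysctlbyname("kern.randompid", NULL, NULL,
            &modulus, sizeof(modulus)) == -1) {
                perror("sysctl kern.randompid");
                return (1);
        }
        return (0);
}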
static int
fork_findpid(int flags)
{
struct proc *p;
int trypid;
static int pidchecked = 0;
/*
* Requires allproc_lock in order to iterate over the list
* of processes, and proctree_lock to access p_pgrp.
*/
sx_assert(&allproc_lock, SX_LOCKED);
sx_assert(&proctree_lock, SX_LOCKED);
/*
* Find an unused process ID. We remember a range of unused IDs
* ready to use (from lastpid+1 through pidchecked-1).
*
* If RFHIGHPID is set (used during system boot), do not allocate
* low-numbered pids.
*/
trypid = lastpid + 1;
if (flags & RFHIGHPID) {
if (trypid < 10)
trypid = 10;
} else {
if (randompid)
trypid += arc4random() % randompid;
}
retry:
/*
* If the process ID prototype has wrapped around,
* restart somewhat above 0, as the low-numbered procs
* tend to include daemons that don't exit.
*/
if (trypid >= PID_MAX) {
trypid = trypid % PID_MAX;
if (trypid < 100)
trypid += 100;
pidchecked = 0;
}
if (trypid >= pidchecked) {
int doingzomb = 0;
pidchecked = PID_MAX;
/*
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
* than trypid, so we can avoid checking for a while.
*/
p = LIST_FIRST(&allproc);
again:
for (; p != NULL; p = LIST_NEXT(p, p_list)) {
while (p->p_pid == trypid ||
(p->p_pgrp != NULL &&
(p->p_pgrp->pg_id == trypid ||
(p->p_session != NULL &&
p->p_session->s_sid == trypid)))) {
trypid++;
if (trypid >= pidchecked)
goto retry;
}
if (p->p_pid > trypid && pidchecked > p->p_pid)
pidchecked = p->p_pid;
if (p->p_pgrp != NULL) {
if (p->p_pgrp->pg_id > trypid &&
pidchecked > p->p_pgrp->pg_id)
pidchecked = p->p_pgrp->pg_id;
if (p->p_session != NULL &&
p->p_session->s_sid > trypid &&
pidchecked > p->p_session->s_sid)
pidchecked = p->p_session->s_sid;
}
}
if (!doingzomb) {
doingzomb = 1;
p = LIST_FIRST(&zombproc);
goto again;
}
}
/*
* RFHIGHPID does not mess with the lastpid counter during boot.
*/
if (flags & RFHIGHPID)
pidchecked = 0;
else
lastpid = trypid;
return (trypid);
}
static int
fork_norfproc(struct thread *td, int flags)
{
int error;
struct proc *p1;
KASSERT((flags & RFPROC) == 0,
("fork_norfproc called with RFPROC set"));
p1 = td->td_proc;
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
if (thread_single(SINGLE_BOUNDARY)) {
PROC_UNLOCK(p1);
return (ERESTART);
}
PROC_UNLOCK(p1);
}
error = vm_forkproc(td, NULL, NULL, NULL, flags);
if (error)
goto fail;
/*
* Close all file descriptors.
*/
if (flags & RFCFDG) {
struct filedesc *fdtmp;
fdtmp = fdinit(td->td_proc->p_fd);
fdfree(td);
p1->p_fd = fdtmp;
}
/*
* Unshare file descriptors (from parent).
*/
if (flags & RFFDG)
fdunshare(p1, td);
fail:
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
thread_single_end();
PROC_UNLOCK(p1);
}
return (error);
}
static void
do_fork(struct thread *td, int flags, struct proc *p2, struct thread *td2,
struct vmspace *vm2, int pdflags)
{
struct proc *p1, *pptr;
int p2_held, trypid;
struct filedesc *fd;
struct filedesc_to_leader *fdtol;
struct sigacts *newsigacts;
sx_assert(&proctree_lock, SX_SLOCKED);
sx_assert(&allproc_lock, SX_XLOCKED);
p2_held = 0;
p1 = td->td_proc;
/*
* Increment the nprocs resource before blocking can occur. There
* are hard limits on the number of processes that can run.
*/
nprocs++;
trypid = fork_findpid(flags);
sx_sunlock(&proctree_lock);
p2->p_state = PRS_NEW; /* protect against others */
p2->p_pid = trypid;
AUDIT_ARG_PID(p2->p_pid);
LIST_INSERT_HEAD(&allproc, p2, p_list);
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
tidhash_add(td2);
PROC_LOCK(p2);
PROC_LOCK(p1);
sx_xunlock(&allproc_lock);
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
pargs_hold(p2->p_args);
PROC_UNLOCK(p1);
bzero(&p2->p_startzero,
__rangeof(struct proc, p_startzero, p_endzero));
p2->p_ucred = crhold(td->td_ucred);
/* Tell the prison that we exist. */
prison_proc_hold(p2->p_ucred->cr_prison);
PROC_UNLOCK(p2);
/*
* Malloc things while we don't hold any locks.
*/
if (flags & RFSIGSHARE)
newsigacts = NULL;
else
newsigacts = sigacts_alloc();
/*
* Copy filedesc.
*/
if (flags & RFCFDG) {
fd = fdinit(p1->p_fd);
fdtol = NULL;
} else if (flags & RFFDG) {
fd = fdcopy(p1->p_fd);
fdtol = NULL;
} else {
fd = fdshare(p1->p_fd);
if (p1->p_fdtol == NULL)
p1->p_fdtol = filedesc_to_leader_alloc(NULL, NULL,
p1->p_leader);
if ((flags & RFTHREAD) != 0) {
/*
* Shared file descriptor table, and shared
* process leaders.
*/
fdtol = p1->p_fdtol;
FILEDESC_XLOCK(p1->p_fd);
fdtol->fdl_refcount++;
FILEDESC_XUNLOCK(p1->p_fd);
} else {
/*
* Shared file descriptor table, and different
* process leaders.
*/
fdtol = filedesc_to_leader_alloc(p1->p_fdtol,
p1->p_fd, p2);
}
}
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
PROC_LOCK(p2);
PROC_LOCK(p1);
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
td2->td_sigstk = td->td_sigstk;
td2->td_sigmask = td->td_sigmask;
td2->td_flags = TDF_INMEM;
td2->td_lend_user_pri = PRI_MAX;
#ifdef VIMAGE
td2->td_vnet = NULL;
td2->td_vnet_lpush = NULL;
#endif
/*
* Allow the scheduler to initialize the child.
*/
thread_lock(td);
sched_fork(td, td2);
thread_unlock(td);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
*/
p2->p_flag = P_INMEM;
p2->p_swtick = ticks;
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
td2->td_ucred = crhold(p2->p_ucred);
if (flags & RFSIGSHARE) {
p2->p_sigacts = sigacts_hold(p1->p_sigacts);
} else {
sigacts_copy(newsigacts, p1->p_sigacts);
p2->p_sigacts = newsigacts;
}
if (flags & RFTSIGZMB)
p2->p_sigparent = RFTSIGNUM(flags);
else if (flags & RFLINUXTHPN)
p2->p_sigparent = SIGUSR1;
else
p2->p_sigparent = SIGCHLD;
p2->p_textvp = p1->p_textvp;
p2->p_fd = fd;
p2->p_fdtol = fdtol;
/*
* p_limit is copy-on-write. Bump its refcount.
*/
lim_fork(p1, p2);
pstats_fork(p1->p_stats, p2->p_stats);
PROC_UNLOCK(p1);
PROC_UNLOCK(p2);
/* Bump references to the text vnode (for procfs). */
if (p2->p_textvp)
vref(p2->p_textvp);
/*
* Set up linkage for kernel based threading.
*/
if ((flags & RFTHREAD) != 0) {
mtx_lock(&ppeers_lock);
p2->p_peers = p1->p_peers;
p1->p_peers = p2;
p2->p_leader = p1->p_leader;
mtx_unlock(&ppeers_lock);
PROC_LOCK(p1->p_leader);
if ((p1->p_leader->p_flag & P_WEXIT) != 0) {
PROC_UNLOCK(p1->p_leader);
/*
* The task leader is exiting, so process p1 is
* going to be killed shortly. Since p1 obviously
* isn't dead yet, we know that the leader is either
* sending SIGKILL's to all the processes in this
* task or is sleeping waiting for all the peers to
* exit. We let p1 complete the fork, but we need
* to go ahead and kill the new process p2 since
* the task leader may not get a chance to send
* SIGKILL to it. We leave it on the list so that
* the task leader will wait for this new process
* to commit suicide.
*/
PROC_LOCK(p2);
- psignal(p2, SIGKILL);
+ kern_psignal(p2, SIGKILL);
PROC_UNLOCK(p2);
} else
PROC_UNLOCK(p1->p_leader);
} else {
p2->p_peers = NULL;
p2->p_leader = p2;
}
sx_xlock(&proctree_lock);
PGRP_LOCK(p1->p_pgrp);
PROC_LOCK(p2);
PROC_LOCK(p1);
/*
* Preserve some more flags in subprocess. P_PROFIL has already
* been preserved.
*/
p2->p_flag |= p1->p_flag & P_SUGID;
td2->td_pflags |= td->td_pflags & TDP_ALTSTACK;
SESS_LOCK(p1->p_session);
if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
p2->p_flag |= P_CONTROLT;
SESS_UNLOCK(p1->p_session);
if (flags & RFPPWAIT)
p2->p_flag |= P_PPWAIT;
p2->p_pgrp = p1->p_pgrp;
LIST_INSERT_AFTER(p1, p2, p_pglist);
PGRP_UNLOCK(p1->p_pgrp);
LIST_INIT(&p2->p_children);
callout_init(&p2->p_itcallout, CALLOUT_MPSAFE);
/*
* If PF_FORK is set, the child process inherits the
* procfs ioctl flags from its parent.
*/
if (p1->p_pfsflags & PF_FORK) {
p2->p_stops = p1->p_stops;
p2->p_pfsflags = p1->p_pfsflags;
}
/*
* This begins the section where we must prevent the parent
* from being swapped.
*/
_PHOLD(p1);
PROC_UNLOCK(p1);
/*
* Attach the new process to its parent.
*
* If RFNOWAIT is set, the newly created process becomes a child
* of init. This effectively disassociates the child from the
* parent.
*/
if (flags & RFNOWAIT)
pptr = initproc;
else
pptr = p1;
p2->p_pptr = pptr;
LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
sx_xunlock(&proctree_lock);
/* Inform accounting that we have forked. */
p2->p_acflag = AFORK;
PROC_UNLOCK(p2);
#ifdef KTRACE
ktrprocfork(p1, p2);
#endif
/*
* Finish creating the child process. It will return via a different
* execution path later (i.e., directly into user mode).
*/
vm_forkproc(td, p2, td2, vm2, flags);
if (flags == (RFFDG | RFPROC)) {
PCPU_INC(cnt.v_forks);
PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
PCPU_INC(cnt.v_vforks);
PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else if (p1 == &proc0) {
PCPU_INC(cnt.v_kthreads);
PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
} else {
PCPU_INC(cnt.v_rforks);
PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
p2->p_vmspace->vm_ssize);
}
#ifdef PROCDESC
/*
* Associate the process descriptor with the process before anything
* can happen that might cause that process to need the descriptor.
* However, don't do this until after fork(2) can no longer fail.
*/
if (flags & RFPROCDESC)
procdesc_new(p2, pdflags);
#endif
/*
* Both processes are set up, now check if any loadable modules want
* to adjust anything.
*/
EVENTHANDLER_INVOKE(process_fork, p1, p2, flags);
/*
* Set the child start time and mark the process as being complete.
*/
PROC_LOCK(p2);
PROC_LOCK(p1);
microuptime(&p2->p_stats->p_start);
PROC_SLOCK(p2);
p2->p_state = PRS_NORMAL;
PROC_SUNLOCK(p2);
#ifdef KDTRACE_HOOKS
/*
* Tell the DTrace fasttrap provider about the new process
* if it has registered an interest. We have to do this only after
* p_state is PRS_NORMAL since the fasttrap module will use pfind()
* later on.
*/
if (dtrace_fasttrap_fork)
dtrace_fasttrap_fork(p1, p2);
#endif
if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
P_FOLLOWFORK)) {
/*
* Arrange for debugger to receive the fork event.
*
* We can report PL_FLAG_FORKED regardless of
* P_FOLLOWFORK settings, but it does not make sense
* for a runaway child.
*/
td->td_dbgflags |= TDB_FORK;
td->td_dbg_forked = p2->p_pid;
td2->td_dbgflags |= TDB_STOPATFORK;
_PHOLD(p2);
p2_held = 1;
}
PROC_UNLOCK(p2);
if ((flags & RFSTOPPED) == 0) {
/*
* If RFSTOPPED not requested, make child runnable and
* add to run queue.
*/
thread_lock(td2);
TD_SET_CAN_RUN(td2);
sched_add(td2, SRQ_BORING);
thread_unlock(td2);
}
/*
* Now can be swapped.
*/
_PRELE(p1);
PROC_UNLOCK(p1);
/*
* Tell any interested parties about the new process.
*/
knote_fork(&p1->p_klist, p2->p_pid);
SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
/*
* Wait until debugger is attached to child.
*/
PROC_LOCK(p2);
while ((td2->td_dbgflags & TDB_STOPATFORK) != 0)
cv_wait(&p2->p_dbgwait, &p2->p_mtx);
if (p2_held)
_PRELE(p2);
/*
* Preserve synchronization semantics of vfork. If waiting for
* child to exec or exit, set P_PPWAIT on child, and sleep on our
* proc (in case of exit).
*/
while (p2->p_flag & P_PPWAIT)
cv_wait(&p2->p_pwait, &p2->p_mtx);
PROC_UNLOCK(p2);
}
int
fork1(struct thread *td, int flags, int pages, struct proc **procp,
int *procdescp, int pdflags)
{
struct proc *p1;
struct proc *newproc;
int ok;
struct thread *td2;
struct vmspace *vm2;
vm_ooffset_t mem_charged;
int error;
static int curfail;
static struct timeval lastfail;
#ifdef PROCDESC
struct file *fp_procdesc = NULL;
#endif
/* Check for the undefined or unimplemented flags. */
if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
return (EINVAL);
/* Signal value requires RFTSIGZMB. */
if ((flags & RFTSIGFLAGS(RFTSIGMASK)) != 0 && (flags & RFTSIGZMB) == 0)
return (EINVAL);
/* Can't copy and clear. */
if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
return (EINVAL);
/* Check the validity of the signal number. */
if ((flags & RFTSIGZMB) != 0 && (u_int)RFTSIGNUM(flags) > _SIG_MAXSIG)
return (EINVAL);
#ifdef PROCDESC
if ((flags & RFPROCDESC) != 0) {
/* Can't get a process descriptor without also creating a process. */
if ((flags & RFPROC) == 0)
return (EINVAL);
/* Must provide a place to put a procdesc if creating one. */
if (procdescp == NULL)
return (EINVAL);
}
#endif
p1 = td->td_proc;
/*
* Here we don't create a new process, but we divorce
* certain parts of a process from itself.
*/
if ((flags & RFPROC) == 0) {
*procp = NULL;
return (fork_norfproc(td, flags));
}
#ifdef RACCT
PROC_LOCK(p1);
error = racct_add(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
if (error != 0)
return (EAGAIN);
#endif
#ifdef PROCDESC
/*
* If required, create a process descriptor in the parent first; we
* will abandon it if something goes wrong. We don't finit() until
* later.
*/
if (flags & RFPROCDESC) {
error = falloc(td, &fp_procdesc, procdescp, 0);
if (error != 0) {
#ifdef RACCT
PROC_LOCK(p1);
racct_sub(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
#endif
return (error);
}
}
#endif
mem_charged = 0;
vm2 = NULL;
if (pages == 0)
pages = KSTACK_PAGES;
/* Allocate new proc. */
newproc = uma_zalloc(proc_zone, M_WAITOK);
td2 = FIRST_THREAD_IN_PROC(newproc);
if (td2 == NULL) {
td2 = thread_alloc(pages);
if (td2 == NULL) {
error = ENOMEM;
goto fail1;
}
proc_linkup(newproc, td2);
} else {
if (td2->td_kstack == 0 || td2->td_kstack_pages != pages) {
if (td2->td_kstack != 0)
vm_thread_dispose(td2);
if (!thread_alloc_stack(td2, pages)) {
error = ENOMEM;
goto fail1;
}
}
}
if ((flags & RFMEM) == 0) {
vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
if (vm2 == NULL) {
error = ENOMEM;
goto fail1;
}
if (!swap_reserve(mem_charged)) {
/*
* The swap reservation failed. The accounting
* from the entries of the copied vm2 will be
* subtracted in vmspace_free(), so force the
* reservation there.
*/
swap_reserve_force(mem_charged);
error = ENOMEM;
goto fail1;
}
} else
vm2 = NULL;
#ifdef MAC
mac_proc_init(newproc);
#endif
knlist_init_mtx(&newproc->p_klist, &newproc->p_mtx);
STAILQ_INIT(&newproc->p_ktr);
/*
* XXX: This is ugly; when we copy resource usage, we need to bump
* per-cred resource counters.
*/
newproc->p_ucred = p1->p_ucred;
/*
* Initialize resource accounting for the child process.
*/
error = racct_proc_fork(p1, newproc);
if (error != 0) {
error = EAGAIN;
goto fail1;
}
/* We have to lock the process tree while we look for a pid. */
sx_slock(&proctree_lock);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create. Don't allow
* a nonprivileged user to use the last ten processes; don't let root
* exceed the limit. The variable nprocs is the current number of
* processes, maxproc is the limit.
*/
sx_xlock(&allproc_lock);
if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
error = EAGAIN;
goto fail;
}
#ifdef RACCT
/*
* After fork, there is exactly one thread running.
*/
PROC_LOCK(newproc);
error = racct_set(newproc, RACCT_NTHR, 1);
PROC_UNLOCK(newproc);
if (error != 0) {
error = EAGAIN;
goto fail;
}
#endif
/*
* Increment the count of procs running with this uid. Don't allow
* a nonprivileged user to exceed their current limit.
*
* XXXRW: Can we avoid privilege here if it's not needed?
*/
error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
if (error == 0)
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
else {
PROC_LOCK(p1);
ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
lim_cur(p1, RLIMIT_NPROC));
PROC_UNLOCK(p1);
}
if (ok) {
do_fork(td, flags, newproc, td2, vm2, pdflags);
/*
* Return child proc pointer to parent.
*/
*procp = newproc;
#ifdef PROCDESC
if (flags & RFPROCDESC)
procdesc_finit(newproc->p_procdesc, fp_procdesc);
#endif
return (0);
}
error = EAGAIN;
fail:
racct_proc_exit(newproc);
sx_sunlock(&proctree_lock);
if (ppsratecheck(&lastfail, &curfail, 1))
printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
td->td_ucred->cr_ruid);
sx_xunlock(&allproc_lock);
#ifdef MAC
mac_proc_destroy(newproc);
#endif
fail1:
if (vm2 != NULL)
vmspace_free(vm2);
uma_zfree(proc_zone, newproc);
#ifdef PROCDESC
if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
fdrop(fp_procdesc, td);
#endif
pause("fork", hz / 2);
#ifdef RACCT
PROC_LOCK(p1);
racct_sub(p1, RACCT_NPROC, 1);
PROC_UNLOCK(p1);
#endif
return (error);
}
/*
* Handle the return of a child process from fork1(). This function
* is called from the MD fork_trampoline() entry point.
*/
void
fork_exit(void (*callout)(void *, struct trapframe *), void *arg,
struct trapframe *frame)
{
struct proc *p;
struct thread *td;
struct thread *dtd;
td = curthread;
p = td->td_proc;
KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
CTR4(KTR_PROC, "fork_exit: new thread %p (td_sched %p, pid %d, %s)",
td, td->td_sched, p->p_pid, td->td_name);
sched_fork_exit(td);
/*
* Processes normally resume in mi_switch() after being
* cpu_switch()'ed to, but when children start up they arrive here
* instead, so we must do much the same things as mi_switch() would.
*/
if ((dtd = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
thread_stash(dtd);
}
thread_unlock(td);
/*
* cpu_set_fork_handler intercepts this function call so that it instead
* calls a non-returning function, keeping the thread in kernel mode.
* initproc has its own fork handler, but that handler does return.
*/
KASSERT(callout != NULL, ("NULL callout in fork_exit"));
callout(arg, frame);
/*
* Check if a kernel thread misbehaved and returned from its main
* function.
*/
if (p->p_flag & P_KTHREAD) {
printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
td->td_name, p->p_pid);
kproc_exit(0);
}
mtx_assert(&Giant, MA_NOTOWNED);
if (p->p_sysent->sv_schedtail != NULL)
(p->p_sysent->sv_schedtail)(td);
}
/*
* Simplified back end of syscall(), used when returning from fork()
* directly into user mode. Giant is not held on entry, and must not
* be held on return. This function is passed in to fork_exit() as the
* first parameter and is called when returning to a new userland process.
*/
void
fork_return(struct thread *td, struct trapframe *frame)
{
struct proc *p, *dbg;
if (td->td_dbgflags & TDB_STOPATFORK) {
p = td->td_proc;
sx_xlock(&proctree_lock);
PROC_LOCK(p);
if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
(P_TRACED | P_FOLLOWFORK)) {
/*
* If debugger still wants auto-attach for the
* parent's children, do it now.
*/
dbg = p->p_pptr->p_pptr;
p->p_flag |= P_TRACED;
p->p_oppid = p->p_pptr->p_pid;
proc_reparent(p, dbg);
sx_xunlock(&proctree_lock);
ptracestop(td, SIGSTOP);
} else {
/*
* ... otherwise clear the request.
*/
sx_xunlock(&proctree_lock);
td->td_dbgflags &= ~TDB_STOPATFORK;
cv_broadcast(&p->p_dbgwait);
}
PROC_UNLOCK(p);
}
userret(td, frame);
#ifdef KTRACE
if (KTRPOINT(td, KTR_SYSRET))
ktrsysret(SYS_fork, 0, 0);
#endif
mtx_assert(&Giant, MA_NOTOWNED);
}
Index: head/sys/kern/kern_jail.c
===================================================================
--- head/sys/kern/kern_jail.c (revision 225616)
+++ head/sys/kern/kern_jail.c (revision 225617)
@@ -1,4480 +1,4480 @@
/*-
* Copyright (c) 1999 Poul-Henning Kamp.
* Copyright (c) 2008 Bjoern A. Zeeb.
* Copyright (c) 2009 James Gritton.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/osd.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/taskqueue.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <net/if.h>
#include <net/vnet.h>
#include <netinet/in.h>
#ifdef DDB
#include <ddb/ddb.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#endif /* INET6 */
#endif /* DDB */
#include <security/mac/mac_framework.h>
#define DEFAULT_HOSTUUID "00000000-0000-0000-0000-000000000000"
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
/* Keep struct prison prison0 and some code in kern_jail_set() readable. */
#ifdef INET
#ifdef INET6
#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL|PR_IP6_SADDRSEL
#else
#define _PR_IP_SADDRSEL PR_IP4_SADDRSEL
#endif
#else /* !INET */
#ifdef INET6
#define _PR_IP_SADDRSEL PR_IP6_SADDRSEL
#else
#define _PR_IP_SADDRSEL 0
#endif
#endif
/* prison0 describes what is "real" about the system. */
struct prison prison0 = {
.pr_id = 0,
.pr_name = "0",
.pr_ref = 1,
.pr_uref = 1,
.pr_path = "/",
.pr_securelevel = -1,
.pr_childmax = JAIL_MAX,
.pr_hostuuid = DEFAULT_HOSTUUID,
.pr_children = LIST_HEAD_INITIALIZER(prison0.pr_children),
#ifdef VIMAGE
.pr_flags = PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
#else
.pr_flags = PR_HOST|_PR_IP_SADDRSEL,
#endif
.pr_allow = PR_ALLOW_ALL,
};
MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
/* allprison, allprison_racct and lastprid are protected by allprison_lock. */
struct sx allprison_lock;
SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
struct prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
LIST_HEAD(, prison_racct) allprison_racct;
int lastprid = 0;
static int do_jail_attach(struct thread *td, struct prison *pr);
static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static char *prison_path(struct prison *pr1, struct prison *pr2);
static void prison_remove_one(struct prison *pr);
#ifdef RACCT
static void prison_racct_attach(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
#ifdef INET
static int _prison_check_ip4(struct prison *pr, struct in_addr *ia);
static int prison_restrict_ip4(struct prison *pr, struct in_addr *newip4);
#endif
#ifdef INET6
static int _prison_check_ip6(struct prison *pr, struct in6_addr *ia6);
static int prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6);
#endif
/* Flags for prison_deref */
#define PD_DEREF 0x01
#define PD_DEUREF 0x02
#define PD_LOCKED 0x04
#define PD_LIST_SLOCKED 0x08
#define PD_LIST_XLOCKED 0x10
/*
* Parameter names corresponding to PR_* flag values. Size values are for kvm
* as we cannot figure out the size of a sparse array, or an array without a
* terminating entry.
*/
static char *pr_flag_names[] = {
[0] = "persist",
#ifdef INET
[7] = "ip4.saddrsel",
#endif
#ifdef INET6
[8] = "ip6.saddrsel",
#endif
};
const size_t pr_flag_names_size = sizeof(pr_flag_names);
static char *pr_flag_nonames[] = {
[0] = "nopersist",
#ifdef INET
[7] = "ip4.nosaddrsel",
#endif
#ifdef INET6
[8] = "ip6.nosaddrsel",
#endif
};
const size_t pr_flag_nonames_size = sizeof(pr_flag_nonames);
struct jailsys_flags {
const char *name;
unsigned disable;
unsigned new;
} pr_flag_jailsys[] = {
{ "host", 0, PR_HOST },
#ifdef VIMAGE
{ "vnet", 0, PR_VNET },
#endif
#ifdef INET
{ "ip4", PR_IP4_USER | PR_IP4_DISABLE, PR_IP4_USER },
#endif
#ifdef INET6
{ "ip6", PR_IP6_USER | PR_IP6_DISABLE, PR_IP6_USER },
#endif
};
const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
static char *pr_allow_names[] = {
"allow.set_hostname",
"allow.sysvipc",
"allow.raw_sockets",
"allow.chflags",
"allow.mount",
"allow.quotas",
"allow.socket_af",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);
static char *pr_allow_nonames[] = {
"allow.noset_hostname",
"allow.nosysvipc",
"allow.noraw_sockets",
"allow.nochflags",
"allow.nomount",
"allow.noquotas",
"allow.nosocket_af",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
#define JAIL_DEFAULT_ALLOW PR_ALLOW_SET_HOSTNAME
#define JAIL_DEFAULT_ENFORCE_STATFS 2
static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
#if defined(INET) || defined(INET6)
static unsigned jail_max_af_ips = 255;
#endif
#ifdef INET
static int
qcmp_v4(const void *ip1, const void *ip2)
{
in_addr_t iaa, iab;
/*
* We need to compare in host byte order (HBO) here so the list sorts
* the way the rest of the code expects. Sorting network byte order (NBO)
* addresses gives surprising results. If you do not understand, do not try.
*/
iaa = ntohl(((const struct in_addr *)ip1)->s_addr);
iab = ntohl(((const struct in_addr *)ip2)->s_addr);
/*
* Do not simply return the difference of the two numbers, the int is
* not wide enough.
*/
if (iaa > iab)
return (1);
else if (iaa < iab)
return (-1);
else
return (0);
}
#endif
#ifdef INET6
static int
qcmp_v6(const void *ip1, const void *ip2)
{
const struct in6_addr *ia6a, *ia6b;
int i, rc;
ia6a = (const struct in6_addr *)ip1;
ia6b = (const struct in6_addr *)ip2;
rc = 0;
for (i = 0; rc == 0 && i < sizeof(struct in6_addr); i++) {
if (ia6a->s6_addr[i] > ia6b->s6_addr[i])
rc = 1;
else if (ia6a->s6_addr[i] < ia6b->s6_addr[i])
rc = -1;
}
return (rc);
}
#endif
/*
* struct jail_args {
* struct jail *jail;
* };
*/
int
-jail(struct thread *td, struct jail_args *uap)
+sys_jail(struct thread *td, struct jail_args *uap)
{
uint32_t version;
int error;
struct jail j;
error = copyin(uap->jail, &version, sizeof(uint32_t));
if (error)
return (error);
switch (version) {
case 0:
{
struct jail_v0 j0;
/* FreeBSD single IPv4 jails. */
bzero(&j, sizeof(struct jail));
error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
if (error)
return (error);
j.version = j0.version;
j.path = j0.path;
j.hostname = j0.hostname;
j.ip4s = j0.ip_number;
break;
}
case 1:
/*
* Version 1 was used by multi-IPv4 jail implementations
* that never made it into the official kernel.
*/
return (EINVAL);
case 2: /* JAIL_API_VERSION */
/* FreeBSD multi-IPv4/IPv6,noIP jails. */
error = copyin(uap->jail, &j, sizeof(struct jail));
if (error)
return (error);
break;
default:
/* Sci-Fi jails are not supported, sorry. */
return (EINVAL);
}
return (kern_jail(td, &j));
}
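/*
 * Example: a userland sketch of the version-2 jail(2) interface handled
 * above.  It assumes the struct jail layout from <sys/jail.h>
 * (version/path/hostname/jailname/ip4s/ip6s/ip4/ip6) and must run as
 * root; on success the calling process is created in and attached to
 * the new jail, whose ID is returned.
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        struct jail j;
        struct in_addr ip4;
        int jid;

        inet_pton(AF_INET, "192.0.2.1", &ip4);
        memset(&j, 0, sizeof(j));
        j.version = JAIL_API_VERSION;   /* multi-IPv4/IPv6, no-IP jails */
        j.path = "/";
        j.hostname = "demo.example.org";
        j.jailname = "demo";
        j.ip4s = 1;
        j.ip4 = &ip4;

        if ((jid = jail(&j)) == -1) {
                perror("jail");
                return (1);
        }
        printf("attached to jail %d\n", jid);
        return (0);
}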
int
kern_jail(struct thread *td, struct jail *j)
{
struct iovec optiov[2 * (4
+ sizeof(pr_allow_names) / sizeof(pr_allow_names[0])
#ifdef INET
+ 1
#endif
#ifdef INET6
+ 1
#endif
)];
struct uio opt;
char *u_path, *u_hostname, *u_name;
#ifdef INET
uint32_t ip4s;
struct in_addr *u_ip4;
#endif
#ifdef INET6
struct in6_addr *u_ip6;
#endif
size_t tmplen;
int error, enforce_statfs, fi;
bzero(&optiov, sizeof(optiov));
opt.uio_iov = optiov;
opt.uio_iovcnt = 0;
opt.uio_offset = -1;
opt.uio_resid = -1;
opt.uio_segflg = UIO_SYSSPACE;
opt.uio_rw = UIO_READ;
opt.uio_td = td;
/* Set permissions for top-level jails from sysctls. */
if (!jailed(td->td_ucred)) {
for (fi = 0; fi < sizeof(pr_allow_names) /
sizeof(pr_allow_names[0]); fi++) {
optiov[opt.uio_iovcnt].iov_base =
(jail_default_allow & (1 << fi))
? pr_allow_names[fi] : pr_allow_nonames[fi];
optiov[opt.uio_iovcnt].iov_len =
strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
opt.uio_iovcnt += 2;
}
optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
opt.uio_iovcnt++;
enforce_statfs = jail_default_enforce_statfs;
optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
opt.uio_iovcnt++;
}
tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
#ifdef INET
ip4s = (j->version == 0) ? 1 : j->ip4s;
if (ip4s > jail_max_af_ips)
return (EINVAL);
tmplen += ip4s * sizeof(struct in_addr);
#else
if (j->ip4s > 0)
return (EINVAL);
#endif
#ifdef INET6
if (j->ip6s > jail_max_af_ips)
return (EINVAL);
tmplen += j->ip6s * sizeof(struct in6_addr);
#else
if (j->ip6s > 0)
return (EINVAL);
#endif
u_path = malloc(tmplen, M_TEMP, M_WAITOK);
u_hostname = u_path + MAXPATHLEN;
u_name = u_hostname + MAXHOSTNAMELEN;
#ifdef INET
u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
#endif
#ifdef INET6
#ifdef INET
u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
#else
u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
#endif
#endif
optiov[opt.uio_iovcnt].iov_base = "path";
optiov[opt.uio_iovcnt].iov_len = sizeof("path");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_path;
error = copyinstr(j->path, u_path, MAXPATHLEN,
&optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = "host.hostname";
optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_hostname;
error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
&optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
if (j->jailname != NULL) {
optiov[opt.uio_iovcnt].iov_base = "name";
optiov[opt.uio_iovcnt].iov_len = sizeof("name");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_name;
error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
&optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
}
#ifdef INET
optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_ip4;
optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
if (j->version == 0)
u_ip4->s_addr = j->ip4s;
else {
error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
}
opt.uio_iovcnt++;
#endif
#ifdef INET6
optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
opt.uio_iovcnt++;
optiov[opt.uio_iovcnt].iov_base = u_ip6;
optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
if (error) {
free(u_path, M_TEMP);
return (error);
}
opt.uio_iovcnt++;
#endif
KASSERT(opt.uio_iovcnt <= sizeof(optiov) / sizeof(optiov[0]),
("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
free(u_path, M_TEMP);
return (error);
}
/*
* struct jail_set_args {
* struct iovec *iovp;
* unsigned int iovcnt;
* int flags;
* };
*/
int
-jail_set(struct thread *td, struct jail_set_args *uap)
+sys_jail_set(struct thread *td, struct jail_set_args *uap)
{
struct uio *auio;
int error;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_set(td, auio, uap->flags);
free(auio, M_IOV);
return (error);
}
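/*
 * Example: a userland sketch of the name/value iovec convention that
 * kern_jail_set() parses.  Parameter names are nul-terminated strings,
 * string values include their terminating nul in iov_len, and boolean
 * parameters such as "persist" take a NULL value of zero length.
 * Requires root; JAIL_CREATE returns the new jail ID.
 */
#include <sys/param.h>
#include <sys/jail.h>
#include <sys/uio.h>
#include <stdio.h>

int
main(void)
{
        struct iovec iov[6];
        int jid;

        iov[0].iov_base = "name";       iov[0].iov_len = sizeof("name");
        iov[1].iov_base = "demo";       iov[1].iov_len = sizeof("demo");
        iov[2].iov_base = "path";       iov[2].iov_len = sizeof("path");
        iov[3].iov_base = "/";          iov[3].iov_len = sizeof("/");
        iov[4].iov_base = "persist";    iov[4].iov_len = sizeof("persist");
        iov[5].iov_base = NULL;         iov[5].iov_len = 0;

        if ((jid = jail_set(iov, 6, JAIL_CREATE)) == -1) {
                perror("jail_set");
                return (1);
        }
        printf("created jail %d\n", jid);
        return (0);
}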
int
kern_jail_set(struct thread *td, struct uio *optuio, int flags)
{
struct nameidata nd;
#ifdef INET
struct in_addr *ip4;
#endif
#ifdef INET6
struct in6_addr *ip6;
#endif
struct vfsopt *opt;
struct vfsoptlist *opts;
struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
struct vnode *root;
char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
#if defined(INET) || defined(INET6)
struct prison *tppr;
void *op;
#endif
unsigned long hid;
size_t namelen, onamelen;
int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
int gotchildmax, gotenforce, gothid, gotslevel;
int fi, jid, jsys, len, level;
int childmax, slevel, vfslocked;
#if defined(INET) || defined(INET6)
int ii, ij;
#endif
#ifdef INET
int ip4s, redo_ip4;
#endif
#ifdef INET6
int ip6s, redo_ip6;
#endif
uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
unsigned tallow;
char numbuf[12];
error = priv_check(td, PRIV_JAIL_SET);
if (!error && (flags & JAIL_ATTACH))
error = priv_check(td, PRIV_JAIL_ATTACH);
if (error)
return (error);
mypr = ppr = td->td_ucred->cr_prison;
if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
return (EPERM);
if (flags & ~JAIL_SET_MASK)
return (EINVAL);
/*
* Check all the parameters before committing to anything. Not all
* errors can be caught early, but we may as well try. Also, this
* takes care of some expensive stuff (path lookup) before getting
* the allprison lock.
*
* XXX Jails are not filesystems, and jail parameters are not mount
* options. But it makes more sense to re-use the vfsopt code
* than duplicate it under a different name.
*/
error = vfs_buildopts(optuio, &opts);
if (error)
return (error);
#ifdef INET
ip4 = NULL;
#endif
#ifdef INET6
ip6 = NULL;
#endif
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
if (error == ENOENT)
jid = 0;
else if (error != 0)
goto done_free;
error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
if (error == ENOENT)
gotslevel = 0;
else if (error != 0)
goto done_free;
else
gotslevel = 1;
error =
vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
if (error == ENOENT)
gotchildmax = 0;
else if (error != 0)
goto done_free;
else
gotchildmax = 1;
error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
if (error == ENOENT)
gotenforce = 0;
else if (error != 0)
goto done_free;
else if (enforce < 0 || enforce > 2) {
error = EINVAL;
goto done_free;
} else
gotenforce = 1;
pr_flags = ch_flags = 0;
for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
fi++) {
if (pr_flag_names[fi] == NULL)
continue;
vfs_flagopt(opts, pr_flag_names[fi], &pr_flags, 1 << fi);
vfs_flagopt(opts, pr_flag_nonames[fi], &ch_flags, 1 << fi);
}
ch_flags |= pr_flags;
for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
fi++) {
error = vfs_copyopt(opts, pr_flag_jailsys[fi].name, &jsys,
sizeof(jsys));
if (error == ENOENT)
continue;
if (error != 0)
goto done_free;
switch (jsys) {
case JAIL_SYS_DISABLE:
if (!pr_flag_jailsys[fi].disable) {
error = EINVAL;
goto done_free;
}
pr_flags |= pr_flag_jailsys[fi].disable;
break;
case JAIL_SYS_NEW:
pr_flags |= pr_flag_jailsys[fi].new;
break;
case JAIL_SYS_INHERIT:
break;
default:
error = EINVAL;
goto done_free;
}
ch_flags |=
pr_flag_jailsys[fi].new | pr_flag_jailsys[fi].disable;
}
if ((flags & (JAIL_CREATE | JAIL_UPDATE | JAIL_ATTACH)) == JAIL_CREATE
&& !(pr_flags & PR_PERSIST)) {
error = EINVAL;
vfs_opterror(opts, "new jail must persist or attach");
goto done_errmsg;
}
#ifdef VIMAGE
if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
error = EINVAL;
vfs_opterror(opts, "vnet cannot be changed after creation");
goto done_errmsg;
}
#endif
#ifdef INET
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
error = EINVAL;
vfs_opterror(opts, "ip4 cannot be changed after creation");
goto done_errmsg;
}
#endif
#ifdef INET6
if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
error = EINVAL;
vfs_opterror(opts, "ip6 cannot be changed after creation");
goto done_errmsg;
}
#endif
pr_allow = ch_allow = 0;
for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
fi++) {
vfs_flagopt(opts, pr_allow_names[fi], &pr_allow, 1 << fi);
vfs_flagopt(opts, pr_allow_nonames[fi], &ch_allow, 1 << fi);
}
ch_allow |= pr_allow;
error = vfs_getopt(opts, "name", (void **)&name, &len);
if (error == ENOENT)
name = NULL;
else if (error != 0)
goto done_free;
else {
if (len == 0 || name[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > MAXHOSTNAMELEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
if (error == ENOENT)
host = NULL;
else if (error != 0)
goto done_free;
else {
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
if (len == 0 || host[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > MAXHOSTNAMELEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
if (error == ENOENT)
domain = NULL;
else if (error != 0)
goto done_free;
else {
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
if (len == 0 || domain[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > MAXHOSTNAMELEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
if (error == ENOENT)
uuid = NULL;
else if (error != 0)
goto done_free;
else {
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
if (len == 0 || uuid[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len > HOSTUUIDLEN) {
error = ENAMETOOLONG;
goto done_free;
}
}
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
uint32_t hid32;
error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
hid = hid32;
} else
#endif
error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
if (error == ENOENT)
gothid = 0;
else if (error != 0)
goto done_free;
else {
gothid = 1;
ch_flags |= PR_HOST;
pr_flags |= PR_HOST;
}
#ifdef INET
error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
if (error == ENOENT)
ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
else if (error != 0)
goto done_free;
else if (ip4s & (sizeof(*ip4) - 1)) {
error = EINVAL;
goto done_free;
} else {
ch_flags |= PR_IP4_USER | PR_IP4_DISABLE;
if (ip4s == 0)
pr_flags |= PR_IP4_USER | PR_IP4_DISABLE;
else {
pr_flags = (pr_flags & ~PR_IP4_DISABLE) | PR_IP4_USER;
ip4s /= sizeof(*ip4);
if (ip4s > jail_max_af_ips) {
error = EINVAL;
vfs_opterror(opts, "too many IPv4 addresses");
goto done_errmsg;
}
ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
bcopy(op, ip4, ip4s * sizeof(*ip4));
/*
* All IP addresses except ip[0] are sorted, preserving
* the primary IP address as given from userland.
* This special IP is used for unbound outgoing
* connections as well as for "loopback" traffic in case
* source address selection cannot find a more fitting
* address to connect from.
*/
if (ip4s > 1)
qsort(ip4 + 1, ip4s - 1, sizeof(*ip4), qcmp_v4);
/*
* Check for duplicate addresses and do some simple
* zero and broadcast checks. If users give other bogus
* addresses it is their problem.
*
* We do not have to care about byte order for these
* checks so we will do them in NBO.
*/
for (ii = 0; ii < ip4s; ii++) {
if (ip4[ii].s_addr == INADDR_ANY ||
ip4[ii].s_addr == INADDR_BROADCAST) {
error = EINVAL;
goto done_free;
}
if ((ii+1) < ip4s &&
(ip4[0].s_addr == ip4[ii+1].s_addr ||
ip4[ii].s_addr == ip4[ii+1].s_addr)) {
error = EINVAL;
goto done_free;
}
}
}
}
#endif
#ifdef INET6
error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
if (error == ENOENT)
ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
else if (error != 0)
goto done_free;
else if (ip6s & (sizeof(*ip6) - 1)) {
error = EINVAL;
goto done_free;
} else {
ch_flags |= PR_IP6_USER | PR_IP6_DISABLE;
if (ip6s == 0)
pr_flags |= PR_IP6_USER | PR_IP6_DISABLE;
else {
pr_flags = (pr_flags & ~PR_IP6_DISABLE) | PR_IP6_USER;
ip6s /= sizeof(*ip6);
if (ip6s > jail_max_af_ips) {
error = EINVAL;
vfs_opterror(opts, "too many IPv6 addresses");
goto done_errmsg;
}
ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
bcopy(op, ip6, ip6s * sizeof(*ip6));
if (ip6s > 1)
qsort(ip6 + 1, ip6s - 1, sizeof(*ip6), qcmp_v6);
for (ii = 0; ii < ip6s; ii++) {
if (IN6_IS_ADDR_UNSPECIFIED(&ip6[ii])) {
error = EINVAL;
goto done_free;
}
if ((ii+1) < ip6s &&
(IN6_ARE_ADDR_EQUAL(&ip6[0], &ip6[ii+1]) ||
IN6_ARE_ADDR_EQUAL(&ip6[ii], &ip6[ii+1])))
{
error = EINVAL;
goto done_free;
}
}
}
}
#endif
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
error = EINVAL;
vfs_opterror(opts,
"vnet jails cannot have IP address restrictions");
goto done_errmsg;
}
#endif
root = NULL;
error = vfs_getopt(opts, "path", (void **)&path, &len);
if (error == ENOENT)
path = NULL;
else if (error != 0)
goto done_free;
else {
if (flags & JAIL_UPDATE) {
error = EINVAL;
vfs_opterror(opts,
"path cannot be changed after creation");
goto done_errmsg;
}
if (len == 0 || path[len - 1] != '\0') {
error = EINVAL;
goto done_free;
}
if (len < 2 || (len == 2 && path[0] == '/'))
path = NULL;
else {
/* Leave room for a real-root full pathname. */
if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
error = ENAMETOOLONG;
goto done_free;
}
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_SYSSPACE,
path, td);
error = namei(&nd);
if (error)
goto done_free;
vfslocked = NDHASGIANT(&nd);
root = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (root->v_type != VDIR) {
error = ENOTDIR;
vrele(root);
VFS_UNLOCK_GIANT(vfslocked);
goto done_free;
}
VFS_UNLOCK_GIANT(vfslocked);
}
}
/*
* Grab the allprison lock before letting modules check their
* parameters. Once we have it, do not let go so we'll have a
* consistent view of the OSD list.
*/
sx_xlock(&allprison_lock);
error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
if (error)
goto done_unlock_list;
/* By now, all parameters should have been noted. */
TAILQ_FOREACH(opt, opts, link) {
if (!opt->seen && strcmp(opt->name, "errmsg")) {
error = EINVAL;
vfs_opterror(opts, "unknown parameter: %s", opt->name);
goto done_unlock_list;
}
}
/*
* See if we are creating a new record or updating an existing one.
* This abuses the file error codes ENOENT and EEXIST.
*/
cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
if (!cuflags) {
error = EINVAL;
vfs_opterror(opts, "no valid operation (create or update)");
goto done_unlock_list;
}
pr = NULL;
namelc = NULL;
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
namelc = strrchr(name, '.');
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
if (*p != '\0')
jid = 0;
}
if (jid != 0) {
/*
* See if a requested jid already exists. There is an
* information leak here if the jid exists but is not within
* the caller's jail hierarchy. Jail creators will get EEXIST
* even though they cannot see the jail, and CREATE | UPDATE
* will return ENOENT which is not normally a valid error.
*/
if (jid < 0) {
error = EINVAL;
vfs_opterror(opts, "negative jid");
goto done_unlock_list;
}
pr = prison_find(jid);
if (pr != NULL) {
ppr = pr->pr_parent;
/* Create: jid must not exist. */
if (cuflags == JAIL_CREATE) {
mtx_unlock(&pr->pr_mtx);
error = EEXIST;
vfs_opterror(opts, "jail %d already exists",
jid);
goto done_unlock_list;
}
if (!prison_ischild(mypr, pr)) {
mtx_unlock(&pr->pr_mtx);
pr = NULL;
} else if (pr->pr_uref == 0) {
if (!(flags & JAIL_DYING)) {
mtx_unlock(&pr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "jail %d is dying",
jid);
goto done_unlock_list;
} else if ((flags & JAIL_ATTACH) ||
(pr_flags & PR_PERSIST)) {
/*
* A dying jail might be resurrected
* (via attach or persist), but first
* it must determine if another jail
* has claimed its name. Accomplish
* this by implicitly re-setting the
* name.
*/
if (name == NULL)
name = prison_name(mypr, pr);
}
}
}
if (pr == NULL) {
/* Update: jid must exist. */
if (cuflags == JAIL_UPDATE) {
error = ENOENT;
vfs_opterror(opts, "jail %d not found", jid);
goto done_unlock_list;
}
}
}
/*
* If the caller provided a name, look for a jail by that name.
* This has different semantics for creates and updates keyed by jid
* (where the name must not already exist in a different jail),
* and updates keyed by the name itself (where the name must exist
* because that is the jail being updated).
*/
if (name != NULL) {
namelc = strrchr(name, '.');
if (namelc == NULL)
namelc = name;
else {
/*
* This is a hierarchical name. Split it into the
* parent and child names, and make sure the parent
* exists or matches an already found jail.
*/
*namelc = '\0';
if (pr != NULL) {
if (strncmp(name, ppr->pr_name, namelc - name)
|| ppr->pr_name[namelc - name] != '\0') {
mtx_unlock(&pr->pr_mtx);
error = EINVAL;
vfs_opterror(opts,
"cannot change jail's parent");
goto done_unlock_list;
}
} else {
ppr = prison_find_name(mypr, name);
if (ppr == NULL) {
error = ENOENT;
vfs_opterror(opts,
"jail \"%s\" not found", name);
goto done_unlock_list;
}
mtx_unlock(&ppr->pr_mtx);
}
name = ++namelc;
}
if (name[0] != '\0') {
namelen =
(ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
name_again:
deadpr = NULL;
FOREACH_PRISON_CHILD(ppr, tpr) {
if (tpr != pr && tpr->pr_ref > 0 &&
!strcmp(tpr->pr_name + namelen, name)) {
if (pr == NULL &&
cuflags != JAIL_CREATE) {
mtx_lock(&tpr->pr_mtx);
if (tpr->pr_ref > 0) {
/*
* Use this jail
* for updates.
*/
if (tpr->pr_uref > 0) {
pr = tpr;
break;
}
deadpr = tpr;
}
mtx_unlock(&tpr->pr_mtx);
} else if (tpr->pr_uref > 0) {
/*
* Create, or update(jid):
* name must not exist in an
* active sibling jail.
*/
error = EEXIST;
if (pr != NULL)
mtx_unlock(&pr->pr_mtx);
vfs_opterror(opts,
"jail \"%s\" already exists",
name);
goto done_unlock_list;
}
}
}
/* If no active jail is found, use a dying one. */
if (deadpr != NULL && pr == NULL) {
if (flags & JAIL_DYING) {
mtx_lock(&deadpr->pr_mtx);
if (deadpr->pr_ref == 0) {
mtx_unlock(&deadpr->pr_mtx);
goto name_again;
}
pr = deadpr;
} else if (cuflags == JAIL_UPDATE) {
error = ENOENT;
vfs_opterror(opts,
"jail \"%s\" is dying", name);
goto done_unlock_list;
}
}
/* Update: name must exist if no jid. */
else if (cuflags == JAIL_UPDATE && pr == NULL) {
error = ENOENT;
vfs_opterror(opts, "jail \"%s\" not found",
name);
goto done_unlock_list;
}
}
}
/* Update: must provide a jid or name. */
else if (cuflags == JAIL_UPDATE && pr == NULL) {
error = ENOENT;
vfs_opterror(opts, "update specified no jail");
goto done_unlock_list;
}
/* If there's no prison to update, create a new one and link it in. */
if (pr == NULL) {
for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
if (tpr->pr_childcount >= tpr->pr_childmax) {
error = EPERM;
vfs_opterror(opts, "prison limit exceeded");
goto done_unlock_list;
}
created = 1;
mtx_lock(&ppr->pr_mtx);
if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
mtx_unlock(&ppr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "parent jail went away!");
goto done_unlock_list;
}
ppr->pr_ref++;
ppr->pr_uref++;
mtx_unlock(&ppr->pr_mtx);
pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
if (jid == 0) {
/* Find the next free jid. */
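/*
* Walk the sorted allprison list starting at lastprid + 1, looking
* for a gap (or a slot held only by a dead prison); wrap around at
* JAIL_MAX and give up with EAGAIN once the search comes back to
* lastprid.
*/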
jid = lastprid + 1;
findnext:
if (jid == JAIL_MAX)
jid = 1;
TAILQ_FOREACH(tpr, &allprison, pr_list) {
if (tpr->pr_id < jid)
continue;
if (tpr->pr_id > jid || tpr->pr_ref == 0) {
TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
break;
}
if (jid == lastprid) {
error = EAGAIN;
vfs_opterror(opts,
"no available jail IDs");
free(pr, M_PRISON);
prison_deref(ppr, PD_DEREF |
PD_DEUREF | PD_LIST_XLOCKED);
goto done_releroot;
}
jid++;
goto findnext;
}
lastprid = jid;
} else {
/*
* A specific jid was requested and verified above not to
* exist yet, so just find where to insert it.
*/
TAILQ_FOREACH(tpr, &allprison, pr_list)
if (tpr->pr_id >= jid) {
TAILQ_INSERT_BEFORE(tpr, pr, pr_list);
break;
}
}
if (tpr == NULL)
TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
tpr->pr_childcount++;
pr->pr_parent = ppr;
pr->pr_id = jid;
/* Set some default values, and inherit some from the parent. */
if (name == NULL)
name = "";
if (path == NULL) {
path = "/";
root = mypr->pr_root;
vref(root);
}
strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
pr->pr_flags |= PR_HOST;
#if defined(INET) || defined(INET6)
#ifdef VIMAGE
if (!(pr_flags & PR_VNET))
#endif
{
#ifdef INET
if (!(ch_flags & PR_IP4_USER))
pr->pr_flags |=
PR_IP4 | PR_IP4_USER | PR_IP4_DISABLE;
else if (!(pr_flags & PR_IP4_USER)) {
pr->pr_flags |= ppr->pr_flags & PR_IP4;
if (ppr->pr_ip4 != NULL) {
pr->pr_ip4s = ppr->pr_ip4s;
pr->pr_ip4 = malloc(pr->pr_ip4s *
sizeof(struct in_addr), M_PRISON,
M_WAITOK);
bcopy(ppr->pr_ip4, pr->pr_ip4,
pr->pr_ip4s * sizeof(*pr->pr_ip4));
}
}
#endif
#ifdef INET6
if (!(ch_flags & PR_IP6_USER))
pr->pr_flags |=
PR_IP6 | PR_IP6_USER | PR_IP6_DISABLE;
else if (!(pr_flags & PR_IP6_USER)) {
pr->pr_flags |= ppr->pr_flags & PR_IP6;
if (ppr->pr_ip6 != NULL) {
pr->pr_ip6s = ppr->pr_ip6s;
pr->pr_ip6 = malloc(pr->pr_ip6s *
sizeof(struct in6_addr), M_PRISON,
M_WAITOK);
bcopy(ppr->pr_ip6, pr->pr_ip6,
pr->pr_ip6s * sizeof(*pr->pr_ip6));
}
}
#endif
}
#endif
/* Source address selection is always on by default. */
pr->pr_flags |= _PR_IP_SADDRSEL;
pr->pr_securelevel = ppr->pr_securelevel;
pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
LIST_INIT(&pr->pr_children);
mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
#ifdef VIMAGE
/* Allocate a new vnet if specified. */
pr->pr_vnet = (pr_flags & PR_VNET)
? vnet_alloc() : ppr->pr_vnet;
#endif
/*
* Allocate a dedicated cpuset for each jail.
* Unlike other initial settings, this may return an error.
*/
error = cpuset_create_root(ppr, &pr->pr_cpuset);
if (error) {
prison_deref(pr, PD_LIST_XLOCKED);
goto done_releroot;
}
mtx_lock(&pr->pr_mtx);
/*
* New prisons do not yet have a reference, because we do not
* want others to see the incomplete prison once the
* allprison_lock is downgraded.
*/
} else {
created = 0;
/*
* Grab a reference for existing prisons, to ensure they
* continue to exist for the duration of the call.
*/
pr->pr_ref++;
#if defined(VIMAGE) && (defined(INET) || defined(INET6))
if ((pr->pr_flags & PR_VNET) &&
(ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
error = EINVAL;
vfs_opterror(opts,
"vnet jails cannot have IP address restrictions");
goto done_deref_locked;
}
#endif
#ifdef INET
if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
error = EINVAL;
vfs_opterror(opts,
"ip4 cannot be changed after creation");
goto done_deref_locked;
}
#endif
#ifdef INET6
if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
error = EINVAL;
vfs_opterror(opts,
"ip6 cannot be changed after creation");
goto done_deref_locked;
}
#endif
}
/* Do final error checking before setting anything. */
if (gotslevel) {
if (slevel < ppr->pr_securelevel) {
error = EPERM;
goto done_deref_locked;
}
}
if (gotchildmax) {
if (childmax >= ppr->pr_childmax) {
error = EPERM;
goto done_deref_locked;
}
}
if (gotenforce) {
if (enforce < ppr->pr_enforce_statfs) {
error = EPERM;
goto done_deref_locked;
}
}
#ifdef INET
if (ip4s > 0) {
if (ppr->pr_flags & PR_IP4) {
/*
* Make sure the new set of IP addresses is a
* subset of the parent's list. Don't worry
* about the parent being unlocked, as any
* setting is done with allprison_lock held.
*/
for (ij = 0; ij < ppr->pr_ip4s; ij++)
if (ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
break;
if (ij == ppr->pr_ip4s) {
error = EPERM;
goto done_deref_locked;
}
if (ip4s > 1) {
for (ii = ij = 1; ii < ip4s; ii++) {
if (ip4[ii].s_addr ==
ppr->pr_ip4[0].s_addr)
continue;
for (; ij < ppr->pr_ip4s; ij++)
if (ip4[ii].s_addr ==
ppr->pr_ip4[ij].s_addr)
break;
if (ij == ppr->pr_ip4s)
break;
}
if (ij == ppr->pr_ip4s) {
error = EPERM;
goto done_deref_locked;
}
}
}
/*
* Check for conflicting IP addresses. We permit them
* if there is no more than one IP on each jail. If
* there is a duplicate on a jail with more than one
* IP, stop checking and return an error.
*/
tppr = ppr;
#ifdef VIMAGE
for (; tppr != &prison0; tppr = tppr->pr_parent)
if (tppr->pr_flags & PR_VNET)
break;
#endif
FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
if (tpr == pr ||
#ifdef VIMAGE
(tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
#endif
tpr->pr_uref == 0) {
descend = 0;
continue;
}
if (!(tpr->pr_flags & PR_IP4_USER))
continue;
descend = 0;
if (tpr->pr_ip4 == NULL ||
(ip4s == 1 && tpr->pr_ip4s == 1))
continue;
for (ii = 0; ii < ip4s; ii++) {
if (_prison_check_ip4(tpr, &ip4[ii]) == 0) {
error = EADDRINUSE;
vfs_opterror(opts,
"IPv4 addresses clash");
goto done_deref_locked;
}
}
}
}
#endif
#ifdef INET6
if (ip6s > 0) {
if (ppr->pr_flags & PR_IP6) {
/*
* Make sure the new set of IP addresses is a
* subset of the parent's list.
*/
for (ij = 0; ij < ppr->pr_ip6s; ij++)
if (IN6_ARE_ADDR_EQUAL(&ip6[0],
&ppr->pr_ip6[ij]))
break;
if (ij == ppr->pr_ip6s) {
error = EPERM;
goto done_deref_locked;
}
if (ip6s > 1) {
for (ii = ij = 1; ii < ip6s; ii++) {
if (IN6_ARE_ADDR_EQUAL(&ip6[ii],
&ppr->pr_ip6[0]))
continue;
for (; ij < ppr->pr_ip6s; ij++)
if (IN6_ARE_ADDR_EQUAL(
&ip6[ii], &ppr->pr_ip6[ij]))
break;
if (ij == ppr->pr_ip6s)
break;
}
if (ij == ppr->pr_ip6s) {
error = EPERM;
goto done_deref_locked;
}
}
}
/* Check for conflicting IP addresses. */
tppr = ppr;
#ifdef VIMAGE
for (; tppr != &prison0; tppr = tppr->pr_parent)
if (tppr->pr_flags & PR_VNET)
break;
#endif
FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
if (tpr == pr ||
#ifdef VIMAGE
(tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
#endif
tpr->pr_uref == 0) {
descend = 0;
continue;
}
if (!(tpr->pr_flags & PR_IP6_USER))
continue;
descend = 0;
if (tpr->pr_ip6 == NULL ||
(ip6s == 1 && tpr->pr_ip6s == 1))
continue;
for (ii = 0; ii < ip6s; ii++) {
if (_prison_check_ip6(tpr, &ip6[ii]) == 0) {
error = EADDRINUSE;
vfs_opterror(opts,
"IPv6 addresses clash");
goto done_deref_locked;
}
}
}
}
#endif
onamelen = namelen = 0;
if (name != NULL) {
/* Give a default name of the jid. */
if (name[0] == '\0')
snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
else if (*namelc == '0' || (strtoul(namelc, &p, 10) != jid &&
*p == '\0')) {
error = EINVAL;
vfs_opterror(opts,
"name cannot be numeric (unless it is the jid)");
goto done_deref_locked;
}
/*
* Make sure the name isn't too long for the prison or its
* children.
*/
onamelen = strlen(pr->pr_name);
namelen = strlen(name);
if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
error = ENAMETOOLONG;
goto done_deref_locked;
}
FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
if (strlen(tpr->pr_name) + (namelen - onamelen) >=
sizeof(pr->pr_name)) {
error = ENAMETOOLONG;
goto done_deref_locked;
}
}
}
if (pr_allow & ~ppr->pr_allow) {
error = EPERM;
goto done_deref_locked;
}
/* Set the parameters of the prison. */
#ifdef INET
redo_ip4 = 0;
if (pr_flags & PR_IP4_USER) {
pr->pr_flags |= PR_IP4;
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4s = ip4s;
pr->pr_ip4 = ip4;
ip4 = NULL;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip4(tpr, NULL)) {
redo_ip4 = 1;
descend = 0;
}
}
}
#endif
#ifdef INET6
redo_ip6 = 0;
if (pr_flags & PR_IP6_USER) {
pr->pr_flags |= PR_IP6;
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6s = ip6s;
pr->pr_ip6 = ip6;
ip6 = NULL;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip6(tpr, NULL)) {
redo_ip6 = 1;
descend = 0;
}
}
}
#endif
if (gotslevel) {
pr->pr_securelevel = slevel;
/* Set all child jails to be at least this level. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
if (tpr->pr_securelevel < slevel)
tpr->pr_securelevel = slevel;
}
if (gotchildmax) {
pr->pr_childmax = childmax;
/* Set all child jails to under this limit. */
FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
if (tpr->pr_childmax > childmax - level)
tpr->pr_childmax = childmax > level
? childmax - level : 0;
}
if (gotenforce) {
pr->pr_enforce_statfs = enforce;
/* Pass this restriction on to the children. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
if (tpr->pr_enforce_statfs < enforce)
tpr->pr_enforce_statfs = enforce;
}
if (name != NULL) {
if (ppr == &prison0)
strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
else
snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
ppr->pr_name, name);
/* Change this component of child names. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
strlen(tpr->pr_name + onamelen) + 1);
bcopy(pr->pr_name, tpr->pr_name, namelen);
}
}
if (path != NULL) {
/* Try to keep a real-rooted full pathname. */
if (path[0] == '/' && strcmp(mypr->pr_path, "/"))
snprintf(pr->pr_path, sizeof(pr->pr_path), "%s%s",
mypr->pr_path, path);
else
strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
pr->pr_root = root;
}
if (PR_HOST & ch_flags & ~pr_flags) {
if (pr->pr_flags & PR_HOST) {
/*
* Copy the parent's host info. As with pr_ip4 above,
* the lack of a lock on the parent is not a problem;
* it is always set with allprison_lock at least
* shared, and is held exclusively here.
*/
strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
sizeof(pr->pr_hostname));
strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
sizeof(pr->pr_domainname));
strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
sizeof(pr->pr_hostuuid));
pr->pr_hostid = pr->pr_parent->pr_hostid;
}
} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
/* Set this prison, and any descendants without PR_HOST. */
if (host != NULL)
strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
if (domain != NULL)
strlcpy(pr->pr_domainname, domain,
sizeof(pr->pr_domainname));
if (uuid != NULL)
strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
if (gothid)
pr->pr_hostid = hid;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
if (tpr->pr_flags & PR_HOST)
descend = 0;
else {
if (host != NULL)
strlcpy(tpr->pr_hostname,
pr->pr_hostname,
sizeof(tpr->pr_hostname));
if (domain != NULL)
strlcpy(tpr->pr_domainname,
pr->pr_domainname,
sizeof(tpr->pr_domainname));
if (uuid != NULL)
strlcpy(tpr->pr_hostuuid,
pr->pr_hostuuid,
sizeof(tpr->pr_hostuuid));
if (gothid)
tpr->pr_hostid = hid;
}
}
}
if ((tallow = ch_allow & ~pr_allow)) {
/* Clear allow bits in all children. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
tpr->pr_allow &= ~tallow;
}
pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
/*
* Persistent prisons get an extra reference, and prisons losing their
* persist flag lose that reference. Only do this for existing prisons
* for now, so new ones will remain unseen until after the module
* handlers have completed.
*/
if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
if (pr_flags & PR_PERSIST) {
pr->pr_ref++;
pr->pr_uref++;
} else {
pr->pr_ref--;
pr->pr_uref--;
}
}
pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
mtx_unlock(&pr->pr_mtx);
#ifdef RACCT
if (created)
prison_racct_attach(pr);
#endif
/*
* Locks may have prevented a complete restriction of child IP
* addresses. If so, allocate some more memory and try again.
*/
#ifdef INET
while (redo_ip4) {
ip4s = pr->pr_ip4s;
ip4 = malloc(ip4s * sizeof(*ip4), M_PRISON, M_WAITOK);
mtx_lock(&pr->pr_mtx);
redo_ip4 = 0;
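/*
* prison_restrict_ip4() consumes the preallocated buffer the first
* time it needs one; if a later descendant also needs a replacement
* list, redo_ip4 is set and the outer loop allocates another buffer.
*/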
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip4(tpr, ip4)) {
if (ip4 != NULL)
ip4 = NULL;
else
redo_ip4 = 1;
}
}
mtx_unlock(&pr->pr_mtx);
}
#endif
#ifdef INET6
while (redo_ip6) {
ip6s = pr->pr_ip6s;
ip6 = malloc(ip6s * sizeof(*ip6), M_PRISON, M_WAITOK);
mtx_lock(&pr->pr_mtx);
redo_ip6 = 0;
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
#ifdef VIMAGE
if (tpr->pr_flags & PR_VNET) {
descend = 0;
continue;
}
#endif
if (prison_restrict_ip6(tpr, ip6)) {
if (ip6 != NULL)
ip6 = NULL;
else
redo_ip6 = 1;
}
}
mtx_unlock(&pr->pr_mtx);
}
#endif
/* Let the modules do their work. */
sx_downgrade(&allprison_lock);
if (created) {
error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
if (error) {
prison_deref(pr, PD_LIST_SLOCKED);
goto done_errmsg;
}
}
error = osd_jail_call(pr, PR_METHOD_SET, opts);
if (error) {
prison_deref(pr, created
? PD_LIST_SLOCKED
: PD_DEREF | PD_LIST_SLOCKED);
goto done_errmsg;
}
/* Attach this process to the prison if requested. */
if (flags & JAIL_ATTACH) {
mtx_lock(&pr->pr_mtx);
error = do_jail_attach(td, pr);
if (error) {
vfs_opterror(opts, "attach failed");
if (!created)
prison_deref(pr, PD_DEREF);
goto done_errmsg;
}
}
/*
* Now that it is all there, drop the temporary reference from existing
* prisons. Or add a reference to newly created persistent prisons
* (which was not done earlier so that the prison would not be publicly
* visible).
*/
if (!created) {
prison_deref(pr, (flags & JAIL_ATTACH)
? PD_DEREF
: PD_DEREF | PD_LIST_SLOCKED);
} else {
if (pr_flags & PR_PERSIST) {
mtx_lock(&pr->pr_mtx);
pr->pr_ref++;
pr->pr_uref++;
mtx_unlock(&pr->pr_mtx);
}
if (!(flags & JAIL_ATTACH))
sx_sunlock(&allprison_lock);
}
td->td_retval[0] = pr->pr_id;
goto done_errmsg;
done_deref_locked:
prison_deref(pr, created
? PD_LOCKED | PD_LIST_XLOCKED
: PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
goto done_releroot;
done_unlock_list:
sx_xunlock(&allprison_lock);
done_releroot:
if (root != NULL) {
vfslocked = VFS_LOCK_GIANT(root->v_mount);
vrele(root);
VFS_UNLOCK_GIANT(vfslocked);
}
done_errmsg:
if (error) {
vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
if (errmsg_len > 0) {
errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
if (errmsg_pos > 0) {
if (optuio->uio_segflg == UIO_SYSSPACE)
bcopy(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
else
copyout(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
}
}
}
done_free:
#ifdef INET
free(ip4, M_PRISON);
#endif
#ifdef INET6
free(ip6, M_PRISON);
#endif
vfs_freeopts(opts);
return (error);
}
/*
* struct jail_get_args {
* struct iovec *iovp;
* unsigned int iovcnt;
* int flags;
* };
*/
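/*
* Illustrative sketch only (not part of this file; buffer name and
* size are arbitrary, and casts from string literals are omitted for
* brevity): a userspace caller asking for a jail's name by jid would
* pass name/value iovec pairs, e.g.
*
*	int jid = 1;
*	char namebuf[256];
*	struct iovec iov[4] = {
*		{ "jid", sizeof("jid") },	{ &jid, sizeof(jid) },
*		{ "name", sizeof("name") },	{ namebuf, sizeof(namebuf) },
*	};
*	(void)jail_get(iov, 4, 0);
*
* Even-indexed iovecs name parameters and the following odd-indexed
* iovecs hold their value buffers, which is why iovcnt must be even
* and why values land at index 2 * pos + 1 below.
*/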
int
-jail_get(struct thread *td, struct jail_get_args *uap)
+sys_jail_get(struct thread *td, struct jail_get_args *uap)
{
struct uio *auio;
int error;
/* Check that we have an even number of iovecs. */
if (uap->iovcnt & 1)
return (EINVAL);
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_jail_get(td, auio, uap->flags);
if (error == 0)
error = copyout(auio->uio_iov, uap->iovp,
uap->iovcnt * sizeof (struct iovec));
free(auio, M_IOV);
return (error);
}
int
kern_jail_get(struct thread *td, struct uio *optuio, int flags)
{
struct prison *pr, *mypr;
struct vfsopt *opt;
struct vfsoptlist *opts;
char *errmsg, *name;
int error, errmsg_len, errmsg_pos, fi, i, jid, len, locked, pos;
if (flags & ~JAIL_GET_MASK)
return (EINVAL);
/* Get the parameter list. */
error = vfs_buildopts(optuio, &opts);
if (error)
return (error);
errmsg_pos = vfs_getopt_pos(opts, "errmsg");
mypr = td->td_ucred->cr_prison;
/*
* Find the prison specified by one of: lastjid, jid, name.
*/
sx_slock(&allprison_lock);
error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
if (error == 0) {
TAILQ_FOREACH(pr, &allprison, pr_list) {
if (pr->pr_id > jid && prison_ischild(mypr, pr)) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0 &&
(pr->pr_uref > 0 || (flags & JAIL_DYING)))
break;
mtx_unlock(&pr->pr_mtx);
}
}
if (pr != NULL)
goto found_prison;
error = ENOENT;
vfs_opterror(opts, "no jail after %d", jid);
goto done_unlock_list;
} else if (error != ENOENT)
goto done_unlock_list;
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
if (error == 0) {
if (jid != 0) {
pr = prison_find_child(mypr, jid);
if (pr != NULL) {
if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
mtx_unlock(&pr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "jail %d is dying",
jid);
goto done_unlock_list;
}
goto found_prison;
}
error = ENOENT;
vfs_opterror(opts, "jail %d not found", jid);
goto done_unlock_list;
}
} else if (error != ENOENT)
goto done_unlock_list;
error = vfs_getopt(opts, "name", (void **)&name, &len);
if (error == 0) {
if (len == 0 || name[len - 1] != '\0') {
error = EINVAL;
goto done_unlock_list;
}
pr = prison_find_name(mypr, name);
if (pr != NULL) {
if (pr->pr_uref == 0 && !(flags & JAIL_DYING)) {
mtx_unlock(&pr->pr_mtx);
error = ENOENT;
vfs_opterror(opts, "jail \"%s\" is dying",
name);
goto done_unlock_list;
}
goto found_prison;
}
error = ENOENT;
vfs_opterror(opts, "jail \"%s\" not found", name);
goto done_unlock_list;
} else if (error != ENOENT)
goto done_unlock_list;
vfs_opterror(opts, "no jail specified");
error = ENOENT;
goto done_unlock_list;
found_prison:
/* Get the parameters of the prison. */
pr->pr_ref++;
locked = PD_LOCKED;
td->td_retval[0] = pr->pr_id;
error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
if (error != 0 && error != ENOENT)
goto done_deref;
i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
error = vfs_setopt(opts, "parent", &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "name", prison_name(mypr, pr));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
sizeof(pr->pr_cpuset->cs_id));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "path", prison_path(mypr, pr));
if (error != 0 && error != ENOENT)
goto done_deref;
#ifdef INET
error = vfs_setopt_part(opts, "ip4.addr", pr->pr_ip4,
pr->pr_ip4s * sizeof(*pr->pr_ip4));
if (error != 0 && error != ENOENT)
goto done_deref;
#endif
#ifdef INET6
error = vfs_setopt_part(opts, "ip6.addr", pr->pr_ip6,
pr->pr_ip6s * sizeof(*pr->pr_ip6));
if (error != 0 && error != ENOENT)
goto done_deref;
#endif
error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
sizeof(pr->pr_securelevel));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
sizeof(pr->pr_childcount));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
sizeof(pr->pr_childmax));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
if (error != 0 && error != ENOENT)
goto done_deref;
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
uint32_t hid32 = pr->pr_hostid;
error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
} else
#endif
error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
sizeof(pr->pr_hostid));
if (error != 0 && error != ENOENT)
goto done_deref;
error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
sizeof(pr->pr_enforce_statfs));
if (error != 0 && error != ENOENT)
goto done_deref;
for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
fi++) {
if (pr_flag_names[fi] == NULL)
continue;
i = (pr->pr_flags & (1 << fi)) ? 1 : 0;
error = vfs_setopt(opts, pr_flag_names[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
i = !i;
error = vfs_setopt(opts, pr_flag_nonames[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
}
for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
fi++) {
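/*
* Map the disable/new flag bits of each jailsys-style parameter
* back to JAIL_SYS_DISABLE, JAIL_SYS_NEW, or JAIL_SYS_INHERIT for
* the caller.
*/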
i = pr->pr_flags &
(pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
i = pr_flag_jailsys[fi].disable &&
(i == pr_flag_jailsys[fi].disable) ? JAIL_SYS_DISABLE
: (i == pr_flag_jailsys[fi].new) ? JAIL_SYS_NEW
: JAIL_SYS_INHERIT;
error =
vfs_setopt(opts, pr_flag_jailsys[fi].name, &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
}
for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
fi++) {
if (pr_allow_names[fi] == NULL)
continue;
i = (pr->pr_allow & (1 << fi)) ? 1 : 0;
error = vfs_setopt(opts, pr_allow_names[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
i = !i;
error = vfs_setopt(opts, pr_allow_nonames[fi], &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
}
i = (pr->pr_uref == 0);
error = vfs_setopt(opts, "dying", &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
i = !i;
error = vfs_setopt(opts, "nodying", &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
/* Get the module parameters. */
mtx_unlock(&pr->pr_mtx);
locked = 0;
error = osd_jail_call(pr, PR_METHOD_GET, opts);
if (error)
goto done_deref;
prison_deref(pr, PD_DEREF | PD_LIST_SLOCKED);
/* By now, all parameters should have been noted. */
TAILQ_FOREACH(opt, opts, link) {
if (!opt->seen && strcmp(opt->name, "errmsg")) {
error = EINVAL;
vfs_opterror(opts, "unknown parameter: %s", opt->name);
goto done_errmsg;
}
}
/* Write the fetched parameters back to userspace. */
error = 0;
TAILQ_FOREACH(opt, opts, link) {
if (opt->pos >= 0 && opt->pos != errmsg_pos) {
pos = 2 * opt->pos + 1;
optuio->uio_iov[pos].iov_len = opt->len;
if (opt->value != NULL) {
if (optuio->uio_segflg == UIO_SYSSPACE) {
bcopy(opt->value,
optuio->uio_iov[pos].iov_base,
opt->len);
} else {
error = copyout(opt->value,
optuio->uio_iov[pos].iov_base,
opt->len);
if (error)
break;
}
}
}
}
goto done_errmsg;
done_deref:
prison_deref(pr, locked | PD_DEREF | PD_LIST_SLOCKED);
goto done_errmsg;
done_unlock_list:
sx_sunlock(&allprison_lock);
done_errmsg:
if (error && errmsg_pos >= 0) {
vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
errmsg_pos = 2 * errmsg_pos + 1;
if (errmsg_len > 0) {
if (optuio->uio_segflg == UIO_SYSSPACE)
bcopy(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
else
copyout(errmsg,
optuio->uio_iov[errmsg_pos].iov_base,
errmsg_len);
}
}
vfs_freeopts(opts);
return (error);
}
/*
* struct jail_remove_args {
* int jid;
* };
*/
int
-jail_remove(struct thread *td, struct jail_remove_args *uap)
+sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
{
struct prison *pr, *cpr, *lpr, *tpr;
int descend, error;
error = priv_check(td, PRIV_JAIL_REMOVE);
if (error)
return (error);
sx_xlock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
if (pr == NULL) {
sx_xunlock(&allprison_lock);
return (EINVAL);
}
/* Remove all descendants of this prison, then remove this prison. */
pr->pr_ref++;
pr->pr_flags |= PR_REMOVE;
if (!LIST_EMPTY(&pr->pr_children)) {
mtx_unlock(&pr->pr_mtx);
lpr = NULL;
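/*
* Each child is marked and referenced as it is visited, but its
* actual removal is deferred one iteration (via lpr) so that
* prison_remove_one(), which drops allprison_lock and may free the
* prison, never runs on the node the traversal is standing on.
*/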
FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
mtx_lock(&cpr->pr_mtx);
if (cpr->pr_ref > 0) {
tpr = cpr;
cpr->pr_ref++;
cpr->pr_flags |= PR_REMOVE;
} else {
/* Already removed - do not do it again. */
tpr = NULL;
}
mtx_unlock(&cpr->pr_mtx);
if (lpr != NULL) {
mtx_lock(&lpr->pr_mtx);
prison_remove_one(lpr);
sx_xlock(&allprison_lock);
}
lpr = tpr;
}
if (lpr != NULL) {
mtx_lock(&lpr->pr_mtx);
prison_remove_one(lpr);
sx_xlock(&allprison_lock);
}
mtx_lock(&pr->pr_mtx);
}
prison_remove_one(pr);
return (0);
}
static void
prison_remove_one(struct prison *pr)
{
struct proc *p;
int deuref;
/* If the prison was persistent, it is not anymore. */
deuref = 0;
if (pr->pr_flags & PR_PERSIST) {
pr->pr_ref--;
deuref = PD_DEUREF;
pr->pr_flags &= ~PR_PERSIST;
}
/*
* jail_remove added a reference. If that's the only one, remove
* the prison now.
*/
KASSERT(pr->pr_ref > 0,
("prison_remove_one removing a dead prison (jid=%d)", pr->pr_id));
if (pr->pr_ref == 1) {
prison_deref(pr,
deuref | PD_DEREF | PD_LOCKED | PD_LIST_XLOCKED);
return;
}
mtx_unlock(&pr->pr_mtx);
sx_xunlock(&allprison_lock);
/*
* Kill all processes unfortunate enough to be attached to this prison.
*/
sx_slock(&allproc_lock);
LIST_FOREACH(p, &allproc, p_list) {
PROC_LOCK(p);
if (p->p_state != PRS_NEW && p->p_ucred &&
p->p_ucred->cr_prison == pr)
- psignal(p, SIGKILL);
+ kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
/* Remove the temporary reference added by jail_remove. */
prison_deref(pr, deuref | PD_DEREF);
}
/*
* struct jail_attach_args {
* int jid;
* };
*/
int
-jail_attach(struct thread *td, struct jail_attach_args *uap)
+sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
{
struct prison *pr;
int error;
error = priv_check(td, PRIV_JAIL_ATTACH);
if (error)
return (error);
sx_slock(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
if (pr == NULL) {
sx_sunlock(&allprison_lock);
return (EINVAL);
}
/*
* Do not allow a process to attach to a prison that is not
* considered to be "alive".
*/
if (pr->pr_uref == 0) {
mtx_unlock(&pr->pr_mtx);
sx_sunlock(&allprison_lock);
return (EINVAL);
}
return (do_jail_attach(td, pr));
}
static int
do_jail_attach(struct thread *td, struct prison *pr)
{
struct prison *ppr;
struct proc *p;
struct ucred *newcred, *oldcred;
int vfslocked, error;
/*
* XXX: Note that there is a slight race here if two threads
* in the same privileged process attempt to attach to two
* different jails at the same time. It is important for
* user processes not to do this, or they might end up with
* a process root from one prison, but attached to the jail
* of another.
*/
pr->pr_ref++;
pr->pr_uref++;
mtx_unlock(&pr->pr_mtx);
/* Let modules do whatever they need to prepare for attaching. */
error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
if (error) {
prison_deref(pr, PD_DEREF | PD_DEUREF | PD_LIST_SLOCKED);
return (error);
}
sx_sunlock(&allprison_lock);
/*
* Reparent the newly attached process to this jail.
*/
ppr = td->td_ucred->cr_prison;
p = td->td_proc;
error = cpuset_setproc_update_set(p, pr->pr_cpuset);
if (error)
goto e_revert_osd;
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
if ((error = change_dir(pr->pr_root, td)) != 0)
goto e_unlock;
#ifdef MAC
if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
goto e_unlock;
#endif
VOP_UNLOCK(pr->pr_root, 0);
if ((error = change_root(pr->pr_root, td)))
goto e_unlock_giant;
VFS_UNLOCK_GIANT(vfslocked);
newcred = crget();
PROC_LOCK(p);
oldcred = p->p_ucred;
setsugid(p);
crcopy(newcred, oldcred);
newcred->cr_prison = pr;
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
crfree(oldcred);
prison_deref(ppr, PD_DEREF | PD_DEUREF);
return (0);
e_unlock:
VOP_UNLOCK(pr->pr_root, 0);
e_unlock_giant:
VFS_UNLOCK_GIANT(vfslocked);
e_revert_osd:
/* Tell modules this thread is still in its old jail after all. */
(void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
prison_deref(pr, PD_DEREF | PD_DEUREF);
return (error);
}
/*
* Returns a locked prison instance, or NULL on failure.
*/
struct prison *
prison_find(int prid)
{
struct prison *pr;
sx_assert(&allprison_lock, SX_LOCKED);
TAILQ_FOREACH(pr, &allprison, pr_list) {
if (pr->pr_id == prid) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0)
return (pr);
mtx_unlock(&pr->pr_mtx);
}
}
return (NULL);
}
/*
* Find a prison that is a descendant of mypr. Returns a locked prison or NULL.
*/
struct prison *
prison_find_child(struct prison *mypr, int prid)
{
struct prison *pr;
int descend;
sx_assert(&allprison_lock, SX_LOCKED);
FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
if (pr->pr_id == prid) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0)
return (pr);
mtx_unlock(&pr->pr_mtx);
}
}
return (NULL);
}
/*
* Look for the name relative to mypr. Returns a locked prison or NULL.
*/
struct prison *
prison_find_name(struct prison *mypr, const char *name)
{
struct prison *pr, *deadpr;
size_t mylen;
int descend;
sx_assert(&allprison_lock, SX_LOCKED);
mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
again:
deadpr = NULL;
FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
if (!strcmp(pr->pr_name + mylen, name)) {
mtx_lock(&pr->pr_mtx);
if (pr->pr_ref > 0) {
if (pr->pr_uref > 0)
return (pr);
deadpr = pr;
}
mtx_unlock(&pr->pr_mtx);
}
}
/* There was no valid prison - perhaps there was a dying one. */
if (deadpr != NULL) {
mtx_lock(&deadpr->pr_mtx);
if (deadpr->pr_ref == 0) {
mtx_unlock(&deadpr->pr_mtx);
goto again;
}
}
return (deadpr);
}
/*
* See if a prison has the specific flag set.
*/
int
prison_flag(struct ucred *cred, unsigned flag)
{
/* This is an atomic read, so no locking is necessary. */
return (cred->cr_prison->pr_flags & flag);
}
int
prison_allow(struct ucred *cred, unsigned flag)
{
/* This is an atomic read, so no locking is necessary. */
return (cred->cr_prison->pr_allow & flag);
}
/*
* Remove a prison reference. If that was the last reference, remove the
* prison itself - but not in this context in case there are locks held.
*/
void
prison_free_locked(struct prison *pr)
{
mtx_assert(&pr->pr_mtx, MA_OWNED);
pr->pr_ref--;
if (pr->pr_ref == 0) {
mtx_unlock(&pr->pr_mtx);
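/*
* Defer the final teardown to the thread taskqueue; prison_complete()
* calls prison_deref() from a context where it is safe to sleep and
* take allprison_lock.
*/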
TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
return;
}
mtx_unlock(&pr->pr_mtx);
}
void
prison_free(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
prison_free_locked(pr);
}
static void
prison_complete(void *context, int pending)
{
prison_deref((struct prison *)context, 0);
}
/*
* Remove a prison reference (usually). This internal version assumes no
* mutexes are held, except perhaps the prison itself. If there are no more
* references, release and delist the prison. On completion, the prison lock
* and the allprison lock are both unlocked.
*/
static void
prison_deref(struct prison *pr, int flags)
{
struct prison *ppr, *tpr;
int vfslocked;
if (!(flags & PD_LOCKED))
mtx_lock(&pr->pr_mtx);
for (;;) {
if (flags & PD_DEUREF) {
pr->pr_uref--;
KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
}
if (flags & PD_DEREF)
pr->pr_ref--;
/* If the prison still has references, nothing else to do. */
if (pr->pr_ref > 0) {
mtx_unlock(&pr->pr_mtx);
if (flags & PD_LIST_SLOCKED)
sx_sunlock(&allprison_lock);
else if (flags & PD_LIST_XLOCKED)
sx_xunlock(&allprison_lock);
return;
}
mtx_unlock(&pr->pr_mtx);
if (flags & PD_LIST_SLOCKED) {
if (!sx_try_upgrade(&allprison_lock)) {
sx_sunlock(&allprison_lock);
sx_xlock(&allprison_lock);
}
} else if (!(flags & PD_LIST_XLOCKED))
sx_xlock(&allprison_lock);
TAILQ_REMOVE(&allprison, pr, pr_list);
LIST_REMOVE(pr, pr_sibling);
ppr = pr->pr_parent;
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
tpr->pr_childcount--;
sx_xunlock(&allprison_lock);
#ifdef VIMAGE
if (pr->pr_vnet != ppr->pr_vnet)
vnet_destroy(pr->pr_vnet);
#endif
if (pr->pr_root != NULL) {
vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vrele(pr->pr_root);
VFS_UNLOCK_GIANT(vfslocked);
}
mtx_destroy(&pr->pr_mtx);
#ifdef INET
free(pr->pr_ip4, M_PRISON);
#endif
#ifdef INET6
free(pr->pr_ip6, M_PRISON);
#endif
if (pr->pr_cpuset != NULL)
cpuset_rel(pr->pr_cpuset);
osd_jail_exit(pr);
#ifdef RACCT
prison_racct_detach(pr);
#endif
free(pr, M_PRISON);
/* Removing a prison frees a reference on its parent. */
pr = ppr;
mtx_lock(&pr->pr_mtx);
flags = PD_DEREF | PD_DEUREF;
}
}
void
prison_hold_locked(struct prison *pr)
{
mtx_assert(&pr->pr_mtx, MA_OWNED);
KASSERT(pr->pr_ref > 0,
("Trying to hold dead prison (jid=%d).", pr->pr_id));
pr->pr_ref++;
}
void
prison_hold(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
prison_hold_locked(pr);
mtx_unlock(&pr->pr_mtx);
}
void
prison_proc_hold(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
KASSERT(pr->pr_uref > 0,
("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
pr->pr_uref++;
mtx_unlock(&pr->pr_mtx);
}
void
prison_proc_free(struct prison *pr)
{
mtx_lock(&pr->pr_mtx);
KASSERT(pr->pr_uref > 0,
("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
prison_deref(pr, PD_DEUREF | PD_LOCKED);
}
#ifdef INET
/*
* Restrict a prison's IP address list with its parent's, possibly replacing
* it. Return true if the replacement buffer was used (or would have been).
*/
static int
prison_restrict_ip4(struct prison *pr, struct in_addr *newip4)
{
int ii, ij, used;
struct prison *ppr;
ppr = pr->pr_parent;
if (!(pr->pr_flags & PR_IP4_USER)) {
/* This has no user settings, so just copy the parent's list. */
if (pr->pr_ip4s < ppr->pr_ip4s) {
/*
* There's no room for the parent's list. Use the
* new list buffer, which is assumed to be big enough
* (if it was passed). If there's no buffer, try to
* allocate one.
*/
used = 1;
if (newip4 == NULL) {
newip4 = malloc(ppr->pr_ip4s * sizeof(*newip4),
M_PRISON, M_NOWAIT);
if (newip4 != NULL)
used = 0;
}
if (newip4 != NULL) {
bcopy(ppr->pr_ip4, newip4,
ppr->pr_ip4s * sizeof(*newip4));
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4 = newip4;
pr->pr_ip4s = ppr->pr_ip4s;
}
return (used);
}
pr->pr_ip4s = ppr->pr_ip4s;
if (pr->pr_ip4s > 0)
bcopy(ppr->pr_ip4, pr->pr_ip4,
pr->pr_ip4s * sizeof(*newip4));
else if (pr->pr_ip4 != NULL) {
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4 = NULL;
}
} else if (pr->pr_ip4s > 0) {
/* Remove addresses that aren't in the parent. */
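/*
* Both lists are kept sorted after their primary entry, so a single
* merge-style pass with indices ii (child) and ij (parent) is enough.
*/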
for (ij = 0; ij < ppr->pr_ip4s; ij++)
if (pr->pr_ip4[0].s_addr == ppr->pr_ip4[ij].s_addr)
break;
if (ij < ppr->pr_ip4s)
ii = 1;
else {
bcopy(pr->pr_ip4 + 1, pr->pr_ip4,
--pr->pr_ip4s * sizeof(*pr->pr_ip4));
ii = 0;
}
for (ij = 1; ii < pr->pr_ip4s; ) {
if (pr->pr_ip4[ii].s_addr == ppr->pr_ip4[0].s_addr) {
ii++;
continue;
}
switch (ij >= ppr->pr_ip4s ? -1 :
qcmp_v4(&pr->pr_ip4[ii], &ppr->pr_ip4[ij])) {
case -1:
bcopy(pr->pr_ip4 + ii + 1, pr->pr_ip4 + ii,
(--pr->pr_ip4s - ii) * sizeof(*pr->pr_ip4));
break;
case 0:
ii++;
ij++;
break;
case 1:
ij++;
break;
}
}
if (pr->pr_ip4s == 0) {
pr->pr_flags |= PR_IP4_DISABLE;
free(pr->pr_ip4, M_PRISON);
pr->pr_ip4 = NULL;
}
}
return (0);
}
/*
* Pass back primary IPv4 address of this jail.
*
* If not restricted return success but do not alter the address. Caller has
* to make sure to initialize it correctly (e.g. INADDR_ANY).
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
* Address returned in NBO.
*/
int
prison_get_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return 1 if we should do proper source address selection or are not jailed.
* We will return 0 if we should bypass source address selection in favour
* of the primary jail IPv4 address. Only in this case *ia will be updated and
* returned in NBO.
* Return EAFNOSUPPORT, in case this jail does not allow IPv4.
*/
int
prison_saddrsel_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
struct in_addr lia;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
if (!jailed(cred))
return (1);
pr = cred->cr_prison;
if (pr->pr_flags & PR_IP4_SADDRSEL)
return (1);
lia.s_addr = INADDR_ANY;
error = prison_get_ip4(cred, &lia);
if (error)
return (error);
if (lia.s_addr == INADDR_ANY)
return (1);
ia->s_addr = lia.s_addr;
return (0);
}
/*
* Return true if pr1 and pr2 have the same IPv4 address restrictions.
*/
int
prison_equal_ip4(struct prison *pr1, struct prison *pr2)
{
if (pr1 == pr2)
return (1);
/*
* No need to lock since the PR_IP4_USER flag can't be altered for
* existing prisons.
*/
while (pr1 != &prison0 &&
#ifdef VIMAGE
!(pr1->pr_flags & PR_VNET) &&
#endif
!(pr1->pr_flags & PR_IP4_USER))
pr1 = pr1->pr_parent;
while (pr2 != &prison0 &&
#ifdef VIMAGE
!(pr2->pr_flags & PR_VNET) &&
#endif
!(pr2->pr_flags & PR_IP4_USER))
pr2 = pr2->pr_parent;
return (pr1 == pr2);
}
/*
* Make sure our (source) address is set to something meaningful to this
* jail.
*
* Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv4. Address passed in in NBO and returned in NBO.
*/
int
prison_local_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
struct in_addr ia0;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
ia0.s_addr = ntohl(ia->s_addr);
if (ia0.s_addr == INADDR_LOOPBACK) {
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (ia0.s_addr == INADDR_ANY) {
/*
* In case there is only 1 IPv4 address, bind directly.
*/
if (pr->pr_ip4s == 1)
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
error = _prison_check_ip4(pr, ia);
mtx_unlock(&pr->pr_mtx);
return (error);
}
/*
* Rewrite destination address in case we will connect to loopback address.
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv4.
* Address passed in in NBO and returned in NBO.
*/
int
prison_remote_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
if (ntohl(ia->s_addr) == INADDR_LOOPBACK) {
ia->s_addr = pr->pr_ip4[0].s_addr;
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return success because nothing had to be changed.
*/
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Check if given address belongs to the jail referenced by cred/prison.
*
* Returns 0 if jail doesn't restrict IPv4 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv4. Address passed in in NBO.
*/
static int
_prison_check_ip4(struct prison *pr, struct in_addr *ia)
{
int i, a, z, d;
/*
* Check the primary IP.
*/
if (pr->pr_ip4[0].s_addr == ia->s_addr)
return (0);
/*
* All the other IPs are sorted so we can do a binary search.
*/
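/*
* The search covers pr_ip4[1 .. pr_ip4s - 1], hence z starts at
* pr_ip4s - 2 and each probe looks at index i + 1.
*/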
a = 0;
z = pr->pr_ip4s - 2;
while (a <= z) {
i = (a + z) / 2;
d = qcmp_v4(&pr->pr_ip4[i+1], ia);
if (d > 0)
z = i - 1;
else if (d < 0)
a = i + 1;
else
return (0);
}
return (EADDRNOTAVAIL);
}
int
prison_check_ip4(struct ucred *cred, struct in_addr *ia)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia != NULL, ("%s: ia is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP4))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP4)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip4 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
error = _prison_check_ip4(pr, ia);
mtx_unlock(&pr->pr_mtx);
return (error);
}
#endif
#ifdef INET6
static int
prison_restrict_ip6(struct prison *pr, struct in6_addr *newip6)
{
int ii, ij, used;
struct prison *ppr;
ppr = pr->pr_parent;
if (!(pr->pr_flags & PR_IP6_USER)) {
/* This has no user settings, so just copy the parent's list. */
if (pr->pr_ip6s < ppr->pr_ip6s) {
/*
* There's no room for the parent's list. Use the
* new list buffer, which is assumed to be big enough
* (if it was passed). If there's no buffer, try to
* allocate one.
*/
used = 1;
if (newip6 == NULL) {
newip6 = malloc(ppr->pr_ip6s * sizeof(*newip6),
M_PRISON, M_NOWAIT);
if (newip6 != NULL)
used = 0;
}
if (newip6 != NULL) {
bcopy(ppr->pr_ip6, newip6,
ppr->pr_ip6s * sizeof(*newip6));
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6 = newip6;
pr->pr_ip6s = ppr->pr_ip6s;
}
return (used);
}
pr->pr_ip6s = ppr->pr_ip6s;
if (pr->pr_ip6s > 0)
bcopy(ppr->pr_ip6, pr->pr_ip6,
pr->pr_ip6s * sizeof(*newip6));
else if (pr->pr_ip6 != NULL) {
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6 = NULL;
}
} else if (pr->pr_ip6s > 0) {
/* Remove addresses that aren't in the parent. */
for (ij = 0; ij < ppr->pr_ip6s; ij++)
if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0],
&ppr->pr_ip6[ij]))
break;
if (ij < ppr->pr_ip6s)
ii = 1;
else {
bcopy(pr->pr_ip6 + 1, pr->pr_ip6,
--pr->pr_ip6s * sizeof(*pr->pr_ip6));
ii = 0;
}
for (ij = 1; ii < pr->pr_ip6s; ) {
if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[ii],
&ppr->pr_ip6[0])) {
ii++;
continue;
}
switch (ij >= ppr->pr_ip6s ? -1 :
qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
case -1:
bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
(--pr->pr_ip6s - ii) * sizeof(*pr->pr_ip6));
break;
case 0:
ii++;
ij++;
break;
case 1:
ij++;
break;
}
}
if (pr->pr_ip6s == 0) {
pr->pr_flags |= PR_IP6_DISABLE;
free(pr->pr_ip6, M_PRISON);
pr->pr_ip6 = NULL;
}
}
return (0);
}
/*
* Pass back primary IPv6 address for this jail.
*
* If not restricted return success but do not alter the address. Caller has
* to make sure to initialize it correctly (e.g. IN6ADDR_ANY_INIT).
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
*/
int
prison_get_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return 1 if we should do proper source address selection or are not jailed.
* We will return 0 if we should bypass source address selection in favour
* of the primary jail IPv6 address. Only in this case *ia6 will be updated and
* returned in NBO.
* Return EAFNOSUPPORT, in case this jail does not allow IPv6.
*/
int
prison_saddrsel_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
struct in6_addr lia6;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
if (!jailed(cred))
return (1);
pr = cred->cr_prison;
if (pr->pr_flags & PR_IP6_SADDRSEL)
return (1);
lia6 = in6addr_any;
error = prison_get_ip6(cred, &lia6);
if (error)
return (error);
if (IN6_IS_ADDR_UNSPECIFIED(&lia6))
return (1);
bcopy(&lia6, ia6, sizeof(struct in6_addr));
return (0);
}
/*
* Return true if pr1 and pr2 have the same IPv6 address restrictions.
*/
int
prison_equal_ip6(struct prison *pr1, struct prison *pr2)
{
if (pr1 == pr2)
return (1);
while (pr1 != &prison0 &&
#ifdef VIMAGE
!(pr1->pr_flags & PR_VNET) &&
#endif
!(pr1->pr_flags & PR_IP6_USER))
pr1 = pr1->pr_parent;
while (pr2 != &prison0 &&
#ifdef VIMAGE
!(pr2->pr_flags & PR_VNET) &&
#endif
!(pr2->pr_flags & PR_IP6_USER))
pr2 = pr2->pr_parent;
return (pr1 == pr2);
}
/*
* Make sure our (source) address is set to something meaningful to this jail.
*
* v6only should be set based on (inp->inp_flags & IN6P_IPV6_V6ONLY != 0)
* when needed while binding.
*
* Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv6.
*/
int
prison_local_ip6(struct ucred *cred, struct in6_addr *ia6, int v6only)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
if (IN6_IS_ADDR_LOOPBACK(ia6)) {
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (IN6_IS_ADDR_UNSPECIFIED(ia6)) {
/*
* In case there is only 1 IPv6 address, and v6only is true,
* then bind directly.
*/
if (v6only != 0 && pr->pr_ip6s == 1)
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
error = _prison_check_ip6(pr, ia6);
mtx_unlock(&pr->pr_mtx);
return (error);
}
/*
* Rewrite destination address in case we will connect to loopback address.
*
* Returns 0 on success, EAFNOSUPPORT if the jail doesn't allow IPv6.
*/
int
prison_remote_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
if (IN6_IS_ADDR_LOOPBACK(ia6)) {
bcopy(&pr->pr_ip6[0], ia6, sizeof(struct in6_addr));
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Return success because nothing had to be changed.
*/
mtx_unlock(&pr->pr_mtx);
return (0);
}
/*
* Check if given address belongs to the jail referenced by cred/prison.
*
* Returns 0 if jail doesn't restrict IPv6 or if address belongs to jail,
* EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if the jail
* doesn't allow IPv6.
*/
static int
_prison_check_ip6(struct prison *pr, struct in6_addr *ia6)
{
int i, a, z, d;
/*
* Check the primary IP.
*/
if (IN6_ARE_ADDR_EQUAL(&pr->pr_ip6[0], ia6))
return (0);
/*
* All the other IPs are sorted so we can do a binary search.
*/
a = 0;
z = pr->pr_ip6s - 2;
while (a <= z) {
i = (a + z) / 2;
d = qcmp_v6(&pr->pr_ip6[i+1], ia6);
if (d > 0)
z = i - 1;
else if (d < 0)
a = i + 1;
else
return (0);
}
return (EADDRNOTAVAIL);
}
int
prison_check_ip6(struct ucred *cred, struct in6_addr *ia6)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(ia6 != NULL, ("%s: ia6 is NULL", __func__));
pr = cred->cr_prison;
if (!(pr->pr_flags & PR_IP6))
return (0);
mtx_lock(&pr->pr_mtx);
if (!(pr->pr_flags & PR_IP6)) {
mtx_unlock(&pr->pr_mtx);
return (0);
}
if (pr->pr_ip6 == NULL) {
mtx_unlock(&pr->pr_mtx);
return (EAFNOSUPPORT);
}
error = _prison_check_ip6(pr, ia6);
mtx_unlock(&pr->pr_mtx);
return (error);
}
#endif
/*
* Check if a jail supports the given address family.
*
* Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
* if not.
*/
int
prison_check_af(struct ucred *cred, int af)
{
struct prison *pr;
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
pr = cred->cr_prison;
#ifdef VIMAGE
/* Prisons with their own network stack are not limited. */
if (prison_owns_vnet(cred))
return (0);
#endif
error = 0;
switch (af)
{
#ifdef INET
case AF_INET:
if (pr->pr_flags & PR_IP4)
{
mtx_lock(&pr->pr_mtx);
if ((pr->pr_flags & PR_IP4) && pr->pr_ip4 == NULL)
error = EAFNOSUPPORT;
mtx_unlock(&pr->pr_mtx);
}
break;
#endif
#ifdef INET6
case AF_INET6:
if (pr->pr_flags & PR_IP6)
{
mtx_lock(&pr->pr_mtx);
if ((pr->pr_flags & PR_IP6) && pr->pr_ip6 == NULL)
error = EAFNOSUPPORT;
mtx_unlock(&pr->pr_mtx);
}
break;
#endif
case AF_LOCAL:
case AF_ROUTE:
break;
default:
if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
error = EAFNOSUPPORT;
}
return (error);
}
/*
* Check if given address belongs to the jail referenced by cred (wrapper to
* prison_check_ip[46]).
*
* Returns 0 if jail doesn't restrict the address family or if address belongs
* to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
* the jail doesn't allow the address family. IPv4 Address passed in in NBO.
*/
int
prison_if(struct ucred *cred, struct sockaddr *sa)
{
#ifdef INET
struct sockaddr_in *sai;
#endif
#ifdef INET6
struct sockaddr_in6 *sai6;
#endif
int error;
KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
#ifdef VIMAGE
if (prison_owns_vnet(cred))
return (0);
#endif
error = 0;
switch (sa->sa_family)
{
#ifdef INET
case AF_INET:
sai = (struct sockaddr_in *)sa;
error = prison_check_ip4(cred, &sai->sin_addr);
break;
#endif
#ifdef INET6
case AF_INET6:
sai6 = (struct sockaddr_in6 *)sa;
error = prison_check_ip6(cred, &sai6->sin6_addr);
break;
#endif
default:
if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
error = EAFNOSUPPORT;
}
return (error);
}
/*
* Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
*/
int
prison_check(struct ucred *cred1, struct ucred *cred2)
{
return ((cred1->cr_prison == cred2->cr_prison ||
prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
}
/*
* Return 1 if p2 is a child of p1, otherwise 0.
*/
int
prison_ischild(struct prison *pr1, struct prison *pr2)
{
for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
if (pr1 == pr2)
return (1);
return (0);
}
/*
* Return 1 if the passed credential is in a jail, otherwise 0.
*/
int
jailed(struct ucred *cred)
{
return (cred->cr_prison != &prison0);
}
/*
* Return 1 if the passed credential is in a jail and that jail does not
* have its own virtual network stack, otherwise 0.
*/
int
jailed_without_vnet(struct ucred *cred)
{
if (!jailed(cred))
return (0);
#ifdef VIMAGE
if (prison_owns_vnet(cred))
return (0);
#endif
return (1);
}
/*
* Return the correct hostname (domainname, et al) for the passed credential.
*/
void
getcredhostname(struct ucred *cred, char *buf, size_t size)
{
struct prison *pr;
/*
* A NULL credential can be used to shortcut to the physical
* system's hostname.
*/
pr = (cred != NULL) ? cred->cr_prison : &prison0;
mtx_lock(&pr->pr_mtx);
strlcpy(buf, pr->pr_hostname, size);
mtx_unlock(&pr->pr_mtx);
}
void
getcreddomainname(struct ucred *cred, char *buf, size_t size)
{
mtx_lock(&cred->cr_prison->pr_mtx);
strlcpy(buf, cred->cr_prison->pr_domainname, size);
mtx_unlock(&cred->cr_prison->pr_mtx);
}
void
getcredhostuuid(struct ucred *cred, char *buf, size_t size)
{
mtx_lock(&cred->cr_prison->pr_mtx);
strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
mtx_unlock(&cred->cr_prison->pr_mtx);
}
void
getcredhostid(struct ucred *cred, unsigned long *hostid)
{
mtx_lock(&cred->cr_prison->pr_mtx);
*hostid = cred->cr_prison->pr_hostid;
mtx_unlock(&cred->cr_prison->pr_mtx);
}
#ifdef VIMAGE
/*
* Determine whether the prison represented by cred owns
* its vnet rather than having it inherited.
*
* Returns 1 in case the prison owns the vnet, 0 otherwise.
*/
int
prison_owns_vnet(struct ucred *cred)
{
/*
* vnets cannot be added/removed after jail creation,
* so no need to lock here.
*/
return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
}
#endif
/*
* Determine whether the subject represented by cred can "see"
* status of a mount point.
* Returns: 0 for permitted, ENOENT otherwise.
* XXX: This function should be called cr_canseemount() and should be
* placed in kern_prot.c.
*/
int
prison_canseemount(struct ucred *cred, struct mount *mp)
{
struct prison *pr;
struct statfs *sp;
size_t len;
pr = cred->cr_prison;
if (pr->pr_enforce_statfs == 0)
return (0);
if (pr->pr_root->v_mount == mp)
return (0);
if (pr->pr_enforce_statfs == 2)
return (ENOENT);
/*
* If jail's chroot directory is set to "/" we should be able to see
* all mount-points from inside a jail.
* This is an ugly check, but it is the only situation in which a jail's
* directory ends with '/'.
*/
if (strcmp(pr->pr_path, "/") == 0)
return (0);
len = strlen(pr->pr_path);
sp = &mp->mnt_stat;
if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
return (ENOENT);
/*
* Be sure that we don't have a situation where the jail's root
* directory is "/some/path" and the mount point is "/some/pathpath".
*/
if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
return (ENOENT);
return (0);
}
void
prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
{
char jpath[MAXPATHLEN];
struct prison *pr;
size_t len;
pr = cred->cr_prison;
if (pr->pr_enforce_statfs == 0)
return;
if (prison_canseemount(cred, mp) != 0) {
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
strlcpy(sp->f_mntonname, "[restricted]",
sizeof(sp->f_mntonname));
return;
}
if (pr->pr_root->v_mount == mp) {
/*
* Clear the current buffer data, so we are sure nothing from
* the valid path is left there.
*/
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
*sp->f_mntonname = '/';
return;
}
/*
* If jail's chroot directory is set to "/" we should be able to see
* all mount-points from inside a jail.
*/
if (strcmp(pr->pr_path, "/") == 0)
return;
len = strlen(pr->pr_path);
strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
/*
* Clear the current buffer data, so we are sure nothing from
* the valid path is left there.
*/
bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
if (*jpath == '\0') {
/* Should never happen. */
*sp->f_mntonname = '/';
} else {
strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
}
}
/*
* Check whether a specific privilege is granted within a jail. We
* have a specific list of accepted privileges; the rest are denied.
*/
int
prison_priv_check(struct ucred *cred, int priv)
{
if (!jailed(cred))
return (0);
#ifdef VIMAGE
/*
* Privileges specific to prisons with a virtual network stack.
* There might be a duplicate entry here in case the privilege
* is only granted conditionally in the legacy jail case.
*/
switch (priv) {
#ifdef notyet
/*
* NFS-specific privileges.
*/
case PRIV_NFS_DAEMON:
case PRIV_NFS_LOCKD:
#endif
/*
* Network stack privileges.
*/
case PRIV_NET_BRIDGE:
case PRIV_NET_GRE:
case PRIV_NET_BPF:
case PRIV_NET_RAW: /* Dup, cond. in legacy jail case. */
case PRIV_NET_ROUTE:
case PRIV_NET_TAP:
case PRIV_NET_SETIFMTU:
case PRIV_NET_SETIFFLAGS:
case PRIV_NET_SETIFCAP:
case PRIV_NET_SETIFDESCR:
case PRIV_NET_SETIFNAME:
case PRIV_NET_SETIFMETRIC:
case PRIV_NET_SETIFPHYS:
case PRIV_NET_SETIFMAC:
case PRIV_NET_ADDMULTI:
case PRIV_NET_DELMULTI:
case PRIV_NET_HWIOCTL:
case PRIV_NET_SETLLADDR:
case PRIV_NET_ADDIFGROUP:
case PRIV_NET_DELIFGROUP:
case PRIV_NET_IFCREATE:
case PRIV_NET_IFDESTROY:
case PRIV_NET_ADDIFADDR:
case PRIV_NET_DELIFADDR:
case PRIV_NET_LAGG:
case PRIV_NET_GIF:
case PRIV_NET_SETIFVNET:
case PRIV_NET_SETIFFIB:
/*
* 802.11-related privileges.
*/
case PRIV_NET80211_GETKEY:
#ifdef notyet
case PRIV_NET80211_MANAGE: /* XXX-BZ discuss with sam@ */
#endif
#ifdef notyet
/*
* AppleTalk privileges.
*/
case PRIV_NETATALK_RESERVEDPORT:
/*
* ATM privileges.
*/
case PRIV_NETATM_CFG:
case PRIV_NETATM_ADD:
case PRIV_NETATM_DEL:
case PRIV_NETATM_SET:
/*
* Bluetooth privileges.
*/
case PRIV_NETBLUETOOTH_RAW:
#endif
/*
* Netgraph and netgraph module privileges.
*/
case PRIV_NETGRAPH_CONTROL:
#ifdef notyet
case PRIV_NETGRAPH_TTY:
#endif
/*
* IPv4 and IPv6 privileges.
*/
case PRIV_NETINET_IPFW:
case PRIV_NETINET_DIVERT:
case PRIV_NETINET_PF:
case PRIV_NETINET_DUMMYNET:
case PRIV_NETINET_CARP:
case PRIV_NETINET_MROUTE:
case PRIV_NETINET_RAW:
case PRIV_NETINET_ADDRCTRL6:
case PRIV_NETINET_ND6:
case PRIV_NETINET_SCOPE6:
case PRIV_NETINET_ALIFETIME6:
case PRIV_NETINET_IPSEC:
case PRIV_NETINET_BINDANY:
#ifdef notyet
/*
* IPX/SPX privileges.
*/
case PRIV_NETIPX_RESERVEDPORT:
case PRIV_NETIPX_RAW:
/*
* NCP privileges.
*/
case PRIV_NETNCP:
/*
* SMB privileges.
*/
case PRIV_NETSMB:
#endif
/*
* No default: or deny here.
* If the privilege is not permitted, fall through to the next switch().
*/
if (cred->cr_prison->pr_flags & PR_VNET)
return (0);
}
#endif /* VIMAGE */
switch (priv) {
/*
* Allow ktrace privileges for root in jail.
*/
case PRIV_KTRACE:
#if 0
/*
* Allow jailed processes to configure audit identity and
* submit audit records (login, etc). In the future we may
* want to further refine the relationship between audit and
* jail.
*/
case PRIV_AUDIT_GETAUDIT:
case PRIV_AUDIT_SETAUDIT:
case PRIV_AUDIT_SUBMIT:
#endif
/*
* Allow jailed processes to manipulate process UNIX
* credentials in any way they see fit.
*/
case PRIV_CRED_SETUID:
case PRIV_CRED_SETEUID:
case PRIV_CRED_SETGID:
case PRIV_CRED_SETEGID:
case PRIV_CRED_SETGROUPS:
case PRIV_CRED_SETREUID:
case PRIV_CRED_SETREGID:
case PRIV_CRED_SETRESUID:
case PRIV_CRED_SETRESGID:
/*
* Jail implements visibility constraints already, so allow
* jailed root to override uid/gid-based constraints.
*/
case PRIV_SEEOTHERGIDS:
case PRIV_SEEOTHERUIDS:
/*
* Jail implements inter-process debugging limits already, so
* allow jailed root various debugging privileges.
*/
case PRIV_DEBUG_DIFFCRED:
case PRIV_DEBUG_SUGID:
case PRIV_DEBUG_UNPRIV:
/*
* Allow jail to set various resource limits and login
* properties, and for now, exceed process resource limits.
*/
case PRIV_PROC_LIMIT:
case PRIV_PROC_SETLOGIN:
case PRIV_PROC_SETRLIMIT:
/*
* System V and POSIX IPC privileges are granted in jail.
*/
case PRIV_IPC_READ:
case PRIV_IPC_WRITE:
case PRIV_IPC_ADMIN:
case PRIV_IPC_MSGSIZE:
case PRIV_MQ_ADMIN:
/*
* Jail operations within a jail work on child jails.
*/
case PRIV_JAIL_ATTACH:
case PRIV_JAIL_SET:
case PRIV_JAIL_REMOVE:
/*
* Jail implements its own inter-process limits, so allow
* root processes in jail to change scheduling on other
* processes in the same jail. Likewise for signalling.
*/
case PRIV_SCHED_DIFFCRED:
case PRIV_SCHED_CPUSET:
case PRIV_SIGNAL_DIFFCRED:
case PRIV_SIGNAL_SUGID:
/*
* Allow jailed processes to write to sysctls marked as jail
* writable.
*/
case PRIV_SYSCTL_WRITEJAIL:
/*
* Allow root in jail to manage a variety of quota
* properties. These should likely be conditional on a
* configuration option.
*/
case PRIV_VFS_GETQUOTA:
case PRIV_VFS_SETQUOTA:
/*
* Since Jail relies on chroot() to implement file system
* protections, grant many VFS privileges to root in jail.
* Be careful to exclude mount-related and NFS-related
* privileges.
*/
case PRIV_VFS_READ:
case PRIV_VFS_WRITE:
case PRIV_VFS_ADMIN:
case PRIV_VFS_EXEC:
case PRIV_VFS_LOOKUP:
case PRIV_VFS_BLOCKRESERVE: /* XXXRW: Slightly surprising. */
case PRIV_VFS_CHFLAGS_DEV:
case PRIV_VFS_CHOWN:
case PRIV_VFS_CHROOT:
case PRIV_VFS_RETAINSUGID:
case PRIV_VFS_FCHROOT:
case PRIV_VFS_LINK:
case PRIV_VFS_SETGID:
case PRIV_VFS_STAT:
case PRIV_VFS_STICKYFILE:
return (0);
/*
* Depending on the global setting, allow the privilege of
* setting system flags.
*/
case PRIV_VFS_SYSFLAGS:
if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
return (0);
else
return (EPERM);
/*
* Depending on the global setting, allow the privilege of
* mounting/unmounting file systems.
*/
case PRIV_VFS_MOUNT:
case PRIV_VFS_UNMOUNT:
case PRIV_VFS_MOUNT_NONUSER:
case PRIV_VFS_MOUNT_OWNER:
if (cred->cr_prison->pr_allow & PR_ALLOW_MOUNT &&
cred->cr_prison->pr_enforce_statfs < 2)
return (0);
else
return (EPERM);
/*
* Allow jailed root to bind reserved ports and reuse in-use
* ports.
*/
case PRIV_NETINET_RESERVEDPORT:
case PRIV_NETINET_REUSEPORT:
return (0);
/*
* Allow jailed root to set certain IPv4/6 (option) headers.
*/
case PRIV_NETINET_SETHDROPTS:
return (0);
/*
* Conditionally allow creating raw sockets in jail.
*/
case PRIV_NETINET_RAW:
if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
return (0);
else
return (EPERM);
/*
* Since jail implements its own visibility limits on netstat
* sysctls, allow getcred. This allows identd to work in
* jail.
*/
case PRIV_NETINET_GETCRED:
return (0);
/*
* Allow jailed root to set loginclass.
*/
case PRIV_PROC_SETLOGINCLASS:
return (0);
default:
/*
* In all remaining cases, deny the privilege request. This
* includes almost all network privileges and many system
* configuration privileges.
*/
return (EPERM);
}
}
/*
* Return the part of pr2's name that is relative to pr1, or the whole name
* if it does not directly follow.
*/
char *
prison_name(struct prison *pr1, struct prison *pr2)
{
char *name;
/* Jails see themselves as "0" (if they see themselves at all). */
if (pr1 == pr2)
return "0";
name = pr2->pr_name;
if (prison_ischild(pr1, pr2)) {
/*
* pr1 isn't locked (and allprison_lock may not be either)
* so its length can't be counted on. But the number of dots
* can be counted on - and counted.
*/
for (; pr1 != &prison0; pr1 = pr1->pr_parent)
name = strchr(name, '.') + 1;
}
return (name);
}
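/*
 * A minimal sketch of the dot-skipping above (hypothetical standalone
 * helper, illustrative only): each ancestor level strips one dot-separated
 * leading component from the full name, so "foo.bar.baz" seen from one
 * level below the top becomes "bar.baz".
 *
 *	static const char *
 *	skip_components(const char *name, int levels)
 *	{
 *		const char *p;
 *
 *		while (levels-- > 0 && (p = strchr(name, '.')) != NULL)
 *			name = p + 1;
 *		return (name);
 *	}
 */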
/*
* Return the part of pr2's path that is relative to pr1, or the whole path
* if it does not directly follow.
*/
static char *
prison_path(struct prison *pr1, struct prison *pr2)
{
char *path1, *path2;
int len1;
path1 = pr1->pr_path;
path2 = pr2->pr_path;
if (!strcmp(path1, "/"))
return (path2);
len1 = strlen(path1);
if (strncmp(path1, path2, len1))
return (path2);
if (path2[len1] == '\0')
return "/";
if (path2[len1] == '/')
return (path2 + len1);
return (path2);
}
/*
* Jail-related sysctls.
*/
SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
"Jails");
static int
sysctl_jail_list(SYSCTL_HANDLER_ARGS)
{
struct xprison *xp;
struct prison *pr, *cpr;
#ifdef INET
struct in_addr *ip4 = NULL;
int ip4s = 0;
#endif
#ifdef INET6
struct in6_addr *ip6 = NULL;
int ip6s = 0;
#endif
int descend, error;
xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
pr = req->td->td_ucred->cr_prison;
error = 0;
sx_slock(&allprison_lock);
FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
#if defined(INET) || defined(INET6)
again:
#endif
mtx_lock(&cpr->pr_mtx);
#ifdef INET
if (cpr->pr_ip4s > 0) {
if (ip4s < cpr->pr_ip4s) {
ip4s = cpr->pr_ip4s;
mtx_unlock(&cpr->pr_mtx);
ip4 = realloc(ip4, ip4s *
sizeof(struct in_addr), M_TEMP, M_WAITOK);
goto again;
}
bcopy(cpr->pr_ip4, ip4,
cpr->pr_ip4s * sizeof(struct in_addr));
}
#endif
#ifdef INET6
if (cpr->pr_ip6s > 0) {
if (ip6s < cpr->pr_ip6s) {
ip6s = cpr->pr_ip6s;
mtx_unlock(&cpr->pr_mtx);
ip6 = realloc(ip6, ip6s *
sizeof(struct in6_addr), M_TEMP, M_WAITOK);
goto again;
}
bcopy(cpr->pr_ip6, ip6,
cpr->pr_ip6s * sizeof(struct in6_addr));
}
#endif
if (cpr->pr_ref == 0) {
mtx_unlock(&cpr->pr_mtx);
continue;
}
bzero(xp, sizeof(*xp));
xp->pr_version = XPRISON_VERSION;
xp->pr_id = cpr->pr_id;
xp->pr_state = cpr->pr_uref > 0
? PRISON_STATE_ALIVE : PRISON_STATE_DYING;
strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
#ifdef INET
xp->pr_ip4s = cpr->pr_ip4s;
#endif
#ifdef INET6
xp->pr_ip6s = cpr->pr_ip6s;
#endif
mtx_unlock(&cpr->pr_mtx);
error = SYSCTL_OUT(req, xp, sizeof(*xp));
if (error)
break;
#ifdef INET
if (xp->pr_ip4s > 0) {
error = SYSCTL_OUT(req, ip4,
xp->pr_ip4s * sizeof(struct in_addr));
if (error)
break;
}
#endif
#ifdef INET6
if (xp->pr_ip6s > 0) {
error = SYSCTL_OUT(req, ip6,
xp->pr_ip6s * sizeof(struct in6_addr));
if (error)
break;
}
#endif
}
sx_sunlock(&allprison_lock);
free(xp, M_TEMP);
#ifdef INET
free(ip4, M_TEMP);
#endif
#ifdef INET6
free(ip6, M_TEMP);
#endif
return (error);
}
SYSCTL_OID(_security_jail, OID_AUTO, list,
CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
sysctl_jail_list, "S", "List of active jails");
static int
sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
{
int error, injail;
injail = jailed(req->td->td_ucred);
error = SYSCTL_OUT(req, &injail, sizeof(injail));
return (error);
}
SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
sysctl_jail_jailed, "I", "Process in jail?");
#if defined(INET) || defined(INET6)
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
&jail_max_af_ips, 0,
"Number of IP addresses a jail may have at most per address family");
#endif
/*
* Default parameters for jail(2) compatibility. For historical reasons,
* the sysctl names have varying similarity to the parameter names. Prisons
* just see their own parameters, and can't change them.
*/
static int
sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
{
struct prison *pr;
int allow, error, i;
pr = req->td->td_ucred->cr_prison;
allow = (pr == &prison0) ? jail_default_allow : pr->pr_allow;
/* Get the current flag value, and convert it to a boolean. */
i = (allow & arg2) ? 1 : 0;
if (arg1 != NULL)
i = !i;
error = sysctl_handle_int(oidp, &i, 0, req);
if (error || !req->newptr)
return (error);
i = i ? arg2 : 0;
if (arg1 != NULL)
i ^= arg2;
/*
* The sysctls don't have CTLFLAGS_PRISON, so assume prison0
* for writing.
*/
mtx_lock(&prison0.pr_mtx);
jail_default_allow = (jail_default_allow & ~arg2) | i;
mtx_unlock(&prison0.pr_mtx);
return (0);
}
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
"Processes in jail can set their hostnames");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
(void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
"Processes in jail are limited to creating UNIX/IP/route sockets only");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
"Processes in jail can use System V IPC primitives");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
"Prison root can create raw sockets");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
"Processes in jail can alter system file flags");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
"Processes in jail can mount/unmount jail-friendly file systems");
static int
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
{
struct prison *pr;
int level, error;
pr = req->td->td_ucred->cr_prison;
level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
error = sysctl_handle_int(oidp, &level, 0, req);
if (error || !req->newptr)
return (error);
*(int *)arg1 = level;
return (0);
}
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
sysctl_jail_default_level, "I",
"Processes in jail cannot see all mounted file systems");
/*
* Nodes to describe jail parameters. Maximum length of string parameters
* is returned in the string itself, and the other parameters exist merely
* to make themselves and their types known.
*/
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW, 0,
"Jail parameters");
int
sysctl_jail_param(SYSCTL_HANDLER_ARGS)
{
int i;
long l;
size_t s;
char numbuf[12];
switch (oidp->oid_kind & CTLTYPE)
{
case CTLTYPE_LONG:
case CTLTYPE_ULONG:
l = 0;
#ifdef SCTL_MASK32
if (!(req->flags & SCTL_MASK32))
#endif
return (SYSCTL_OUT(req, &l, sizeof(l)));
case CTLTYPE_INT:
case CTLTYPE_UINT:
i = 0;
return (SYSCTL_OUT(req, &i, sizeof(i)));
case CTLTYPE_STRING:
snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
return
(sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
case CTLTYPE_STRUCT:
s = (size_t)arg2;
return (SYSCTL_OUT(req, &s, sizeof(s)));
}
return (0);
}
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
"I", "Jail secure level");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
"I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
"E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
"B", "Jail is in the process of shutting down");
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
"I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
"I", "Maximum number of child jails");
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
"Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
"Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
"Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
"LU", "Jail host ID");
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
"Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
"S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
"B", "Do (not) use IPv4 source address selection rather than the "
"primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
"Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
"S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
"B", "Do (not) use IPv6 source address selection rather than the "
"primary jail IPv6 address.");
#endif
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, mount, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount/unmount jail-friendly file systems");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
void
prison_racct_foreach(void (*callback)(struct racct *racct,
void *arg2, void *arg3), void *arg2, void *arg3)
{
struct prison_racct *prr;
sx_slock(&allprison_lock);
LIST_FOREACH(prr, &allprison_racct, prr_next)
(callback)(prr->prr_racct, arg2, arg3);
sx_sunlock(&allprison_lock);
}
static struct prison_racct *
prison_racct_find_locked(const char *name)
{
struct prison_racct *prr;
sx_assert(&allprison_lock, SA_XLOCKED);
if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
return (NULL);
LIST_FOREACH(prr, &allprison_racct, prr_next) {
if (strcmp(name, prr->prr_name) != 0)
continue;
/* Found a prison_racct with a matching name. */
prison_racct_hold(prr);
return (prr);
}
/* Add new prison_racct. */
prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
racct_create(&prr->prr_racct);
strcpy(prr->prr_name, name);
refcount_init(&prr->prr_refcount, 1);
LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
return (prr);
}
struct prison_racct *
prison_racct_find(const char *name)
{
struct prison_racct *prr;
sx_xlock(&allprison_lock);
prr = prison_racct_find_locked(name);
sx_xunlock(&allprison_lock);
return (prr);
}
void
prison_racct_hold(struct prison_racct *prr)
{
refcount_acquire(&prr->prr_refcount);
}
void
prison_racct_free(struct prison_racct *prr)
{
int old;
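/*
 * Fast path: if this is not the last reference, try a lockless
 * compare-and-set decrement.  If the CAS races and fails, fall through
 * to the locked slow path below.
 */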
old = prr->prr_refcount;
if (old > 1 && atomic_cmpset_int(&prr->prr_refcount, old, old - 1))
return;
sx_xlock(&allprison_lock);
if (refcount_release(&prr->prr_refcount)) {
racct_destroy(&prr->prr_racct);
LIST_REMOVE(prr, prr_next);
sx_xunlock(&allprison_lock);
free(prr, M_PRISON_RACCT);
return;
}
sx_xunlock(&allprison_lock);
}
#ifdef RACCT
static void
prison_racct_attach(struct prison *pr)
{
struct prison_racct *prr;
prr = prison_racct_find_locked(pr->pr_name);
KASSERT(prr != NULL, ("cannot find prison_racct"));
pr->pr_prison_racct = prr;
}
static void
prison_racct_detach(struct prison *pr)
{
prison_racct_free(pr->pr_prison_racct);
pr->pr_prison_racct = NULL;
}
#endif /* RACCT */
#ifdef DDB
static void
db_show_prison(struct prison *pr)
{
int fi;
#if defined(INET) || defined(INET6)
int ii;
#endif
unsigned jsf;
#ifdef INET6
char ip6buf[INET6_ADDRSTRLEN];
#endif
db_printf("prison %p:\n", pr);
db_printf(" jid = %d\n", pr->pr_id);
db_printf(" name = %s\n", pr->pr_name);
db_printf(" parent = %p\n", pr->pr_parent);
db_printf(" ref = %d\n", pr->pr_ref);
db_printf(" uref = %d\n", pr->pr_uref);
db_printf(" path = %s\n", pr->pr_path);
db_printf(" cpuset = %d\n", pr->pr_cpuset
? pr->pr_cpuset->cs_id : -1);
#ifdef VIMAGE
db_printf(" vnet = %p\n", pr->pr_vnet);
#endif
db_printf(" root = %p\n", pr->pr_root);
db_printf(" securelevel = %d\n", pr->pr_securelevel);
db_printf(" children.max = %d\n", pr->pr_childmax);
db_printf(" children.cur = %d\n", pr->pr_childcount);
db_printf(" child = %p\n", LIST_FIRST(&pr->pr_children));
db_printf(" sibling = %p\n", LIST_NEXT(pr, pr_sibling));
db_printf(" flags = 0x%x", pr->pr_flags);
for (fi = 0; fi < sizeof(pr_flag_names) / sizeof(pr_flag_names[0]);
fi++)
if (pr_flag_names[fi] != NULL && (pr->pr_flags & (1 << fi)))
db_printf(" %s", pr_flag_names[fi]);
for (fi = 0; fi < sizeof(pr_flag_jailsys) / sizeof(pr_flag_jailsys[0]);
fi++) {
jsf = pr->pr_flags &
(pr_flag_jailsys[fi].disable | pr_flag_jailsys[fi].new);
db_printf(" %-16s= %s\n", pr_flag_jailsys[fi].name,
pr_flag_jailsys[fi].disable &&
(jsf == pr_flag_jailsys[fi].disable) ? "disable"
: (jsf == pr_flag_jailsys[fi].new) ? "new"
: "inherit");
}
db_printf(" allow = 0x%x", pr->pr_allow);
for (fi = 0; fi < sizeof(pr_allow_names) / sizeof(pr_allow_names[0]);
fi++)
if (pr_allow_names[fi] != NULL && (pr->pr_allow & (1 << fi)))
db_printf(" %s", pr_allow_names[fi]);
db_printf("\n");
db_printf(" enforce_statfs = %d\n", pr->pr_enforce_statfs);
db_printf(" host.hostname = %s\n", pr->pr_hostname);
db_printf(" host.domainname = %s\n", pr->pr_domainname);
db_printf(" host.hostuuid = %s\n", pr->pr_hostuuid);
db_printf(" host.hostid = %lu\n", pr->pr_hostid);
#ifdef INET
db_printf(" ip4s = %d\n", pr->pr_ip4s);
for (ii = 0; ii < pr->pr_ip4s; ii++)
db_printf(" %s %s\n",
ii == 0 ? "ip4.addr =" : " ",
inet_ntoa(pr->pr_ip4[ii]));
#endif
#ifdef INET6
db_printf(" ip6s = %d\n", pr->pr_ip6s);
for (ii = 0; ii < pr->pr_ip6s; ii++)
db_printf(" %s %s\n",
ii == 0 ? "ip6.addr =" : " ",
ip6_sprintf(ip6buf, &pr->pr_ip6[ii]));
#endif
}
DB_SHOW_COMMAND(prison, db_show_prison_command)
{
struct prison *pr;
if (!have_addr) {
/*
* Show all prisons in the list, and prison0 which is not
* listed.
*/
db_show_prison(&prison0);
if (!db_pager_quit) {
TAILQ_FOREACH(pr, &allprison, pr_list) {
db_show_prison(pr);
if (db_pager_quit)
break;
}
}
return;
}
if (addr == 0)
pr = &prison0;
else {
/* Look for a prison with the ID and with references. */
TAILQ_FOREACH(pr, &allprison, pr_list)
if (pr->pr_id == addr && pr->pr_ref > 0)
break;
if (pr == NULL)
/* Look again, without requiring a reference. */
TAILQ_FOREACH(pr, &allprison, pr_list)
if (pr->pr_id == addr)
break;
if (pr == NULL)
/* Assume address points to a valid prison. */
pr = (struct prison *)addr;
}
db_show_prison(pr);
}
#endif /* DDB */
Index: head/sys/kern/kern_ktrace.c
===================================================================
--- head/sys/kern/kern_ktrace.c (revision 225616)
+++ head/sys/kern/kern_ktrace.c (revision 225617)
@@ -1,1224 +1,1224 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/ktrace.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <security/mac/mac_framework.h>
/*
* The ktrace facility allows the tracing of certain key events in user space
* processes, such as system calls, signal delivery, context switches, and
* user-generated events using utrace(2). It works by streaming event
* records and data to a vnode associated with the process using the
* ktrace(2) system call. In general, records can be written directly from
* the context that generates the event. One important exception to this is
* during a context switch, where sleeping is not permitted. To handle this
* case, trace events are generated using in-kernel ktr_request records, and
* then delivered to disk at a convenient moment -- either immediately, the
* next traceable event, at system call return, or at process exit.
*
* When dealing with multiple threads or processes writing to the same event
* log, ordering guarantees are weak: specifically, if an event has multiple
* records (i.e., system call enter and return), they may be interlaced with
* records from another event. Process and thread ID information is provided
* in the record, and user applications can de-interlace events if required.
*/
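/*
 * A minimal userland sketch of driving this facility (illustrative only;
 * ktrace(2) and utrace(2) are the authoritative interfaces):
 *
 *	#include <sys/param.h>
 *	#include <sys/time.h>
 *	#include <sys/uio.h>
 *	#include <sys/ktrace.h>
 *	#include <unistd.h>
 *
 *	// Trace system calls of the current process into ktrace.out,
 *	// emit one user record, then stop tracing.
 *	ktrace("ktrace.out", KTROP_SET, KTRFAC_SYSCALL | KTRFAC_SYSRET,
 *	    getpid());
 *	utrace("hello", 5);
 *	ktrace(NULL, KTROP_CLEAR, KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid());
 */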
static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
#ifdef KTRACE
FEATURE(ktrace, "Kernel support for system-call tracing");
#ifndef KTRACE_REQUEST_POOL
#define KTRACE_REQUEST_POOL 100
#endif
struct ktr_request {
struct ktr_header ktr_header;
void *ktr_buffer;
union {
struct ktr_proc_ctor ktr_proc_ctor;
struct ktr_syscall ktr_syscall;
struct ktr_sysret ktr_sysret;
struct ktr_genio ktr_genio;
struct ktr_psig ktr_psig;
struct ktr_csw ktr_csw;
} ktr_data;
STAILQ_ENTRY(ktr_request) ktr_list;
};
static int data_lengths[] = {
0, /* none */
offsetof(struct ktr_syscall, ktr_args), /* KTR_SYSCALL */
sizeof(struct ktr_sysret), /* KTR_SYSRET */
0, /* KTR_NAMEI */
sizeof(struct ktr_genio), /* KTR_GENIO */
sizeof(struct ktr_psig), /* KTR_PSIG */
sizeof(struct ktr_csw), /* KTR_CSW */
0, /* KTR_USER */
0, /* KTR_STRUCT */
0, /* KTR_SYSCTL */
sizeof(struct ktr_proc_ctor), /* KTR_PROCCTOR */
0, /* KTR_PROCDTOR */
};
static STAILQ_HEAD(, ktr_request) ktr_free;
static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
static u_int ktr_requestpool = KTRACE_REQUEST_POOL;
TUNABLE_INT("kern.ktrace.request_pool", &ktr_requestpool);
static u_int ktr_geniosize = PAGE_SIZE;
TUNABLE_INT("kern.ktrace.genio_size", &ktr_geniosize);
SYSCTL_UINT(_kern_ktrace, OID_AUTO, genio_size, CTLFLAG_RW, &ktr_geniosize,
0, "Maximum size of genio event payload");
static int print_message = 1;
static struct mtx ktrace_mtx;
static struct sx ktrace_sx;
static void ktrace_init(void *dummy);
static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
static u_int ktrace_resize_pool(u_int oldsize, u_int newsize);
static struct ktr_request *ktr_getrequest_entered(struct thread *td, int type);
static struct ktr_request *ktr_getrequest(int type);
static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
static void ktr_freeproc(struct proc *p, struct ucred **uc,
struct vnode **vp);
static void ktr_freerequest(struct ktr_request *req);
static void ktr_freerequest_locked(struct ktr_request *req);
static void ktr_writerequest(struct thread *td, struct ktr_request *req);
static int ktrcanset(struct thread *,struct proc *);
static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
static void ktrprocctor_entered(struct thread *, struct proc *);
/*
* ktrace itself generates events, such as context switches, which we do not
* wish to trace. Maintain a flag, TDP_INKTRACE, on each thread to determine
* whether or not it is in a region where tracing of events should be
* suppressed.
*/
static void
ktrace_enter(struct thread *td)
{
KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
td->td_pflags |= TDP_INKTRACE;
}
static void
ktrace_exit(struct thread *td)
{
KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
td->td_pflags &= ~TDP_INKTRACE;
}
static void
ktrace_assert(struct thread *td)
{
KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
}
static void
ktrace_init(void *dummy)
{
struct ktr_request *req;
int i;
mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
sx_init(&ktrace_sx, "ktrace_sx");
STAILQ_INIT(&ktr_free);
for (i = 0; i < ktr_requestpool; i++) {
req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
}
}
SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
static int
sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS)
{
struct thread *td;
u_int newsize, oldsize, wantsize;
int error;
/* Handle easy read-only case first to avoid warnings from GCC. */
if (!req->newptr) {
oldsize = ktr_requestpool;
return (SYSCTL_OUT(req, &oldsize, sizeof(u_int)));
}
error = SYSCTL_IN(req, &wantsize, sizeof(u_int));
if (error)
return (error);
td = curthread;
ktrace_enter(td);
oldsize = ktr_requestpool;
newsize = ktrace_resize_pool(oldsize, wantsize);
ktrace_exit(td);
error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
if (error)
return (error);
if (wantsize > oldsize && newsize < wantsize)
return (ENOSPC);
return (0);
}
SYSCTL_PROC(_kern_ktrace, OID_AUTO, request_pool, CTLTYPE_UINT|CTLFLAG_RW,
&ktr_requestpool, 0, sysctl_kern_ktrace_request_pool, "IU",
"Pool buffer size for ktrace(1)");
static u_int
ktrace_resize_pool(u_int oldsize, u_int newsize)
{
STAILQ_HEAD(, ktr_request) ktr_new;
struct ktr_request *req;
int bound;
print_message = 1;
bound = newsize - oldsize;
if (bound == 0)
return (ktr_requestpool);
if (bound < 0) {
mtx_lock(&ktrace_mtx);
/* Shrink pool down to newsize if possible. */
while (bound++ < 0) {
req = STAILQ_FIRST(&ktr_free);
if (req == NULL)
break;
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
ktr_requestpool--;
free(req, M_KTRACE);
}
} else {
/* Grow pool up to newsize. */
STAILQ_INIT(&ktr_new);
while (bound-- > 0) {
req = malloc(sizeof(struct ktr_request), M_KTRACE,
M_WAITOK);
STAILQ_INSERT_HEAD(&ktr_new, req, ktr_list);
}
mtx_lock(&ktrace_mtx);
STAILQ_CONCAT(&ktr_free, &ktr_new);
ktr_requestpool += (newsize - oldsize);
}
mtx_unlock(&ktrace_mtx);
return (ktr_requestpool);
}
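/*
 * Illustrative way to resize the pool from userland (sketch only; the
 * tunable and sysctl are declared above):
 *
 *	#include <sys/types.h>
 *	#include <sys/sysctl.h>
 *
 *	u_int newsize = 200;
 *	sysctlbyname("kern.ktrace.request_pool", NULL, NULL,
 *	    &newsize, sizeof(newsize));
 */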
/* ktr_getrequest() assumes that ktr_comm[] is the same size as td_name[]. */
CTASSERT(sizeof(((struct ktr_header *)NULL)->ktr_comm) ==
(sizeof((struct thread *)NULL)->td_name));
static struct ktr_request *
ktr_getrequest_entered(struct thread *td, int type)
{
struct ktr_request *req;
struct proc *p = td->td_proc;
int pm;
mtx_lock(&ktrace_mtx);
if (!KTRCHECK(td, type)) {
mtx_unlock(&ktrace_mtx);
return (NULL);
}
req = STAILQ_FIRST(&ktr_free);
if (req != NULL) {
STAILQ_REMOVE_HEAD(&ktr_free, ktr_list);
req->ktr_header.ktr_type = type;
if (p->p_traceflag & KTRFAC_DROP) {
req->ktr_header.ktr_type |= KTR_DROP;
p->p_traceflag &= ~KTRFAC_DROP;
}
mtx_unlock(&ktrace_mtx);
microtime(&req->ktr_header.ktr_time);
req->ktr_header.ktr_pid = p->p_pid;
req->ktr_header.ktr_tid = td->td_tid;
bcopy(td->td_name, req->ktr_header.ktr_comm,
sizeof(req->ktr_header.ktr_comm));
req->ktr_buffer = NULL;
req->ktr_header.ktr_len = 0;
} else {
p->p_traceflag |= KTRFAC_DROP;
pm = print_message;
print_message = 0;
mtx_unlock(&ktrace_mtx);
if (pm)
printf("Out of ktrace request objects.\n");
}
return (req);
}
static struct ktr_request *
ktr_getrequest(int type)
{
struct thread *td = curthread;
struct ktr_request *req;
ktrace_enter(td);
req = ktr_getrequest_entered(td, type);
if (req == NULL)
ktrace_exit(td);
return (req);
}
/*
* Some trace generation environments don't permit direct access to VFS,
* such as during a context switch where sleeping is not allowed. Under these
* circumstances, queue a request to the thread to be written asynchronously
* later.
*/
static void
ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
{
mtx_lock(&ktrace_mtx);
STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
mtx_unlock(&ktrace_mtx);
}
/*
* Drain any pending ktrace records from the per-thread queue to disk. This
* is used both internally before committing other records, and also on
* system call return. We drain all the ones we can find at the time when
* drain is requested, but don't keep draining after that as those events
* may be approximately "after" the current event.
*/
static void
ktr_drain(struct thread *td)
{
struct ktr_request *queued_req;
STAILQ_HEAD(, ktr_request) local_queue;
ktrace_assert(td);
sx_assert(&ktrace_sx, SX_XLOCKED);
STAILQ_INIT(&local_queue);
if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
mtx_lock(&ktrace_mtx);
STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
mtx_unlock(&ktrace_mtx);
while ((queued_req = STAILQ_FIRST(&local_queue))) {
STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
ktr_writerequest(td, queued_req);
ktr_freerequest(queued_req);
}
}
}
/*
* Submit a trace record for immediate commit to disk -- to be used only
* where entering VFS is OK. First drain any pending records that may have
* been cached in the thread.
*/
static void
ktr_submitrequest(struct thread *td, struct ktr_request *req)
{
ktrace_assert(td);
sx_xlock(&ktrace_sx);
ktr_drain(td);
ktr_writerequest(td, req);
ktr_freerequest(req);
sx_xunlock(&ktrace_sx);
ktrace_exit(td);
}
static void
ktr_freerequest(struct ktr_request *req)
{
mtx_lock(&ktrace_mtx);
ktr_freerequest_locked(req);
mtx_unlock(&ktrace_mtx);
}
static void
ktr_freerequest_locked(struct ktr_request *req)
{
mtx_assert(&ktrace_mtx, MA_OWNED);
if (req->ktr_buffer != NULL)
free(req->ktr_buffer, M_KTRACE);
STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
}
/*
* Disable tracing for a process and release all associated resources.
* The caller is responsible for releasing a reference on the returned
* vnode and credentials.
*/
static void
ktr_freeproc(struct proc *p, struct ucred **uc, struct vnode **vp)
{
struct ktr_request *req;
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_assert(&ktrace_mtx, MA_OWNED);
*uc = p->p_tracecred;
p->p_tracecred = NULL;
if (vp != NULL)
*vp = p->p_tracevp;
p->p_tracevp = NULL;
p->p_traceflag = 0;
while ((req = STAILQ_FIRST(&p->p_ktr)) != NULL) {
STAILQ_REMOVE_HEAD(&p->p_ktr, ktr_list);
ktr_freerequest_locked(req);
}
}
void
ktrsyscall(code, narg, args)
int code, narg;
register_t args[];
{
struct ktr_request *req;
struct ktr_syscall *ktp;
size_t buflen;
char *buf = NULL;
buflen = sizeof(register_t) * narg;
if (buflen > 0) {
buf = malloc(buflen, M_KTRACE, M_WAITOK);
bcopy(args, buf, buflen);
}
req = ktr_getrequest(KTR_SYSCALL);
if (req == NULL) {
if (buf != NULL)
free(buf, M_KTRACE);
return;
}
ktp = &req->ktr_data.ktr_syscall;
ktp->ktr_code = code;
ktp->ktr_narg = narg;
if (buflen > 0) {
req->ktr_header.ktr_len = buflen;
req->ktr_buffer = buf;
}
ktr_submitrequest(curthread, req);
}
void
ktrsysret(code, error, retval)
int code, error;
register_t retval;
{
struct ktr_request *req;
struct ktr_sysret *ktp;
req = ktr_getrequest(KTR_SYSRET);
if (req == NULL)
return;
ktp = &req->ktr_data.ktr_sysret;
ktp->ktr_code = code;
ktp->ktr_error = error;
ktp->ktr_retval = retval; /* what about val2 ? */
ktr_submitrequest(curthread, req);
}
/*
* When a setuid process execs, disable tracing.
*
* XXX: We toss any pending asynchronous records.
*/
void
ktrprocexec(struct proc *p, struct ucred **uc, struct vnode **vp)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, uc, vp);
mtx_unlock(&ktrace_mtx);
}
/*
* When a process exits, drain per-process asynchronous trace records
* and disable tracing.
*/
void
ktrprocexit(struct thread *td)
{
struct ktr_request *req;
struct proc *p;
struct ucred *cred;
struct vnode *vp;
int vfslocked;
p = td->td_proc;
if (p->p_traceflag == 0)
return;
ktrace_enter(td);
req = ktr_getrequest_entered(td, KTR_PROCDTOR);
if (req != NULL)
ktr_enqueuerequest(td, req);
sx_xlock(&ktrace_sx);
ktr_drain(td);
sx_xunlock(&ktrace_sx);
PROC_LOCK(p);
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, &vp);
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p);
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (cred != NULL)
crfree(cred);
ktrace_exit(td);
}
static void
ktrprocctor_entered(struct thread *td, struct proc *p)
{
struct ktr_proc_ctor *ktp;
struct ktr_request *req;
struct thread *td2;
ktrace_assert(td);
td2 = FIRST_THREAD_IN_PROC(p);
req = ktr_getrequest_entered(td2, KTR_PROCCTOR);
if (req == NULL)
return;
ktp = &req->ktr_data.ktr_proc_ctor;
ktp->sv_flags = p->p_sysent->sv_flags;
ktr_enqueuerequest(td2, req);
}
void
ktrprocctor(struct proc *p)
{
struct thread *td = curthread;
if ((p->p_traceflag & KTRFAC_MASK) == 0)
return;
ktrace_enter(td);
ktrprocctor_entered(td, p);
ktrace_exit(td);
}
/*
* When a process forks, enable tracing in the new process if needed.
*/
void
ktrprocfork(struct proc *p1, struct proc *p2)
{
PROC_LOCK(p1);
mtx_lock(&ktrace_mtx);
KASSERT(p2->p_tracevp == NULL, ("new process has a ktrace vnode"));
if (p1->p_traceflag & KTRFAC_INHERIT) {
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracevp = p1->p_tracevp) != NULL) {
VREF(p2->p_tracevp);
KASSERT(p1->p_tracecred != NULL,
("ktrace vnode with no cred"));
p2->p_tracecred = crhold(p1->p_tracecred);
}
}
mtx_unlock(&ktrace_mtx);
PROC_UNLOCK(p1);
ktrprocctor(p2);
}
/*
* When a thread returns, drain any asynchronous records generated by the
* system call.
*/
void
ktruserret(struct thread *td)
{
ktrace_enter(td);
sx_xlock(&ktrace_sx);
ktr_drain(td);
sx_xunlock(&ktrace_sx);
ktrace_exit(td);
}
void
ktrnamei(path)
char *path;
{
struct ktr_request *req;
int namelen;
char *buf = NULL;
namelen = strlen(path);
if (namelen > 0) {
buf = malloc(namelen, M_KTRACE, M_WAITOK);
bcopy(path, buf, namelen);
}
req = ktr_getrequest(KTR_NAMEI);
if (req == NULL) {
if (buf != NULL)
free(buf, M_KTRACE);
return;
}
if (namelen > 0) {
req->ktr_header.ktr_len = namelen;
req->ktr_buffer = buf;
}
ktr_submitrequest(curthread, req);
}
void
ktrsysctl(name, namelen)
int *name;
u_int namelen;
{
struct ktr_request *req;
u_int mib[CTL_MAXNAME + 2];
char *mibname;
size_t mibnamelen;
int error;
/* Lookup name of mib. */
KASSERT(namelen <= CTL_MAXNAME, ("sysctl MIB too long"));
mib[0] = 0;
mib[1] = 1;
bcopy(name, mib + 2, namelen * sizeof(*name));
mibnamelen = 128;
mibname = malloc(mibnamelen, M_KTRACE, M_WAITOK);
error = kernel_sysctl(curthread, mib, namelen + 2, mibname, &mibnamelen,
NULL, 0, &mibnamelen, 0);
if (error) {
free(mibname, M_KTRACE);
return;
}
req = ktr_getrequest(KTR_SYSCTL);
if (req == NULL) {
free(mibname, M_KTRACE);
return;
}
req->ktr_header.ktr_len = mibnamelen;
req->ktr_buffer = mibname;
ktr_submitrequest(curthread, req);
}
void
ktrgenio(fd, rw, uio, error)
int fd;
enum uio_rw rw;
struct uio *uio;
int error;
{
struct ktr_request *req;
struct ktr_genio *ktg;
int datalen;
char *buf;
if (error) {
free(uio, M_IOV);
return;
}
uio->uio_offset = 0;
uio->uio_rw = UIO_WRITE;
datalen = imin(uio->uio_resid, ktr_geniosize);
buf = malloc(datalen, M_KTRACE, M_WAITOK);
error = uiomove(buf, datalen, uio);
free(uio, M_IOV);
if (error) {
free(buf, M_KTRACE);
return;
}
req = ktr_getrequest(KTR_GENIO);
if (req == NULL) {
free(buf, M_KTRACE);
return;
}
ktg = &req->ktr_data.ktr_genio;
ktg->ktr_fd = fd;
ktg->ktr_rw = rw;
req->ktr_header.ktr_len = datalen;
req->ktr_buffer = buf;
ktr_submitrequest(curthread, req);
}
void
ktrpsig(sig, action, mask, code)
int sig;
sig_t action;
sigset_t *mask;
int code;
{
struct thread *td = curthread;
struct ktr_request *req;
struct ktr_psig *kp;
req = ktr_getrequest(KTR_PSIG);
if (req == NULL)
return;
kp = &req->ktr_data.ktr_psig;
kp->signo = (char)sig;
kp->action = action;
kp->mask = *mask;
kp->code = code;
ktr_enqueuerequest(td, req);
ktrace_exit(td);
}
void
ktrcsw(out, user)
int out, user;
{
struct thread *td = curthread;
struct ktr_request *req;
struct ktr_csw *kc;
req = ktr_getrequest(KTR_CSW);
if (req == NULL)
return;
kc = &req->ktr_data.ktr_csw;
kc->out = out;
kc->user = user;
ktr_enqueuerequest(td, req);
ktrace_exit(td);
}
void
ktrstruct(name, data, datalen)
const char *name;
void *data;
size_t datalen;
{
struct ktr_request *req;
char *buf = NULL;
size_t buflen;
if (!data)
datalen = 0;
buflen = strlen(name) + 1 + datalen;
buf = malloc(buflen, M_KTRACE, M_WAITOK);
strcpy(buf, name);
bcopy(data, buf + strlen(name) + 1, datalen);
if ((req = ktr_getrequest(KTR_STRUCT)) == NULL) {
free(buf, M_KTRACE);
return;
}
req->ktr_buffer = buf;
req->ktr_header.ktr_len = buflen;
ktr_submitrequest(curthread, req);
}
#endif /* KTRACE */
/* Interface and common routines */
#ifndef _SYS_SYSPROTO_H_
struct ktrace_args {
char *fname;
int ops;
int facs;
int pid;
};
#endif
/* ARGSUSED */
int
-ktrace(td, uap)
+sys_ktrace(td, uap)
struct thread *td;
register struct ktrace_args *uap;
{
#ifdef KTRACE
register struct vnode *vp = NULL;
register struct proc *p;
struct pgrp *pg;
int facs = uap->facs & ~KTRFAC_ROOT;
int ops = KTROP(uap->ops);
int descend = uap->ops & KTRFLAG_DESCEND;
int nfound, ret = 0;
int flags, error = 0, vfslocked;
struct nameidata nd;
struct ucred *cred;
/*
* Need something to (un)trace.
*/
if (ops != KTROP_CLEARFILE && facs == 0)
return (EINVAL);
ktrace_enter(td);
if (ops != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
uap->fname, td);
flags = FREAD | FWRITE | O_NOFOLLOW;
error = vn_open(&nd, &flags, 0, NULL);
if (error) {
ktrace_exit(td);
return (error);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
VOP_UNLOCK(vp, 0);
if (vp->v_type != VREG) {
(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
ktrace_exit(td);
return (EACCES);
}
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* Clear all uses of the tracefile.
*/
if (ops == KTROP_CLEARFILE) {
int vrele_count;
vrele_count = 0;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
if (ktrcanset(td, p)) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
crfree(cred);
} else
error = EPERM;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
if (vrele_count > 0) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
goto done;
}
/*
* do it
*/
sx_slock(&proctree_lock);
if (uap->pid < 0) {
/*
* by process group
*/
pg = pgfind(-uap->pid);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
/*
* ktrops() may call vrele(). Lock pg_members
* by the proctree_lock rather than pg_mtx.
*/
PGRP_UNLOCK(pg);
nfound = 0;
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW ||
p_cansee(td, p) != 0) {
PROC_UNLOCK(p);
continue;
}
nfound++;
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
if (nfound == 0) {
sx_sunlock(&proctree_lock);
error = ESRCH;
goto done;
}
} else {
/*
* by pid
*/
p = pfind(uap->pid);
if (p == NULL)
error = ESRCH;
else
error = p_cansee(td, p);
if (error) {
if (p != NULL)
PROC_UNLOCK(p);
sx_sunlock(&proctree_lock);
goto done;
}
if (descend)
ret |= ktrsetchildren(td, p, ops, facs, vp);
else
ret |= ktrops(td, p, ops, facs, vp);
}
sx_sunlock(&proctree_lock);
if (!ret)
error = EPERM;
done:
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) vn_close(vp, FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
}
ktrace_exit(td);
return (error);
#else /* !KTRACE */
return (ENOSYS);
#endif /* KTRACE */
}
/* ARGSUSED */
int
-utrace(td, uap)
+sys_utrace(td, uap)
struct thread *td;
register struct utrace_args *uap;
{
#ifdef KTRACE
struct ktr_request *req;
void *cp;
int error;
if (!KTRPOINT(td, KTR_USER))
return (0);
if (uap->len > KTR_USER_MAXLEN)
return (EINVAL);
cp = malloc(uap->len, M_KTRACE, M_WAITOK);
error = copyin(uap->addr, cp, uap->len);
if (error) {
free(cp, M_KTRACE);
return (error);
}
req = ktr_getrequest(KTR_USER);
if (req == NULL) {
free(cp, M_KTRACE);
return (ENOMEM);
}
req->ktr_buffer = cp;
req->ktr_header.ktr_len = uap->len;
ktr_submitrequest(td, req);
return (0);
#else /* !KTRACE */
return (ENOSYS);
#endif /* KTRACE */
}
#ifdef KTRACE
static int
ktrops(td, p, ops, facs, vp)
struct thread *td;
struct proc *p;
int ops, facs;
struct vnode *vp;
{
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!ktrcanset(td, p)) {
PROC_UNLOCK(p);
return (0);
}
if (p->p_flag & P_WEXIT) {
/* If the process is exiting, just ignore it. */
PROC_UNLOCK(p);
return (1);
}
mtx_lock(&ktrace_mtx);
if (ops == KTROP_SET) {
if (p->p_tracevp != vp) {
/*
* if trace file already in use, relinquish below
*/
tracevp = p->p_tracevp;
VREF(vp);
p->p_tracevp = vp;
}
if (p->p_tracecred != td->td_ucred) {
tracecred = p->p_tracecred;
p->p_tracecred = crhold(td->td_ucred);
}
p->p_traceflag |= facs;
if (priv_check(td, PRIV_KTRACE) == 0)
p->p_traceflag |= KTRFAC_ROOT;
} else {
/* KTROP_CLEAR */
if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0)
/* no more tracing */
ktr_freeproc(p, &tracecred, &tracevp);
}
mtx_unlock(&ktrace_mtx);
if ((p->p_traceflag & KTRFAC_MASK) != 0)
ktrprocctor_entered(td, p);
PROC_UNLOCK(p);
if (tracevp != NULL) {
int vfslocked;
vfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
vrele(tracevp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (tracecred != NULL)
crfree(tracecred);
return (1);
}
static int
ktrsetchildren(td, top, ops, facs, vp)
struct thread *td;
struct proc *top;
int ops, facs;
struct vnode *vp;
{
register struct proc *p;
register int ret = 0;
p = top;
PROC_LOCK_ASSERT(p, MA_OWNED);
sx_assert(&proctree_lock, SX_LOCKED);
for (;;) {
ret |= ktrops(td, p, ops, facs, vp);
/*
* If this process has children, descend to them next,
* otherwise do any siblings, and if done with this level,
* follow back up the tree (but not past top).
*/
if (!LIST_EMPTY(&p->p_children))
p = LIST_FIRST(&p->p_children);
else for (;;) {
if (p == top)
return (ret);
if (LIST_NEXT(p, p_sibling)) {
p = LIST_NEXT(p, p_sibling);
break;
}
p = p->p_pptr;
}
PROC_LOCK(p);
}
/*NOTREACHED*/
}
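/*
 * The loop above is a non-recursive pre-order walk of the process tree
 * rooted at "top".  A generic sketch of the same pattern (hypothetical
 * node type, illustrative only):
 *
 *	node = top;
 *	for (;;) {
 *		visit(node);
 *		if (node->first_child != NULL)
 *			node = node->first_child;
 *		else for (;;) {
 *			if (node == top)
 *				return;
 *			if (node->next_sibling != NULL) {
 *				node = node->next_sibling;
 *				break;
 *			}
 *			node = node->parent;
 *		}
 *	}
 */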
static void
ktr_writerequest(struct thread *td, struct ktr_request *req)
{
struct ktr_header *kth;
struct vnode *vp;
struct proc *p;
struct ucred *cred;
struct uio auio;
struct iovec aiov[3];
struct mount *mp;
int datalen, buflen, vrele_count;
int error, vfslocked;
/*
* We hold the vnode and credential for use in I/O in case ktrace is
* disabled on the process as we write out the request.
*
* XXXRW: This is not ideal: we could end up performing a write after
* the vnode has been closed.
*/
mtx_lock(&ktrace_mtx);
vp = td->td_proc->p_tracevp;
cred = td->td_proc->p_tracecred;
/*
* If vp is NULL, the vp has been cleared out from under this
* request, so just drop it. Make sure the credential and vnode are
* in sync: we should have both or neither.
*/
if (vp == NULL) {
KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
mtx_unlock(&ktrace_mtx);
return;
}
VREF(vp);
KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
crhold(cred);
mtx_unlock(&ktrace_mtx);
kth = &req->ktr_header;
KASSERT(((u_short)kth->ktr_type & ~KTR_DROP) <
sizeof(data_lengths) / sizeof(data_lengths[0]),
("data_lengths array overflow"));
datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
buflen = kth->ktr_len;
auio.uio_iov = &aiov[0];
auio.uio_offset = 0;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_rw = UIO_WRITE;
aiov[0].iov_base = (caddr_t)kth;
aiov[0].iov_len = sizeof(struct ktr_header);
auio.uio_resid = sizeof(struct ktr_header);
auio.uio_iovcnt = 1;
auio.uio_td = td;
if (datalen != 0) {
aiov[1].iov_base = (caddr_t)&req->ktr_data;
aiov[1].iov_len = datalen;
auio.uio_resid += datalen;
auio.uio_iovcnt++;
kth->ktr_len += datalen;
}
if (buflen != 0) {
KASSERT(req->ktr_buffer != NULL, ("ktrace: nothing to write"));
aiov[auio.uio_iovcnt].iov_base = req->ktr_buffer;
aiov[auio.uio_iovcnt].iov_len = buflen;
auio.uio_resid += buflen;
auio.uio_iovcnt++;
}
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_write(cred, NOCRED, vp);
if (error == 0)
#endif
error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
crfree(cred);
if (!error) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return;
}
VFS_UNLOCK_GIANT(vfslocked);
/*
* If error encountered, give up tracing on this vnode. We defer
* all the vrele()'s on the vnode until after we are finished walking
* the various lists to avoid needlessly holding locks.
* NB: at this point we still hold the vnode reference that must
* not go away as we need the valid vnode to compare with. Thus let
* vrele_count start at 1 and the reference will be freed
* by the loop at the end after our last use of vp.
*/
log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n",
error);
vrele_count = 1;
/*
* First, clear this vnode from being used by any processes in the
* system.
* XXX - If one process gets an EPERM writing to the vnode, should
* we really do this? Other processes might have suitable
* credentials for the operation.
*/
cred = NULL;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_tracevp == vp) {
mtx_lock(&ktrace_mtx);
ktr_freeproc(p, &cred, NULL);
mtx_unlock(&ktrace_mtx);
vrele_count++;
}
PROC_UNLOCK(p);
if (cred != NULL) {
crfree(cred);
cred = NULL;
}
}
sx_sunlock(&allproc_lock);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
while (vrele_count-- > 0)
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* Return true if caller has permission to set the ktracing state
* of target. Essentially, the target can't possess any
* more permissions than the caller. KTRFAC_ROOT signifies that
* root previously set the tracing status on the target process, and
* so, only root may further change it.
*/
static int
ktrcanset(td, targetp)
struct thread *td;
struct proc *targetp;
{
PROC_LOCK_ASSERT(targetp, MA_OWNED);
if (targetp->p_traceflag & KTRFAC_ROOT &&
priv_check(td, PRIV_KTRACE))
return (0);
if (p_candebug(td, targetp) != 0)
return (0);
return (1);
}
#endif /* KTRACE */
Index: head/sys/kern/kern_linker.c
===================================================================
--- head/sys/kern/kern_linker.c (revision 225616)
+++ head/sys/kern/kern_linker.c (revision 225617)
@@ -1,2170 +1,2170 @@
/*-
* Copyright (c) 1997-2000 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_hwpmc_hooks.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/linker.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/libkern.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <net/vnet.h>
#include <security/mac/mac_framework.h>
#include "linker_if.h"
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#ifdef KLD_DEBUG
int kld_debug = 0;
SYSCTL_INT(_debug, OID_AUTO, kld_debug, CTLFLAG_RW,
&kld_debug, 0, "Set various levels of KLD debug");
#endif
#define KLD_LOCK() sx_xlock(&kld_sx)
#define KLD_UNLOCK() sx_xunlock(&kld_sx)
#define KLD_DOWNGRADE() sx_downgrade(&kld_sx)
#define KLD_LOCK_READ() sx_slock(&kld_sx)
#define KLD_UNLOCK_READ() sx_sunlock(&kld_sx)
#define KLD_LOCKED() sx_xlocked(&kld_sx)
#define KLD_LOCK_ASSERT() do { \
if (!cold) \
sx_assert(&kld_sx, SX_XLOCKED); \
} while (0)
/*
* static char *linker_search_path(const char *name, struct mod_depend
* *verinfo);
*/
static const char *linker_basename(const char *path);
/*
* Find a currently loaded file given its filename.
*/
static linker_file_t linker_find_file_by_name(const char* _filename);
/*
* Find a currently loaded file given its file id.
*/
static linker_file_t linker_find_file_by_id(int _fileid);
/* Metadata from the static kernel */
SET_DECLARE(modmetadata_set, struct mod_metadata);
MALLOC_DEFINE(M_LINKER, "linker", "kernel linker");
linker_file_t linker_kernel_file;
static struct sx kld_sx; /* kernel linker lock */
/*
* Load counter used by clients to determine if a linker file has been
* re-loaded. This counter is incremented for each file load.
*/
static int loadcnt;
static linker_class_list_t classes;
static linker_file_list_t linker_files;
static int next_file_id = 1;
static int linker_no_more_classes = 0;
#define LINKER_GET_NEXT_FILE_ID(a) do { \
linker_file_t lftmp; \
\
KLD_LOCK_ASSERT(); \
retry: \
TAILQ_FOREACH(lftmp, &linker_files, link) { \
if (next_file_id == lftmp->id) { \
next_file_id++; \
goto retry; \
} \
} \
(a) = next_file_id; \
} while(0)
/* XXX wrong name; we're looking at version provision tags here, not modules */
typedef TAILQ_HEAD(, modlist) modlisthead_t;
struct modlist {
TAILQ_ENTRY(modlist) link; /* chain together all modules */
linker_file_t container;
const char *name;
int version;
};
typedef struct modlist *modlist_t;
static modlisthead_t found_modules;
static int linker_file_add_dependency(linker_file_t file,
linker_file_t dep);
static caddr_t linker_file_lookup_symbol_internal(linker_file_t file,
const char* name, int deps);
static int linker_load_module(const char *kldname,
const char *modname, struct linker_file *parent,
struct mod_depend *verinfo, struct linker_file **lfpp);
static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo);
static char *
linker_strdup(const char *str)
{
char *result;
if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL)
strcpy(result, str);
return (result);
}
static void
linker_init(void *arg)
{
sx_init(&kld_sx, "kernel linker");
TAILQ_INIT(&classes);
TAILQ_INIT(&linker_files);
}
SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0);
static void
linker_stop_class_add(void *arg)
{
linker_no_more_classes = 1;
}
SYSINIT(linker_class, SI_SUB_KLD, SI_ORDER_ANY, linker_stop_class_add, NULL);
int
linker_add_class(linker_class_t lc)
{
/*
* We disallow any class registration past SI_ORDER_ANY
* of SI_SUB_KLD. We bump the reference count to keep the
* ops from being freed.
*/
if (linker_no_more_classes == 1)
return (EPERM);
kobj_class_compile((kobj_class_t) lc);
((kobj_class_t)lc)->refs++; /* XXX: kobj_mtx */
TAILQ_INSERT_TAIL(&classes, lc, link);
return (0);
}
static void
linker_file_sysinit(linker_file_t lf)
{
struct sysinit **start, **stop, **sipp, **xipp, *save;
KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n",
lf->filename));
if (linker_file_lookup_set(lf, "sysinit_set", &start, &stop, NULL) != 0)
return;
/*
* Perform a bubble sort of the system initialization objects by
* their subsystem (primary key) and order (secondary key).
*
* Since some things care about execution order, this is the operation
* which ensures continued function.
*/
for (sipp = start; sipp < stop; sipp++) {
for (xipp = sipp + 1; xipp < stop; xipp++) {
if ((*sipp)->subsystem < (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order <= (*xipp)->order))
continue; /* skip */
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*/
mtx_lock(&Giant);
for (sipp = start; sipp < stop; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s) */
/* Call function */
(*((*sipp)->func)) ((*sipp)->udata);
}
mtx_unlock(&Giant);
}
static void
linker_file_sysuninit(linker_file_t lf)
{
struct sysinit **start, **stop, **sipp, **xipp, *save;
KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n",
lf->filename));
if (linker_file_lookup_set(lf, "sysuninit_set", &start, &stop,
NULL) != 0)
return;
/*
* Perform a reverse bubble sort of the system initialization objects
* by their subsystem (primary key) and order (secondary key).
*
* Since some things care about execution order, this is the operation
* which ensures continued function.
*/
for (sipp = start; sipp < stop; sipp++) {
for (xipp = sipp + 1; xipp < stop; xipp++) {
if ((*sipp)->subsystem > (*xipp)->subsystem ||
((*sipp)->subsystem == (*xipp)->subsystem &&
(*sipp)->order >= (*xipp)->order))
continue; /* skip */
save = *sipp;
*sipp = *xipp;
*xipp = save;
}
}
/*
* Traverse the (now) ordered list of system initialization tasks.
* Perform each task, and continue on to the next task.
*/
mtx_lock(&Giant);
for (sipp = start; sipp < stop; sipp++) {
if ((*sipp)->subsystem == SI_SUB_DUMMY)
continue; /* skip dummy task(s) */
/* Call function */
(*((*sipp)->func)) ((*sipp)->udata);
}
mtx_unlock(&Giant);
}
static void
linker_file_register_sysctls(linker_file_t lf)
{
struct sysctl_oid **start, **stop, **oidp;
KLD_DPF(FILE,
("linker_file_register_sysctls: registering SYSCTLs for %s\n",
lf->filename));
if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
return;
sysctl_lock();
for (oidp = start; oidp < stop; oidp++)
sysctl_register_oid(*oidp);
sysctl_unlock();
}
static void
linker_file_unregister_sysctls(linker_file_t lf)
{
struct sysctl_oid **start, **stop, **oidp;
KLD_DPF(FILE, ("linker_file_unregister_sysctls: registering SYSCTLs"
" for %s\n", lf->filename));
if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
return;
sysctl_lock();
for (oidp = start; oidp < stop; oidp++)
sysctl_unregister_oid(*oidp);
sysctl_unlock();
}
static int
linker_file_register_modules(linker_file_t lf)
{
struct mod_metadata **start, **stop, **mdp;
const moduledata_t *moddata;
int first_error, error;
KLD_DPF(FILE, ("linker_file_register_modules: registering modules"
" in %s\n", lf->filename));
if (linker_file_lookup_set(lf, "modmetadata_set", &start,
&stop, NULL) != 0) {
/*
* This fallback should be unnecessary, but if we get booted
* from boot2 instead of loader and we are missing our
* metadata then we have to try the best we can.
*/
if (lf == linker_kernel_file) {
start = SET_BEGIN(modmetadata_set);
stop = SET_LIMIT(modmetadata_set);
} else
return (0);
}
first_error = 0;
for (mdp = start; mdp < stop; mdp++) {
if ((*mdp)->md_type != MDT_MODULE)
continue;
moddata = (*mdp)->md_data;
KLD_DPF(FILE, ("Registering module %s in %s\n",
moddata->name, lf->filename));
error = module_register(moddata, lf);
if (error) {
printf("Module %s failed to register: %d\n",
moddata->name, error);
if (first_error == 0)
first_error = error;
}
}
return (first_error);
}
static void
linker_init_kernel_modules(void)
{
linker_file_register_modules(linker_kernel_file);
}
SYSINIT(linker_kernel, SI_SUB_KLD, SI_ORDER_ANY, linker_init_kernel_modules,
0);
static int
linker_load_file(const char *filename, linker_file_t *result)
{
linker_class_t lc;
linker_file_t lf;
int foundfile, error;
/* Refuse to load modules if securelevel raised */
if (prison0.pr_securelevel > 0)
return (EPERM);
KLD_LOCK_ASSERT();
lf = linker_find_file_by_name(filename);
if (lf) {
KLD_DPF(FILE, ("linker_load_file: file %s is already loaded,"
" incrementing refs\n", filename));
*result = lf;
lf->refs++;
return (0);
}
foundfile = 0;
error = 0;
/*
* We do not need to protect (lock) classes here because there is
* no class registration past startup (SI_SUB_KLD, SI_ORDER_ANY)
* and there is no class deregistration mechanism at this time.
*/
TAILQ_FOREACH(lc, &classes, link) {
KLD_DPF(FILE, ("linker_load_file: trying to load %s\n",
filename));
error = LINKER_LOAD_FILE(lc, filename, &lf);
/*
* If we got something other than ENOENT, then it exists but
* we cannot load it for some other reason.
*/
if (error != ENOENT)
foundfile = 1;
if (lf) {
error = linker_file_register_modules(lf);
if (error == EEXIST) {
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
return (error);
}
KLD_UNLOCK();
linker_file_register_sysctls(lf);
linker_file_sysinit(lf);
KLD_LOCK();
lf->flags |= LINKER_FILE_LINKED;
*result = lf;
return (0);
}
}
/*
* Less than ideal, but tells the user whether it failed to load or
* the module was not found.
*/
if (foundfile) {
/*
* If the file type has not been recognized by the last try
* printout a message before to fail.
*/
if (error == ENOSYS)
printf("linker_load_file: Unsupported file type\n");
/*
* Format not recognized or otherwise unloadable.
* When loading a module that is statically built into
* the kernel EEXIST percolates back up as the return
* value. Preserve this so that apps like sysinstall
* can recognize this special case and not post bogus
* dialog boxes.
*/
if (error != EEXIST)
error = ENOEXEC;
} else
error = ENOENT; /* Nothing found */
return (error);
}
int
linker_reference_module(const char *modname, struct mod_depend *verinfo,
linker_file_t *result)
{
modlist_t mod;
int error;
KLD_LOCK();
if ((mod = modlist_lookup2(modname, verinfo)) != NULL) {
*result = mod->container;
(*result)->refs++;
KLD_UNLOCK();
return (0);
}
error = linker_load_module(NULL, modname, NULL, verinfo, result);
KLD_UNLOCK();
return (error);
}
int
linker_release_module(const char *modname, struct mod_depend *verinfo,
linker_file_t lf)
{
modlist_t mod;
int error;
KLD_LOCK();
if (lf == NULL) {
KASSERT(modname != NULL,
("linker_release_module: no file or name"));
mod = modlist_lookup2(modname, verinfo);
if (mod == NULL) {
KLD_UNLOCK();
return (ESRCH);
}
lf = mod->container;
} else
KASSERT(modname == NULL && verinfo == NULL,
("linker_release_module: both file and name"));
error = linker_file_unload(lf, LINKER_UNLOAD_NORMAL);
KLD_UNLOCK();
return (error);
}
static linker_file_t
linker_find_file_by_name(const char *filename)
{
linker_file_t lf;
char *koname;
koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
sprintf(koname, "%s.ko", filename);
KLD_LOCK_ASSERT();
TAILQ_FOREACH(lf, &linker_files, link) {
if (strcmp(lf->filename, koname) == 0)
break;
if (strcmp(lf->filename, filename) == 0)
break;
}
free(koname, M_LINKER);
return (lf);
}
static linker_file_t
linker_find_file_by_id(int fileid)
{
linker_file_t lf;
KLD_LOCK_ASSERT();
TAILQ_FOREACH(lf, &linker_files, link)
if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED)
break;
return (lf);
}
int
linker_file_foreach(linker_predicate_t *predicate, void *context)
{
linker_file_t lf;
int retval = 0;
KLD_LOCK();
TAILQ_FOREACH(lf, &linker_files, link) {
retval = predicate(lf, context);
if (retval != 0)
break;
}
KLD_UNLOCK();
return (retval);
}
linker_file_t
linker_make_file(const char *pathname, linker_class_t lc)
{
linker_file_t lf;
const char *filename;
KLD_LOCK_ASSERT();
filename = linker_basename(pathname);
KLD_DPF(FILE, ("linker_make_file: new file, filename='%s' for pathname='%s'\n", filename, pathname));
lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK);
if (lf == NULL)
return (NULL);
lf->refs = 1;
lf->userrefs = 0;
lf->flags = 0;
lf->filename = linker_strdup(filename);
lf->pathname = linker_strdup(pathname);
LINKER_GET_NEXT_FILE_ID(lf->id);
lf->ndeps = 0;
lf->deps = NULL;
lf->loadcnt = ++loadcnt;
lf->sdt_probes = NULL;
lf->sdt_nprobes = 0;
STAILQ_INIT(&lf->common);
TAILQ_INIT(&lf->modules);
TAILQ_INSERT_TAIL(&linker_files, lf, link);
return (lf);
}
int
linker_file_unload(linker_file_t file, int flags)
{
module_t mod, next;
modlist_t ml, nextml;
struct common_symbol *cp;
int error, i;
/* Refuse to unload modules if securelevel raised. */
if (prison0.pr_securelevel > 0)
return (EPERM);
KLD_LOCK_ASSERT();
KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
/* Easy case of just dropping a reference. */
if (file->refs > 1) {
file->refs--;
return (0);
}
KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
" informing modules\n"));
/*
* Quiesce all the modules to give them a chance to veto the unload.
*/
MOD_SLOCK;
for (mod = TAILQ_FIRST(&file->modules); mod;
mod = module_getfnext(mod)) {
error = module_quiesce(mod);
if (error != 0 && flags != LINKER_UNLOAD_FORCE) {
KLD_DPF(FILE, ("linker_file_unload: module %s"
" vetoed unload\n", module_getname(mod)));
/*
* XXX: Do we need to tell all the quiesced modules
* that they can resume work now via a new module
* event?
*/
MOD_SUNLOCK;
return (error);
}
}
MOD_SUNLOCK;
/*
* Inform any modules associated with this file that they are
* being unloaded.
*/
MOD_XLOCK;
for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
next = module_getfnext(mod);
MOD_XUNLOCK;
/*
* Give the module a chance to veto the unload.
*/
if ((error = module_unload(mod)) != 0) {
KLD_DPF(FILE, ("linker_file_unload: module %s"
" failed unload\n", module_getname(mod)));
return (error);
}
MOD_XLOCK;
module_release(mod);
}
MOD_XUNLOCK;
TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) {
if (ml->container == file) {
TAILQ_REMOVE(&found_modules, ml, link);
free(ml, M_LINKER);
}
}
/*
* Don't try to run SYSUNINITs if we are unloaded due to a
* link error.
*/
if (file->flags & LINKER_FILE_LINKED) {
file->flags &= ~LINKER_FILE_LINKED;
KLD_UNLOCK();
linker_file_sysuninit(file);
linker_file_unregister_sysctls(file);
KLD_LOCK();
}
TAILQ_REMOVE(&linker_files, file, link);
if (file->deps) {
for (i = 0; i < file->ndeps; i++)
linker_file_unload(file->deps[i], flags);
free(file->deps, M_LINKER);
file->deps = NULL;
}
while ((cp = STAILQ_FIRST(&file->common)) != NULL) {
STAILQ_REMOVE_HEAD(&file->common, link);
free(cp, M_LINKER);
}
LINKER_UNLOAD(file);
if (file->filename) {
free(file->filename, M_LINKER);
file->filename = NULL;
}
if (file->pathname) {
free(file->pathname, M_LINKER);
file->pathname = NULL;
}
kobj_delete((kobj_t) file, M_LINKER);
return (0);
}
int
linker_ctf_get(linker_file_t file, linker_ctf_t *lc)
{
return (LINKER_CTF_GET(file, lc));
}
static int
linker_file_add_dependency(linker_file_t file, linker_file_t dep)
{
linker_file_t *newdeps;
KLD_LOCK_ASSERT();
newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *),
M_LINKER, M_WAITOK | M_ZERO);
if (newdeps == NULL)
return (ENOMEM);
if (file->deps) {
bcopy(file->deps, newdeps,
file->ndeps * sizeof(linker_file_t *));
free(file->deps, M_LINKER);
}
file->deps = newdeps;
file->deps[file->ndeps] = dep;
file->ndeps++;
KLD_DPF(FILE, ("linker_file_add_dependency:"
" adding %s as dependency for %s\n",
dep->filename, file->filename));
return (0);
}
/*
* Locate a linker set and its contents. This is a helper function to avoid
* linker_if.h exposure elsewhere. Note: firstp and lastp are really void **.
* This function is used in this file so we can avoid having lots of (void **)
* casts.
*/
int
linker_file_lookup_set(linker_file_t file, const char *name,
void *firstp, void *lastp, int *countp)
{
int error, locked;
locked = KLD_LOCKED();
if (!locked)
KLD_LOCK();
error = LINKER_LOOKUP_SET(file, name, firstp, lastp, countp);
if (!locked)
KLD_UNLOCK();
return (error);
}
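/*
 * A hedged usage sketch for linker_file_lookup_set() above (not part of
 * this change); "foo_set", struct foo_desc and do_something() are
 * hypothetical names, but the pattern mirrors the sysinit_set/sysctl_set
 * lookups elsewhere in this file:
 *
 *	struct foo_desc **start, **stop, **iter;
 *
 *	if (linker_file_lookup_set(lf, "foo_set", &start, &stop, NULL) == 0)
 *		for (iter = start; iter < stop; iter++)
 *			do_something(*iter);
 */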
/*
* List all functions in a file.
*/
int
linker_file_function_listall(linker_file_t lf,
linker_function_nameval_callback_t callback_func, void *arg)
{
return (LINKER_EACH_FUNCTION_NAMEVAL(lf, callback_func, arg));
}
caddr_t
linker_file_lookup_symbol(linker_file_t file, const char *name, int deps)
{
caddr_t sym;
int locked;
locked = KLD_LOCKED();
if (!locked)
KLD_LOCK();
sym = linker_file_lookup_symbol_internal(file, name, deps);
if (!locked)
KLD_UNLOCK();
return (sym);
}
static caddr_t
linker_file_lookup_symbol_internal(linker_file_t file, const char *name,
int deps)
{
c_linker_sym_t sym;
linker_symval_t symval;
caddr_t address;
size_t common_size = 0;
int i;
KLD_LOCK_ASSERT();
KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n",
file, name, deps));
if (LINKER_LOOKUP_SYMBOL(file, name, &sym) == 0) {
LINKER_SYMBOL_VALUES(file, sym, &symval);
if (symval.value == 0)
/*
* For commons, first look them up in the
* dependencies and only allocate space if not found
* there.
*/
common_size = symval.size;
else {
KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol"
".value=%p\n", symval.value));
return (symval.value);
}
}
if (deps) {
for (i = 0; i < file->ndeps; i++) {
address = linker_file_lookup_symbol_internal(
file->deps[i], name, 0);
if (address) {
KLD_DPF(SYM, ("linker_file_lookup_symbol:"
" deps value=%p\n", address));
return (address);
}
}
}
if (common_size > 0) {
/*
* This is a common symbol which was not found in the
* dependencies. We maintain a simple common symbol table in
* the file object.
*/
struct common_symbol *cp;
STAILQ_FOREACH(cp, &file->common, link) {
if (strcmp(cp->name, name) == 0) {
KLD_DPF(SYM, ("linker_file_lookup_symbol:"
" old common value=%p\n", cp->address));
return (cp->address);
}
}
/*
* Round the symbol size up to align.
*/
common_size = (common_size + sizeof(int) - 1) & -sizeof(int);
cp = malloc(sizeof(struct common_symbol)
+ common_size + strlen(name) + 1, M_LINKER,
M_WAITOK | M_ZERO);
cp->address = (caddr_t)(cp + 1);
cp->name = cp->address + common_size;
strcpy(cp->name, name);
bzero(cp->address, common_size);
STAILQ_INSERT_TAIL(&file->common, cp, link);
KLD_DPF(SYM, ("linker_file_lookup_symbol: new common"
" value=%p\n", cp->address));
return (cp->address);
}
KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n"));
return (0);
}
/*
* Both DDB and stack(9) rely on the kernel linker to provide forward and
* backward lookup of symbols. However, DDB and sometimes stack(9) need to
* do this in a lockfree manner. We provide a set of internal helper
* routines to perform these operations without locks, and then wrappers that
* optionally lock.
*
* linker_debug_lookup() is ifdef DDB as currently it's only used by DDB.
*/
#ifdef DDB
static int
linker_debug_lookup(const char *symstr, c_linker_sym_t *sym)
{
linker_file_t lf;
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_LOOKUP_SYMBOL(lf, symstr, sym) == 0)
return (0);
}
return (ENOENT);
}
#endif
static int
linker_debug_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
{
linker_file_t lf;
c_linker_sym_t best, es;
u_long diff, bestdiff, off;
best = 0;
off = (uintptr_t)value;
bestdiff = off;
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_SEARCH_SYMBOL(lf, value, &es, &diff) != 0)
continue;
if (es != 0 && diff < bestdiff) {
best = es;
bestdiff = diff;
}
if (bestdiff == 0)
break;
}
if (best) {
*sym = best;
*diffp = bestdiff;
return (0);
} else {
*sym = 0;
*diffp = off;
return (ENOENT);
}
}
static int
linker_debug_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
{
linker_file_t lf;
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_SYMBOL_VALUES(lf, sym, symval) == 0)
return (0);
}
return (ENOENT);
}
static int
linker_debug_search_symbol_name(caddr_t value, char *buf, u_int buflen,
long *offset)
{
linker_symval_t symval;
c_linker_sym_t sym;
int error;
*offset = 0;
error = linker_debug_search_symbol(value, &sym, offset);
if (error)
return (error);
error = linker_debug_symbol_values(sym, &symval);
if (error)
return (error);
strlcpy(buf, symval.name, buflen);
return (0);
}
/*
* DDB Helpers. DDB has to look across multiple files with their own symbol
* tables and string tables.
*
* Note that we do not obey list locking protocols here. We really don't need
* DDB to hang because somebody's got the lock held. We'll take the chance
* that the files list is inconsistent instead.
*/
#ifdef DDB
int
linker_ddb_lookup(const char *symstr, c_linker_sym_t *sym)
{
return (linker_debug_lookup(symstr, sym));
}
#endif
int
linker_ddb_search_symbol(caddr_t value, c_linker_sym_t *sym, long *diffp)
{
return (linker_debug_search_symbol(value, sym, diffp));
}
int
linker_ddb_symbol_values(c_linker_sym_t sym, linker_symval_t *symval)
{
return (linker_debug_symbol_values(sym, symval));
}
int
linker_ddb_search_symbol_name(caddr_t value, char *buf, u_int buflen,
long *offset)
{
return (linker_debug_search_symbol_name(value, buf, buflen, offset));
}
/*
* stack(9) helper for non-debugging environments. Unlike DDB helpers, we do
* obey locking protocols, and offer a significantly less complex interface.
*/
int
linker_search_symbol_name(caddr_t value, char *buf, u_int buflen,
long *offset)
{
int error;
KLD_LOCK();
error = linker_debug_search_symbol_name(value, buf, buflen, offset);
KLD_UNLOCK();
return (error);
}
/*
* Syscalls.
*/
int
kern_kldload(struct thread *td, const char *file, int *fileid)
{
#ifdef HWPMC_HOOKS
struct pmckern_map_in pkm;
#endif
const char *kldname, *modname;
linker_file_t lf;
int error;
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0)
return (error);
/*
* It is possible that a kldloaded module will attach a new ifnet,
* so the vnet context must be set when this occurs.
*/
CURVNET_SET(TD_TO_VNET(td));
/*
* If the file name does not contain a path or any dot in it
* (as in kldname.ko or kldname.ver.ko), treat it as an interface
* (module) name.
*/
if (index(file, '/') || index(file, '.')) {
kldname = file;
modname = NULL;
} else {
kldname = NULL;
modname = file;
}
KLD_LOCK();
error = linker_load_module(kldname, modname, NULL, NULL, &lf);
if (error) {
KLD_UNLOCK();
goto done;
}
lf->userrefs++;
if (fileid != NULL)
*fileid = lf->id;
#ifdef HWPMC_HOOKS
KLD_DOWNGRADE();
pkm.pm_file = lf->filename;
pkm.pm_address = (uintptr_t) lf->address;
PMC_CALL_HOOK(td, PMC_FN_KLD_LOAD, (void *) &pkm);
KLD_UNLOCK_READ();
#else
KLD_UNLOCK();
#endif
done:
CURVNET_RESTORE();
return (error);
}
int
-kldload(struct thread *td, struct kldload_args *uap)
+sys_kldload(struct thread *td, struct kldload_args *uap)
{
char *pathname = NULL;
int error, fileid;
td->td_retval[0] = -1;
pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL);
if (error == 0) {
error = kern_kldload(td, pathname, &fileid);
if (error == 0)
td->td_retval[0] = fileid;
}
free(pathname, M_TEMP);
return (error);
}
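/*
 * A hedged userland sketch (not part of this change): kldload(2) and
 * kldunload(2), declared in <sys/param.h> and <sys/linker.h> (plus
 * <err.h> for err()), accept either a module name or a path, mirroring
 * the name/path split in kern_kldload() above. "ipfw" is only an example
 * module name:
 *
 *	int fileid;
 *
 *	if ((fileid = kldload("ipfw")) == -1)
 *		err(1, "kldload");
 *	if (kldunload(fileid) == -1)
 *		err(1, "kldunload");
 */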
int
kern_kldunload(struct thread *td, int fileid, int flags)
{
#ifdef HWPMC_HOOKS
struct pmckern_map_out pkm;
#endif
linker_file_t lf;
int error = 0;
if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
return (error);
if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0)
return (error);
CURVNET_SET(TD_TO_VNET(td));
KLD_LOCK();
lf = linker_find_file_by_id(fileid);
if (lf) {
KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
/* Check if there are DTrace probes enabled on this file. */
if (lf->nenabled > 0) {
printf("kldunload: attempt to unload file that has"
" DTrace probes enabled\n");
error = EBUSY;
} else if (lf->userrefs == 0) {
/*
* XXX: maybe LINKER_UNLOAD_FORCE should override ?
*/
printf("kldunload: attempt to unload file that was"
" loaded by the kernel\n");
error = EBUSY;
} else {
#ifdef HWPMC_HOOKS
/* Save data needed by hwpmc(4) before unloading. */
pkm.pm_address = (uintptr_t) lf->address;
pkm.pm_size = lf->size;
#endif
lf->userrefs--;
error = linker_file_unload(lf, flags);
if (error)
lf->userrefs++;
}
} else
error = ENOENT;
#ifdef HWPMC_HOOKS
if (error == 0) {
KLD_DOWNGRADE();
PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm);
KLD_UNLOCK_READ();
} else
KLD_UNLOCK();
#else
KLD_UNLOCK();
#endif
CURVNET_RESTORE();
return (error);
}
int
-kldunload(struct thread *td, struct kldunload_args *uap)
+sys_kldunload(struct thread *td, struct kldunload_args *uap)
{
return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL));
}
int
-kldunloadf(struct thread *td, struct kldunloadf_args *uap)
+sys_kldunloadf(struct thread *td, struct kldunloadf_args *uap)
{
if (uap->flags != LINKER_UNLOAD_NORMAL &&
uap->flags != LINKER_UNLOAD_FORCE)
return (EINVAL);
return (kern_kldunload(td, uap->fileid, uap->flags));
}
int
-kldfind(struct thread *td, struct kldfind_args *uap)
+sys_kldfind(struct thread *td, struct kldfind_args *uap)
{
char *pathname;
const char *filename;
linker_file_t lf;
int error;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
td->td_retval[0] = -1;
pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0)
goto out;
filename = linker_basename(pathname);
KLD_LOCK();
lf = linker_find_file_by_name(filename);
if (lf)
td->td_retval[0] = lf->id;
else
error = ENOENT;
KLD_UNLOCK();
out:
free(pathname, M_TEMP);
return (error);
}
int
-kldnext(struct thread *td, struct kldnext_args *uap)
+sys_kldnext(struct thread *td, struct kldnext_args *uap)
{
linker_file_t lf;
int error = 0;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
KLD_LOCK();
if (uap->fileid == 0)
lf = TAILQ_FIRST(&linker_files);
else {
lf = linker_find_file_by_id(uap->fileid);
if (lf == NULL) {
error = ENOENT;
goto out;
}
lf = TAILQ_NEXT(lf, link);
}
/* Skip partially loaded files. */
while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED))
lf = TAILQ_NEXT(lf, link);
if (lf)
td->td_retval[0] = lf->id;
else
td->td_retval[0] = 0;
out:
KLD_UNLOCK();
return (error);
}
int
-kldstat(struct thread *td, struct kldstat_args *uap)
+sys_kldstat(struct thread *td, struct kldstat_args *uap)
{
struct kld_file_stat stat;
int error, version;
/*
* Check the version of the user's structure.
*/
if ((error = copyin(&uap->stat->version, &version, sizeof(version)))
!= 0)
return (error);
if (version != sizeof(struct kld_file_stat_1) &&
version != sizeof(struct kld_file_stat))
return (EINVAL);
error = kern_kldstat(td, uap->fileid, &stat);
if (error != 0)
return (error);
return (copyout(&stat, uap->stat, version));
}
int
kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat)
{
linker_file_t lf;
int namelen;
#ifdef MAC
int error;
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
KLD_LOCK();
lf = linker_find_file_by_id(fileid);
if (lf == NULL) {
KLD_UNLOCK();
return (ENOENT);
}
/* Version 1 fields: */
namelen = strlen(lf->filename) + 1;
if (namelen > MAXPATHLEN)
namelen = MAXPATHLEN;
bcopy(lf->filename, &stat->name[0], namelen);
stat->refs = lf->refs;
stat->id = lf->id;
stat->address = lf->address;
stat->size = lf->size;
/* Version 2 fields: */
namelen = strlen(lf->pathname) + 1;
if (namelen > MAXPATHLEN)
namelen = MAXPATHLEN;
bcopy(lf->pathname, &stat->pathname[0], namelen);
KLD_UNLOCK();
td->td_retval[0] = 0;
return (0);
}
int
-kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
+sys_kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
{
linker_file_t lf;
module_t mp;
int error = 0;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
KLD_LOCK();
lf = linker_find_file_by_id(uap->fileid);
if (lf) {
MOD_SLOCK;
mp = TAILQ_FIRST(&lf->modules);
if (mp != NULL)
td->td_retval[0] = module_getid(mp);
else
td->td_retval[0] = 0;
MOD_SUNLOCK;
} else
error = ENOENT;
KLD_UNLOCK();
return (error);
}
int
-kldsym(struct thread *td, struct kldsym_args *uap)
+sys_kldsym(struct thread *td, struct kldsym_args *uap)
{
char *symstr = NULL;
c_linker_sym_t sym;
linker_symval_t symval;
linker_file_t lf;
struct kld_sym_lookup lookup;
int error = 0;
#ifdef MAC
error = mac_kld_check_stat(td->td_ucred);
if (error)
return (error);
#endif
if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0)
return (error);
if (lookup.version != sizeof(lookup) ||
uap->cmd != KLDSYM_LOOKUP)
return (EINVAL);
symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0)
goto out;
KLD_LOCK();
if (uap->fileid != 0) {
lf = linker_find_file_by_id(uap->fileid);
if (lf == NULL)
error = ENOENT;
else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
lookup.symvalue = (uintptr_t) symval.value;
lookup.symsize = symval.size;
error = copyout(&lookup, uap->data, sizeof(lookup));
} else
error = ENOENT;
} else {
TAILQ_FOREACH(lf, &linker_files, link) {
if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
lookup.symvalue = (uintptr_t)symval.value;
lookup.symsize = symval.size;
error = copyout(&lookup, uap->data,
sizeof(lookup));
break;
}
}
if (lf == NULL)
error = ENOENT;
}
KLD_UNLOCK();
out:
free(symstr, M_TEMP);
return (error);
}
/*
* Preloaded module support
*/
static modlist_t
modlist_lookup(const char *name, int ver)
{
modlist_t mod;
TAILQ_FOREACH(mod, &found_modules, link) {
if (strcmp(mod->name, name) == 0 &&
(ver == 0 || mod->version == ver))
return (mod);
}
return (NULL);
}
static modlist_t
modlist_lookup2(const char *name, struct mod_depend *verinfo)
{
modlist_t mod, bestmod;
int ver;
if (verinfo == NULL)
return (modlist_lookup(name, 0));
bestmod = NULL;
TAILQ_FOREACH(mod, &found_modules, link) {
if (strcmp(mod->name, name) != 0)
continue;
ver = mod->version;
if (ver == verinfo->md_ver_preferred)
return (mod);
if (ver >= verinfo->md_ver_minimum &&
ver <= verinfo->md_ver_maximum &&
(bestmod == NULL || ver > bestmod->version))
bestmod = mod;
}
return (bestmod);
}
static modlist_t
modlist_newmodule(const char *modname, int version, linker_file_t container)
{
modlist_t mod;
mod = malloc(sizeof(struct modlist), M_LINKER, M_NOWAIT | M_ZERO);
if (mod == NULL)
panic("no memory for module list");
mod->container = container;
mod->name = modname;
mod->version = version;
TAILQ_INSERT_TAIL(&found_modules, mod, link);
return (mod);
}
static void
linker_addmodules(linker_file_t lf, struct mod_metadata **start,
struct mod_metadata **stop, int preload)
{
struct mod_metadata *mp, **mdp;
const char *modname;
int ver;
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_VERSION)
continue;
modname = mp->md_cval;
ver = ((struct mod_version *)mp->md_data)->mv_version;
if (modlist_lookup(modname, ver) != NULL) {
printf("module %s already present!\n", modname);
/* XXX what can we do? this is a build error. :-( */
continue;
}
modlist_newmodule(modname, ver, lf);
}
}
static void
linker_preload(void *arg)
{
caddr_t modptr;
const char *modname, *nmodname;
char *modtype;
linker_file_t lf, nlf;
linker_class_t lc;
int error;
linker_file_list_t loaded_files;
linker_file_list_t depended_files;
struct mod_metadata *mp, *nmp;
struct mod_metadata **start, **stop, **mdp, **nmdp;
struct mod_depend *verinfo;
int nver;
int resolves;
modlist_t mod;
struct sysinit **si_start, **si_stop;
TAILQ_INIT(&loaded_files);
TAILQ_INIT(&depended_files);
TAILQ_INIT(&found_modules);
error = 0;
modptr = NULL;
while ((modptr = preload_search_next_name(modptr)) != NULL) {
modname = (char *)preload_search_info(modptr, MODINFO_NAME);
modtype = (char *)preload_search_info(modptr, MODINFO_TYPE);
if (modname == NULL) {
printf("Preloaded module at %p does not have a"
" name!\n", modptr);
continue;
}
if (modtype == NULL) {
printf("Preloaded module at %p does not have a type!\n",
modptr);
continue;
}
if (bootverbose)
printf("Preloaded %s \"%s\" at %p.\n", modtype, modname,
modptr);
lf = NULL;
TAILQ_FOREACH(lc, &classes, link) {
error = LINKER_LINK_PRELOAD(lc, modname, &lf);
if (!error)
break;
lf = NULL;
}
if (lf)
TAILQ_INSERT_TAIL(&loaded_files, lf, loaded);
}
/*
* First get a list of stuff in the kernel.
*/
if (linker_file_lookup_set(linker_kernel_file, MDT_SETNAME, &start,
&stop, NULL) == 0)
linker_addmodules(linker_kernel_file, start, stop, 1);
/*
* This is a once-off kinky bubble sort to resolve relocation
* dependency requirements.
*/
restart:
TAILQ_FOREACH(lf, &loaded_files, loaded) {
error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
&stop, NULL);
/*
* First, look to see if we would successfully link with this
* stuff.
*/
resolves = 1; /* unless we know otherwise */
if (!error) {
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_DEPEND)
continue;
modname = mp->md_cval;
verinfo = mp->md_data;
for (nmdp = start; nmdp < stop; nmdp++) {
nmp = *nmdp;
if (nmp->md_type != MDT_VERSION)
continue;
nmodname = nmp->md_cval;
if (strcmp(modname, nmodname) == 0)
break;
}
if (nmdp < stop) /* it's a self reference */
continue;
/*
* ok, the module isn't here yet, we
* are not finished
*/
if (modlist_lookup2(modname, verinfo) == NULL)
resolves = 0;
}
}
/*
* OK, if we found our modules, we can link. So, "provide"
* the modules inside and add it to the end of the link order
* list.
*/
if (resolves) {
if (!error) {
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_VERSION)
continue;
modname = mp->md_cval;
nver = ((struct mod_version *)
mp->md_data)->mv_version;
if (modlist_lookup(modname,
nver) != NULL) {
printf("module %s already"
" present!\n", modname);
TAILQ_REMOVE(&loaded_files,
lf, loaded);
linker_file_unload(lf,
LINKER_UNLOAD_FORCE);
/* we changed tailq next ptr */
goto restart;
}
modlist_newmodule(modname, nver, lf);
}
}
TAILQ_REMOVE(&loaded_files, lf, loaded);
TAILQ_INSERT_TAIL(&depended_files, lf, loaded);
/*
* Since we provided modules, we need to restart the
* sort so that the previous files that depend on us
* have a chance. Also, we've busted the tailq next
* pointer with the REMOVE.
*/
goto restart;
}
}
/*
* At this point, we check to see what could not be resolved.
*/
while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) {
TAILQ_REMOVE(&loaded_files, lf, loaded);
printf("KLD file %s is missing dependencies\n", lf->filename);
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
}
/*
* We made it. Finish off the linking in the order we determined.
*/
TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) {
if (linker_kernel_file) {
linker_kernel_file->refs++;
error = linker_file_add_dependency(lf,
linker_kernel_file);
if (error)
panic("cannot add dependency");
}
lf->userrefs++; /* so we can (try to) kldunload it */
error = linker_file_lookup_set(lf, MDT_SETNAME, &start,
&stop, NULL);
if (!error) {
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_DEPEND)
continue;
modname = mp->md_cval;
verinfo = mp->md_data;
mod = modlist_lookup2(modname, verinfo);
if (mod == NULL) {
printf("KLD file %s - cannot find "
"dependency \"%s\"\n",
lf->filename, modname);
goto fail;
}
/* Don't count self-dependencies */
if (lf == mod->container)
continue;
mod->container->refs++;
error = linker_file_add_dependency(lf,
mod->container);
if (error)
panic("cannot add dependency");
}
}
/*
* Now do relocation etc using the symbol search paths
* established by the dependencies
*/
error = LINKER_LINK_PRELOAD_FINISH(lf);
if (error) {
printf("KLD file %s - could not finalize loading\n",
lf->filename);
goto fail;
}
linker_file_register_modules(lf);
if (linker_file_lookup_set(lf, "sysinit_set", &si_start,
&si_stop, NULL) == 0)
sysinit_add(si_start, si_stop);
linker_file_register_sysctls(lf);
lf->flags |= LINKER_FILE_LINKED;
continue;
fail:
TAILQ_REMOVE(&depended_files, lf, loaded);
linker_file_unload(lf, LINKER_UNLOAD_FORCE);
}
/* woohoo! we made it! */
}
SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0);
/*
* Search for a not-loaded module by name.
*
* Modules may be found in the following locations:
*
* - preloaded (result is just the module name)
* - on disk (result is full path to module)
*
* If the module name is qualified in any way (contains a path, etc.), then we
* simply return a copy of it.
*
* The search path can be manipulated via sysctl. Note that we use the ';'
* character as a separator to be consistent with the bootloader.
*/
static char linker_hintfile[] = "linker.hints";
static char linker_path[MAXPATHLEN] = "/boot/kernel;/boot/modules";
SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path,
sizeof(linker_path), "module load search path");
TUNABLE_STR("module_path", linker_path, sizeof(linker_path));
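/*
 * A minimal configuration sketch (not part of this change): the path above
 * is exposed as the kern.module_path sysctl and as the "module_path"
 * loader tunable, so it can be preset in loader.conf(5) or adjusted at
 * runtime, e.g.:
 *
 *	module_path="/boot/kernel;/boot/modules"	(loader.conf)
 *	# sysctl kern.module_path="/boot/kernel;/boot/modules"
 */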
static char *linker_ext_list[] = {
"",
".ko",
NULL
};
/*
* Check if the file actually exists, either with or without an extension
* listed in the linker_ext_list. (This should probably be generic for the
* rest of the kernel.)
*/
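/*
 * Illustrative example only: with the list above, a lookup of the
 * hypothetical name "if_foo" under "/boot/kernel" tries
 * "/boot/kernel/if_foo" first and then "/boot/kernel/if_foo.ko".
 */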
static char *
linker_lookup_file(const char *path, int pathlen, const char *name,
int namelen, struct vattr *vap)
{
struct nameidata nd;
struct thread *td = curthread; /* XXX */
char *result, **cpp, *sep;
int error, len, extlen, reclen, flags, vfslocked;
enum vtype type;
extlen = 0;
for (cpp = linker_ext_list; *cpp; cpp++) {
len = strlen(*cpp);
if (len > extlen)
extlen = len;
}
extlen++; /* trailing '\0' */
sep = (path[pathlen - 1] != '/') ? "/" : "";
reclen = pathlen + strlen(sep) + namelen + extlen + 1;
result = malloc(reclen, M_LINKER, M_WAITOK);
for (cpp = linker_ext_list; *cpp; cpp++) {
snprintf(result, reclen, "%.*s%s%.*s%s", pathlen, path, sep,
namelen, name, *cpp);
/*
* Attempt to open the file, and return the path if
* we succeed and it's a regular file.
*/
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, result, td);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error == 0) {
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
type = nd.ni_vp->v_type;
if (vap)
VOP_GETATTR(nd.ni_vp, vap, td->td_ucred);
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
if (type == VREG)
return (result);
}
}
free(result, M_LINKER);
return (NULL);
}
#define INT_ALIGN(base, ptr) ptr = \
(base) + (((ptr) - (base) + sizeof(int) - 1) & ~(sizeof(int) - 1))
/*
* Look up the KLD which contains the requested module in the "linker.hints"
* file. If a version specification is available, try to find the best KLD.
* Otherwise just find the latest one.
*/
static char *
linker_hints_lookup(const char *path, int pathlen, const char *modname,
int modnamelen, struct mod_depend *verinfo)
{
struct thread *td = curthread; /* XXX */
struct ucred *cred = td ? td->td_ucred : NULL;
struct nameidata nd;
struct vattr vattr, mattr;
u_char *hints = NULL;
u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep;
int error, ival, bestver, *intp, reclen, found, flags, clen, blen;
int vfslocked = 0;
result = NULL;
bestver = found = 0;
sep = (path[pathlen - 1] != '/') ? "/" : "";
reclen = imax(modnamelen, strlen(linker_hintfile)) + pathlen +
strlen(sep) + 1;
pathbuf = malloc(reclen, M_LINKER, M_WAITOK);
snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep,
linker_hintfile);
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, pathbuf, td);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
goto bad;
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp->v_type != VREG)
goto bad;
best = cp = NULL;
error = VOP_GETATTR(nd.ni_vp, &vattr, cred);
if (error)
goto bad;
/*
* XXX: we need to limit this number to some reasonable value
*/
if (vattr.va_size > 100 * 1024) {
printf("hints file too large %ld\n", (long)vattr.va_size);
goto bad;
}
hints = malloc(vattr.va_size, M_TEMP, M_WAITOK);
if (hints == NULL)
goto bad;
error = vn_rdwr(UIO_READ, nd.ni_vp, (caddr_t)hints, vattr.va_size, 0,
UIO_SYSSPACE, IO_NODELOCKED, cred, NOCRED, &reclen, td);
if (error)
goto bad;
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, cred, td);
VFS_UNLOCK_GIANT(vfslocked);
nd.ni_vp = NULL;
if (reclen != 0) {
printf("can't read %d\n", reclen);
goto bad;
}
intp = (int *)hints;
ival = *intp++;
if (ival != LINKER_HINTS_VERSION) {
printf("hints file version mismatch %d\n", ival);
goto bad;
}
bufend = hints + vattr.va_size;
recptr = (u_char *)intp;
clen = blen = 0;
while (recptr < bufend && !found) {
intp = (int *)recptr;
reclen = *intp++;
ival = *intp++;
cp = (char *)intp;
switch (ival) {
case MDT_VERSION:
clen = *cp++;
if (clen != modnamelen || bcmp(cp, modname, clen) != 0)
break;
cp += clen;
INT_ALIGN(hints, cp);
ival = *(int *)cp;
cp += sizeof(int);
clen = *cp++;
if (verinfo == NULL ||
ival == verinfo->md_ver_preferred) {
found = 1;
break;
}
if (ival >= verinfo->md_ver_minimum &&
ival <= verinfo->md_ver_maximum &&
ival > bestver) {
bestver = ival;
best = cp;
blen = clen;
}
break;
default:
break;
}
recptr += reclen + sizeof(int);
}
/*
* Finally, check whether the KLD is actually in place.
*/
if (found)
result = linker_lookup_file(path, pathlen, cp, clen, &mattr);
else if (best)
result = linker_lookup_file(path, pathlen, best, blen, &mattr);
/*
* The KLD is newer than the hints file. What should we do now?
*/
if (result && timespeccmp(&mattr.va_mtime, &vattr.va_mtime, >))
printf("warning: KLD '%s' is newer than the linker.hints"
" file\n", result);
bad:
free(pathbuf, M_LINKER);
if (hints)
free(hints, M_TEMP);
if (nd.ni_vp != NULL) {
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, cred, td);
VFS_UNLOCK_GIANT(vfslocked);
}
/*
* If nothing was found or the hints file is absent, fall back to the
* old way of using "kldname[.ko]" as the module name.
*/
if (!found && !bestver && result == NULL)
result = linker_lookup_file(path, pathlen, modname,
modnamelen, NULL);
return (result);
}
/*
* Look up the KLD which contains the requested module in all directories.
*/
static char *
linker_search_module(const char *modname, int modnamelen,
struct mod_depend *verinfo)
{
char *cp, *ep, *result;
/*
* traverse the linker path
*/
for (cp = linker_path; *cp; cp = ep + 1) {
/* find the end of this component */
for (ep = cp; (*ep != 0) && (*ep != ';'); ep++);
result = linker_hints_lookup(cp, ep - cp, modname,
modnamelen, verinfo);
if (result != NULL)
return (result);
if (*ep == 0)
break;
}
return (NULL);
}
/*
* Search for module in all directories listed in the linker_path.
*/
static char *
linker_search_kld(const char *name)
{
char *cp, *ep, *result;
int len;
/* qualified at all? */
if (index(name, '/'))
return (linker_strdup(name));
/* traverse the linker path */
len = strlen(name);
for (ep = linker_path; *ep; ep++) {
cp = ep;
/* find the end of this component */
for (; *ep != 0 && *ep != ';'; ep++);
result = linker_lookup_file(cp, ep - cp, name, len, NULL);
if (result != NULL)
return (result);
}
return (NULL);
}
static const char *
linker_basename(const char *path)
{
const char *filename;
filename = rindex(path, '/');
if (filename == NULL)
return path;
if (filename[1])
filename++;
return (filename);
}
#ifdef HWPMC_HOOKS
/*
* Inform hwpmc about the set of kernel modules currently loaded.
*/
void *
linker_hwpmc_list_objects(void)
{
linker_file_t lf;
struct pmckern_map_in *kobase;
int i, nmappings;
nmappings = 0;
KLD_LOCK_READ();
TAILQ_FOREACH(lf, &linker_files, link)
nmappings++;
/* Allocate nmappings + 1 entries. */
kobase = malloc((nmappings + 1) * sizeof(struct pmckern_map_in),
M_LINKER, M_WAITOK | M_ZERO);
i = 0;
TAILQ_FOREACH(lf, &linker_files, link) {
/* Save the info for this linker file. */
kobase[i].pm_file = lf->filename;
kobase[i].pm_address = (uintptr_t)lf->address;
i++;
}
KLD_UNLOCK_READ();
KASSERT(i > 0, ("linker_hpwmc_list_objects: no kernel objects?"));
/* The last entry of the malloc'ed area consists of all zeros. */
KASSERT(kobase[i].pm_file == NULL,
("linker_hwpmc_list_objects: last object not NULL"));
return ((void *)kobase);
}
#endif
/*
* Find a file which contains the given module and load it; if "parent" is not
* NULL, register a reference to it.
*/
static int
linker_load_module(const char *kldname, const char *modname,
struct linker_file *parent, struct mod_depend *verinfo,
struct linker_file **lfpp)
{
linker_file_t lfdep;
const char *filename;
char *pathname;
int error;
KLD_LOCK_ASSERT();
if (modname == NULL) {
/*
* We have to load KLD
*/
KASSERT(verinfo == NULL, ("linker_load_module: verinfo"
" is not NULL"));
pathname = linker_search_kld(kldname);
} else {
if (modlist_lookup2(modname, verinfo) != NULL)
return (EEXIST);
if (kldname != NULL)
pathname = linker_strdup(kldname);
else if (rootvnode == NULL)
pathname = NULL;
else
/*
* Need to find a KLD with required module
*/
pathname = linker_search_module(modname,
strlen(modname), verinfo);
}
if (pathname == NULL)
return (ENOENT);
/*
* Can't load more than one file with the same basename. XXX:
* Actually it should be possible to have multiple KLDs with
* the same basename but different path because they can
* provide different versions of the same modules.
*/
filename = linker_basename(pathname);
if (linker_find_file_by_name(filename))
error = EEXIST;
else do {
error = linker_load_file(pathname, &lfdep);
if (error)
break;
if (modname && verinfo &&
modlist_lookup2(modname, verinfo) == NULL) {
linker_file_unload(lfdep, LINKER_UNLOAD_FORCE);
error = ENOENT;
break;
}
if (parent) {
error = linker_file_add_dependency(parent, lfdep);
if (error)
break;
}
if (lfpp)
*lfpp = lfdep;
} while (0);
free(pathname, M_LINKER);
return (error);
}
/*
* This routine is responsible for finding dependencies of userland initiated
* kldload(2)'s of files.
*/
int
linker_load_dependencies(linker_file_t lf)
{
linker_file_t lfdep;
struct mod_metadata **start, **stop, **mdp, **nmdp;
struct mod_metadata *mp, *nmp;
struct mod_depend *verinfo;
modlist_t mod;
const char *modname, *nmodname;
int ver, error = 0, count;
/*
* All files are dependent on /kernel.
*/
KLD_LOCK_ASSERT();
if (linker_kernel_file) {
linker_kernel_file->refs++;
error = linker_file_add_dependency(lf, linker_kernel_file);
if (error)
return (error);
}
if (linker_file_lookup_set(lf, MDT_SETNAME, &start, &stop,
&count) != 0)
return (0);
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_VERSION)
continue;
modname = mp->md_cval;
ver = ((struct mod_version *)mp->md_data)->mv_version;
mod = modlist_lookup(modname, ver);
if (mod != NULL) {
printf("interface %s.%d already present in the KLD"
" '%s'!\n", modname, ver,
mod->container->filename);
return (EEXIST);
}
}
for (mdp = start; mdp < stop; mdp++) {
mp = *mdp;
if (mp->md_type != MDT_DEPEND)
continue;
modname = mp->md_cval;
verinfo = mp->md_data;
nmodname = NULL;
for (nmdp = start; nmdp < stop; nmdp++) {
nmp = *nmdp;
if (nmp->md_type != MDT_VERSION)
continue;
nmodname = nmp->md_cval;
if (strcmp(modname, nmodname) == 0)
break;
}
if (nmdp < stop)/* early exit, it's a self reference */
continue;
mod = modlist_lookup2(modname, verinfo);
if (mod) { /* woohoo, it's loaded already */
lfdep = mod->container;
lfdep->refs++;
error = linker_file_add_dependency(lf, lfdep);
if (error)
break;
continue;
}
error = linker_load_module(NULL, modname, lf, verinfo, NULL);
if (error) {
printf("KLD %s: depends on %s - not available or"
" version mismatch\n", lf->filename, modname);
break;
}
}
if (error)
return (error);
linker_addmodules(lf, start, stop, 0);
return (error);
}
static int
sysctl_kern_function_list_iterate(const char *name, void *opaque)
{
struct sysctl_req *req;
req = opaque;
return (SYSCTL_OUT(req, name, strlen(name) + 1));
}
/*
* Export a nul-separated, double-nul-terminated list of all function names
* in the kernel.
*/
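/*
 * A hedged userland sketch (not part of this change) of consuming this
 * sysctl with sysctl(3) (<sys/types.h>, <sys/sysctl.h>); names are
 * NUL-separated and the list ends with an empty string:
 *
 *	size_t len = 0;
 *	char *buf;
 *
 *	if (sysctlbyname("kern.function_list", NULL, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 *	buf = malloc(len);
 *	if (sysctlbyname("kern.function_list", buf, &len, NULL, 0) == -1)
 *		err(1, "sysctlbyname");
 */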
static int
sysctl_kern_function_list(SYSCTL_HANDLER_ARGS)
{
linker_file_t lf;
int error;
#ifdef MAC
error = mac_kld_check_stat(req->td->td_ucred);
if (error)
return (error);
#endif
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
KLD_LOCK();
TAILQ_FOREACH(lf, &linker_files, link) {
error = LINKER_EACH_FUNCTION_NAME(lf,
sysctl_kern_function_list_iterate, req);
if (error) {
KLD_UNLOCK();
return (error);
}
}
KLD_UNLOCK();
return (SYSCTL_OUT(req, "", 1));
}
SYSCTL_PROC(_kern, OID_AUTO, function_list, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_kern_function_list, "", "kernel function list");
Index: head/sys/kern/kern_loginclass.c
===================================================================
--- head/sys/kern/kern_loginclass.c (revision 225616)
+++ head/sys/kern/kern_loginclass.c (revision 225617)
@@ -1,238 +1,238 @@
/*-
* Copyright (c) 2011 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Edward Tomasz Napierala under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* Processes may set their login class name using setloginclass(2). This
* is usually done through a call to setusercontext(3), by programs
* such as login(1), based on information from master.passwd(5). The
* kernel uses this information to enforce per-class resource limits.
* The current login class can be determined using id(1). The login
* class is inherited from the parent process during fork(2). If not
* set, it defaults to "default".
*
* Code in this file implements setloginclass(2) and getloginclass(2)
* system calls, and maintains class name storage and retrieval.
*/
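/*
 * A hedged userland sketch (not part of this change): the class is
 * normally set indirectly via setusercontext(3), but the raw system
 * calls, declared in <unistd.h>, can also be used directly; "daemon" is
 * just an example class from login.conf(5):
 *
 *	char lc[MAXLOGNAME];
 *
 *	if (setloginclass("daemon") != 0)
 *		err(1, "setloginclass");
 *	if (getloginclass(lc, sizeof(lc)) != 0)
 *		err(1, "getloginclass");
 */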
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/types.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/racct.h>
#include <sys/refcount.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
static MALLOC_DEFINE(M_LOGINCLASS, "loginclass", "loginclass structures");
LIST_HEAD(, loginclass) loginclasses;
/*
* Lock protecting loginclasses list.
*/
static struct mtx loginclasses_lock;
static void lc_init(void);
SYSINIT(loginclass, SI_SUB_CPU, SI_ORDER_FIRST, lc_init, NULL);
void
loginclass_hold(struct loginclass *lc)
{
refcount_acquire(&lc->lc_refcount);
}
void
loginclass_free(struct loginclass *lc)
{
int old;
old = lc->lc_refcount;
if (old > 1 && atomic_cmpset_int(&lc->lc_refcount, old, old - 1))
return;
mtx_lock(&loginclasses_lock);
if (refcount_release(&lc->lc_refcount)) {
racct_destroy(&lc->lc_racct);
LIST_REMOVE(lc, lc_next);
mtx_unlock(&loginclasses_lock);
free(lc, M_LOGINCLASS);
return;
}
mtx_unlock(&loginclasses_lock);
}
/*
* Return loginclass structure with a corresponding name. Not
* performance critical, as it's used mainly by setloginclass(2),
* which happens once per login session. Caller has to use
* loginclass_free() on the returned value when it's no longer
* needed.
*/
struct loginclass *
loginclass_find(const char *name)
{
struct loginclass *lc, *newlc;
if (name[0] == '\0' || strlen(name) >= MAXLOGNAME)
return (NULL);
newlc = malloc(sizeof(*newlc), M_LOGINCLASS, M_ZERO | M_WAITOK);
racct_create(&newlc->lc_racct);
mtx_lock(&loginclasses_lock);
LIST_FOREACH(lc, &loginclasses, lc_next) {
if (strcmp(name, lc->lc_name) != 0)
continue;
/* Found loginclass with a matching name? */
loginclass_hold(lc);
mtx_unlock(&loginclasses_lock);
racct_destroy(&newlc->lc_racct);
free(newlc, M_LOGINCLASS);
return (lc);
}
/* Add new loginclass. */
strcpy(newlc->lc_name, name);
refcount_init(&newlc->lc_refcount, 1);
LIST_INSERT_HEAD(&loginclasses, newlc, lc_next);
mtx_unlock(&loginclasses_lock);
return (newlc);
}
/*
* Get login class name.
*/
#ifndef _SYS_SYSPROTO_H_
struct getloginclass_args {
char *namebuf;
size_t namelen;
};
#endif
/* ARGSUSED */
int
-getloginclass(struct thread *td, struct getloginclass_args *uap)
+sys_getloginclass(struct thread *td, struct getloginclass_args *uap)
{
int error = 0;
size_t lcnamelen;
struct proc *p;
struct loginclass *lc;
p = td->td_proc;
PROC_LOCK(p);
lc = p->p_ucred->cr_loginclass;
loginclass_hold(lc);
PROC_UNLOCK(p);
lcnamelen = strlen(lc->lc_name) + 1;
if (lcnamelen > uap->namelen)
error = ERANGE;
if (error == 0)
error = copyout(lc->lc_name, uap->namebuf, lcnamelen);
loginclass_free(lc);
return (error);
}
/*
* Set login class name.
*/
#ifndef _SYS_SYSPROTO_H_
struct setloginclass_args {
const char *namebuf;
};
#endif
/* ARGSUSED */
int
-setloginclass(struct thread *td, struct setloginclass_args *uap)
+sys_setloginclass(struct thread *td, struct setloginclass_args *uap)
{
struct proc *p = td->td_proc;
int error;
char lcname[MAXLOGNAME];
struct loginclass *newlc;
struct ucred *newcred, *oldcred;
error = priv_check(td, PRIV_PROC_SETLOGINCLASS);
if (error != 0)
return (error);
error = copyinstr(uap->namebuf, lcname, sizeof(lcname), NULL);
if (error != 0)
return (error);
newlc = loginclass_find(lcname);
if (newlc == NULL)
return (EINVAL);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
newcred->cr_loginclass = newlc;
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
loginclass_free(oldcred->cr_loginclass);
crfree(oldcred);
return (0);
}
void
loginclass_racct_foreach(void (*callback)(struct racct *racct,
void *arg2, void *arg3), void *arg2, void *arg3)
{
struct loginclass *lc;
mtx_lock(&loginclasses_lock);
LIST_FOREACH(lc, &loginclasses, lc_next)
(callback)(lc->lc_racct, arg2, arg3);
mtx_unlock(&loginclasses_lock);
}
static void
lc_init(void)
{
mtx_init(&loginclasses_lock, "loginclasses lock", NULL, MTX_DEF);
}
Index: head/sys/kern/kern_module.c
===================================================================
--- head/sys/kern/kern_module.c (revision 225616)
+++ head/sys/kern/kern_module.c (revision 225617)
@@ -1,523 +1,523 @@
/*-
* Copyright (c) 1997 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_compat.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/reboot.h>
#include <sys/sx.h>
#include <sys/module.h>
#include <sys/linker.h>
static MALLOC_DEFINE(M_MODULE, "module", "module data structures");
struct module {
TAILQ_ENTRY(module) link; /* chain together all modules */
TAILQ_ENTRY(module) flink; /* all modules in a file */
struct linker_file *file; /* file which contains this module */
int refs; /* reference count */
int id; /* unique id number */
char *name; /* module name */
modeventhand_t handler; /* event handler */
void *arg; /* argument for handler */
modspecific_t data; /* module specific data */
};
#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg)
static TAILQ_HEAD(modulelist, module) modules;
struct sx modules_sx;
static int nextid = 1;
static void module_shutdown(void *, int);
static int
modevent_nop(module_t mod, int what, void *arg)
{
switch(what) {
case MOD_LOAD:
return (0);
case MOD_UNLOAD:
return (EBUSY);
default:
return (EOPNOTSUPP);
}
}
static void
module_init(void *arg)
{
sx_init(&modules_sx, "module subsystem sx lock");
TAILQ_INIT(&modules);
EVENTHANDLER_REGISTER(shutdown_final, module_shutdown, NULL,
SHUTDOWN_PRI_DEFAULT);
}
SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0);
static void
module_shutdown(void *arg1, int arg2)
{
module_t mod;
if (arg2 & RB_NOSYNC)
return;
mtx_lock(&Giant);
MOD_SLOCK;
TAILQ_FOREACH_REVERSE(mod, &modules, modulelist, link)
MOD_EVENT(mod, MOD_SHUTDOWN);
MOD_SUNLOCK;
mtx_unlock(&Giant);
}
void
module_register_init(const void *arg)
{
const moduledata_t *data = (const moduledata_t *)arg;
int error;
module_t mod;
mtx_lock(&Giant);
MOD_SLOCK;
mod = module_lookupbyname(data->name);
if (mod == NULL)
panic("module_register_init: module named %s not found\n",
data->name);
MOD_SUNLOCK;
error = MOD_EVENT(mod, MOD_LOAD);
if (error) {
MOD_EVENT(mod, MOD_UNLOAD);
MOD_XLOCK;
module_release(mod);
MOD_XUNLOCK;
printf("module_register_init: MOD_LOAD (%s, %p, %p) error"
" %d\n", data->name, (void *)data->evhand, data->priv,
error);
} else {
MOD_XLOCK;
if (mod->file) {
/*
* Once a module is successfully loaded, move
* it to the head of the module list for this
* linker file. This resorts the list so that
* when the kernel linker iterates over the
* modules to unload them, it will unload them
* in the reverse order they were loaded.
*/
TAILQ_REMOVE(&mod->file->modules, mod, flink);
TAILQ_INSERT_HEAD(&mod->file->modules, mod, flink);
}
MOD_XUNLOCK;
}
mtx_unlock(&Giant);
}
int
module_register(const moduledata_t *data, linker_file_t container)
{
size_t namelen;
module_t newmod;
MOD_XLOCK;
newmod = module_lookupbyname(data->name);
if (newmod != NULL) {
MOD_XUNLOCK;
printf("module_register: module %s already exists!\n",
data->name);
return (EEXIST);
}
namelen = strlen(data->name) + 1;
newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK);
if (newmod == NULL) {
MOD_XUNLOCK;
return (ENOMEM);
}
newmod->refs = 1;
newmod->id = nextid++;
newmod->name = (char *)(newmod + 1);
strcpy(newmod->name, data->name);
newmod->handler = data->evhand ? data->evhand : modevent_nop;
newmod->arg = data->priv;
bzero(&newmod->data, sizeof(newmod->data));
TAILQ_INSERT_TAIL(&modules, newmod, link);
if (container)
TAILQ_INSERT_TAIL(&container->modules, newmod, flink);
newmod->file = container;
MOD_XUNLOCK;
return (0);
}
void
module_reference(module_t mod)
{
MOD_XLOCK_ASSERT;
MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs));
mod->refs++;
}
void
module_release(module_t mod)
{
MOD_XLOCK_ASSERT;
if (mod->refs <= 0)
panic("module_release: bad reference count");
MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs));
mod->refs--;
if (mod->refs == 0) {
TAILQ_REMOVE(&modules, mod, link);
if (mod->file)
TAILQ_REMOVE(&mod->file->modules, mod, flink);
free(mod, M_MODULE);
}
}
module_t
module_lookupbyname(const char *name)
{
module_t mod;
int err;
MOD_LOCK_ASSERT;
TAILQ_FOREACH(mod, &modules, link) {
err = strcmp(mod->name, name);
if (err == 0)
return (mod);
}
return (NULL);
}
module_t
module_lookupbyid(int modid)
{
module_t mod;
MOD_LOCK_ASSERT;
TAILQ_FOREACH(mod, &modules, link)
if (mod->id == modid)
return(mod);
return (NULL);
}
int
module_quiesce(module_t mod)
{
int error;
mtx_lock(&Giant);
error = MOD_EVENT(mod, MOD_QUIESCE);
mtx_unlock(&Giant);
if (error == EOPNOTSUPP || error == EINVAL)
error = 0;
return (error);
}
int
module_unload(module_t mod)
{
int error;
mtx_lock(&Giant);
error = MOD_EVENT(mod, MOD_UNLOAD);
mtx_unlock(&Giant);
return (error);
}
int
module_getid(module_t mod)
{
MOD_LOCK_ASSERT;
return (mod->id);
}
module_t
module_getfnext(module_t mod)
{
MOD_LOCK_ASSERT;
return (TAILQ_NEXT(mod, flink));
}
const char *
module_getname(module_t mod)
{
MOD_LOCK_ASSERT;
return (mod->name);
}
void
module_setspecific(module_t mod, modspecific_t *datap)
{
MOD_XLOCK_ASSERT;
mod->data = *datap;
}
linker_file_t
module_file(module_t mod)
{
return (mod->file);
}
/*
* Syscalls.
*/
int
-modnext(struct thread *td, struct modnext_args *uap)
+sys_modnext(struct thread *td, struct modnext_args *uap)
{
module_t mod;
int error = 0;
td->td_retval[0] = -1;
MOD_SLOCK;
if (uap->modid == 0) {
mod = TAILQ_FIRST(&modules);
if (mod)
td->td_retval[0] = mod->id;
else
error = ENOENT;
goto done2;
}
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
error = ENOENT;
goto done2;
}
if (TAILQ_NEXT(mod, link))
td->td_retval[0] = TAILQ_NEXT(mod, link)->id;
else
td->td_retval[0] = 0;
done2:
MOD_SUNLOCK;
return (error);
}
int
-modfnext(struct thread *td, struct modfnext_args *uap)
+sys_modfnext(struct thread *td, struct modfnext_args *uap)
{
module_t mod;
int error;
td->td_retval[0] = -1;
MOD_SLOCK;
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
error = ENOENT;
} else {
error = 0;
if (TAILQ_NEXT(mod, flink))
td->td_retval[0] = TAILQ_NEXT(mod, flink)->id;
else
td->td_retval[0] = 0;
}
MOD_SUNLOCK;
return (error);
}
struct module_stat_v1 {
int version; /* set to sizeof(struct module_stat) */
char name[MAXMODNAME];
int refs;
int id;
};
int
-modstat(struct thread *td, struct modstat_args *uap)
+sys_modstat(struct thread *td, struct modstat_args *uap)
{
module_t mod;
modspecific_t data;
int error = 0;
int id, namelen, refs, version;
struct module_stat *stat;
char *name;
MOD_SLOCK;
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
MOD_SUNLOCK;
return (ENOENT);
}
id = mod->id;
refs = mod->refs;
name = mod->name;
data = mod->data;
MOD_SUNLOCK;
stat = uap->stat;
/*
* Check the version of the user's structure.
*/
if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
return (error);
if (version != sizeof(struct module_stat_v1)
&& version != sizeof(struct module_stat))
return (EINVAL);
namelen = strlen(mod->name) + 1;
if (namelen > MAXMODNAME)
namelen = MAXMODNAME;
if ((error = copyout(name, &stat->name[0], namelen)) != 0)
return (error);
if ((error = copyout(&refs, &stat->refs, sizeof(int))) != 0)
return (error);
if ((error = copyout(&id, &stat->id, sizeof(int))) != 0)
return (error);
/*
* >v1 stat includes module data.
*/
if (version == sizeof(struct module_stat))
if ((error = copyout(&data, &stat->data,
sizeof(data))) != 0)
return (error);
td->td_retval[0] = 0;
return (error);
}
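/*
 * A hedged userland sketch (not part of this change): these syscalls are
 * reached through the wrappers declared in <sys/module.h>, e.g. walking
 * every loaded module much as kldstat(8) does:
 *
 *	struct module_stat ms;
 *	int modid;
 *
 *	ms.version = sizeof(ms);
 *	for (modid = modnext(0); modid > 0; modid = modnext(modid))
 *		if (modstat(modid, &ms) == 0)
 *			printf("%3d %s\n", ms.id, ms.name);
 */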
int
-modfind(struct thread *td, struct modfind_args *uap)
+sys_modfind(struct thread *td, struct modfind_args *uap)
{
int error = 0;
char name[MAXMODNAME];
module_t mod;
if ((error = copyinstr(uap->name, name, sizeof name, 0)) != 0)
return (error);
MOD_SLOCK;
mod = module_lookupbyname(name);
if (mod == NULL)
error = ENOENT;
else
td->td_retval[0] = module_getid(mod);
MOD_SUNLOCK;
return (error);
}
MODULE_VERSION(kernel, __FreeBSD_version);
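/*
 * Editor's illustrative sketch, not part of the original source: a minimal
 * userland walk over the module list using the modnext(2)/modstat(2)
 * syscalls renamed above, assuming the wrappers declared in <sys/module.h>.
 * Error handling is abbreviated.
 *
 *	#include <sys/param.h>
 *	#include <sys/module.h>
 *	#include <stdio.h>
 *
 *	struct module_stat ms;
 *	int id;
 *
 *	ms.version = sizeof(ms);	// version is keyed on structure size
 *	for (id = modnext(0); id > 0; id = modnext(id)) {
 *		if (modstat(id, &ms) == 0)
 *			printf("%d: %s (refs %d)\n", ms.id, ms.name, ms.refs);
 *	}
 */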
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
typedef union modspecific32 {
int intval;
uint32_t uintval;
int longval;
uint32_t ulongval;
} modspecific32_t;
struct module_stat32 {
int version;
char name[MAXMODNAME];
int refs;
int id;
modspecific32_t data;
};
int
freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap)
{
module_t mod;
modspecific32_t data32;
int error = 0;
int id, namelen, refs, version;
struct module_stat32 *stat32;
char *name;
MOD_SLOCK;
mod = module_lookupbyid(uap->modid);
if (mod == NULL) {
MOD_SUNLOCK;
return (ENOENT);
}
id = mod->id;
refs = mod->refs;
name = mod->name;
CP(mod->data, data32, intval);
CP(mod->data, data32, uintval);
CP(mod->data, data32, longval);
CP(mod->data, data32, ulongval);
MOD_SUNLOCK;
stat32 = uap->stat;
if ((error = copyin(&stat32->version, &version, sizeof(version))) != 0)
return (error);
if (version != sizeof(struct module_stat_v1)
&& version != sizeof(struct module_stat32))
return (EINVAL);
namelen = strlen(mod->name) + 1;
if (namelen > MAXMODNAME)
namelen = MAXMODNAME;
if ((error = copyout(name, &stat32->name[0], namelen)) != 0)
return (error);
if ((error = copyout(&refs, &stat32->refs, sizeof(int))) != 0)
return (error);
if ((error = copyout(&id, &stat32->id, sizeof(int))) != 0)
return (error);
/*
* >v1 stat includes module data.
*/
if (version == sizeof(struct module_stat32))
if ((error = copyout(&data32, &stat32->data,
sizeof(data32))) != 0)
return (error);
td->td_retval[0] = 0;
return (error);
}
#endif
Index: head/sys/kern/kern_ntptime.c
===================================================================
--- head/sys/kern/kern_ntptime.c (revision 225616)
+++ head/sys/kern/kern_ntptime.c (revision 225617)
@@ -1,1044 +1,1044 @@
/*-
***********************************************************************
* *
* Copyright (c) David L. Mills 1993-2001 *
* *
* Permission to use, copy, modify, and distribute this software and *
* its documentation for any purpose and without fee is hereby *
* granted, provided that the above copyright notice appears in all *
* copies and that both the copyright notice and this permission *
* notice appear in supporting documentation, and that the name *
* University of Delaware not be used in advertising or publicity *
* pertaining to distribution of the software without specific, *
* written prior permission. The University of Delaware makes no *
* representations about the suitability of this software for any *
* purpose. It is provided "as is" without express or implied *
* warranty. *
* *
**********************************************************************/
/*
* Adapted from the original sources for FreeBSD and timecounters by:
* Poul-Henning Kamp <phk@FreeBSD.org>.
*
* The 32bit version of the "LP" macros seems a bit past its "sell by"
* date so I have retained only the 64bit version and included it directly
* in this file.
*
* Only minor changes done to interface with the timecounters over in
* sys/kern/kern_clock.c. Some of the comments below may be (even more)
* confusing and/or plain wrong in that context.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ntp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/time.h>
#include <sys/timex.h>
#include <sys/timetc.h>
#include <sys/timepps.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#ifdef PPS_SYNC
FEATURE(pps_sync, "Support usage of external PPS signal by kernel PLL");
#endif
/*
* Single-precision macros for 64-bit machines
*/
typedef int64_t l_fp;
#define L_ADD(v, u) ((v) += (u))
#define L_SUB(v, u) ((v) -= (u))
#define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32)
#define L_NEG(v) ((v) = -(v))
#define L_RSHIFT(v, n) \
do { \
if ((v) < 0) \
(v) = -(-(v) >> (n)); \
else \
(v) = (v) >> (n); \
} while (0)
#define L_MPY(v, a) ((v) *= (a))
#define L_CLR(v) ((v) = 0)
#define L_ISNEG(v) ((v) < 0)
#define L_LINT(v, a) ((v) = (int64_t)(a) << 32)
#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
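/*
 * Editor's note, illustrative only: a minimal worked example of the
 * fixed-point format the macros above implement.  The integer part lives
 * in the upper 32 bits and the fraction in the lower 32 bits, so:
 *
 *	l_fp v;
 *	L_LINT(v, 3);		// v = 3 << 32, i.e. exactly 3 (ns or ns/s)
 *	L_ADDHI(v, 2);		// integer part is now 5
 *	L_RSHIFT(v, 1);		// halve: 2.5 in fixed point
 *	(void)L_GINT(v);	// recovers the integer part, here 2
 *
 * L_GINT and L_RSHIFT both negate before shifting a negative value, so
 * results round toward zero instead of toward minus infinity.
 */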
/*
* Generic NTP kernel interface
*
* These routines constitute the Network Time Protocol (NTP) interfaces
* for user and daemon application programs. The ntp_gettime() routine
* provides the time, maximum error (synch distance) and estimated error
* (dispersion) to client user application programs. The ntp_adjtime()
* routine is used by the NTP daemon to adjust the system clock to an
* externally derived time. The time offset and related variables set by
* this routine are used by other routines in this module to adjust the
* phase and frequency of the clock discipline loop which controls the
* system clock.
*
* When the kernel time is reckoned directly in nanoseconds (NTP_NANO
* defined), the time at each tick interrupt is derived directly from
* the kernel time variable. When the kernel time is reckoned in
* microseconds, (NTP_NANO undefined), the time is derived from the
* kernel time variable together with a variable representing the
* leftover nanoseconds at the last tick interrupt. In either case, the
* current nanosecond time is reckoned from these values plus an
* interpolated value derived by the clock routines in another
* architecture-specific module. The interpolation can use either a
* dedicated counter or a processor cycle counter (PCC) implemented in
* some architectures.
*
* Note that all routines must run at priority splclock or higher.
*/
/*
* Phase/frequency-lock loop (PLL/FLL) definitions
*
* The nanosecond clock discipline uses two variable types, time
* variables and frequency variables. Both types are represented as 64-
* bit fixed-point quantities with the decimal point between two 32-bit
* halves. On a 32-bit machine, each half is represented as a single
* word and mathematical operations are done using multiple-precision
* arithmetic. On a 64-bit machine, ordinary computer arithmetic is
* used.
*
* A time variable is a signed 64-bit fixed-point number in ns and
* fraction. It represents the remaining time offset to be amortized
* over succeeding tick interrupts. The maximum time offset is about
* 0.5 s and the resolution is about 2.3e-10 ns.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s| ns |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* A frequency variable is a signed 64-bit fixed-point number in ns/s
* and fraction. It represents the ns and fraction to be added to the
* kernel time variable at each second. The maximum frequency offset is
* about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s s s s s s s s s s s| ns/s |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
/*
* The following variables establish the state of the PLL/FLL and the
* residual time and frequency offset of the local clock.
*/
#define SHIFT_PLL 4 /* PLL loop gain (shift) */
#define SHIFT_FLL 2 /* FLL loop gain (shift) */
static int time_state = TIME_OK; /* clock state */
static int time_status = STA_UNSYNC; /* clock status bits */
static long time_tai; /* TAI offset (s) */
static long time_monitor; /* last time offset scaled (ns) */
static long time_constant; /* poll interval (shift) (s) */
static long time_precision = 1; /* clock precision (ns) */
static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
static long time_reftime; /* time at last adjustment (s) */
static l_fp time_offset; /* time offset (ns) */
static l_fp time_freq; /* frequency offset (ns/s) */
static l_fp time_adj; /* tick adjust (ns/s) */
static int64_t time_adjtime; /* correction from adjtime(2) (usec) */
#ifdef PPS_SYNC
/*
* The following variables are used when a pulse-per-second (PPS) signal
* is available and connected via a modem control lead. They establish
* the engineering parameters of the clock discipline loop when
* controlled by the PPS signal.
*/
#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
#define PPS_VALID 120 /* PPS signal watchdog max (s) */
#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
static struct timespec pps_tf[3]; /* phase median filter */
static l_fp pps_freq; /* scaled frequency offset (ns/s) */
static long pps_fcount; /* frequency accumulator */
static long pps_jitter; /* nominal jitter (ns) */
static long pps_stabil; /* nominal stability (scaled ns/s) */
static long pps_lastsec; /* time at last calibration (s) */
static int pps_valid; /* signal watchdog counter */
static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
static int pps_intcnt; /* wander counter */
/*
* PPS signal quality monitors
*/
static long pps_calcnt; /* calibration intervals */
static long pps_jitcnt; /* jitter limit exceeded */
static long pps_stbcnt; /* stability limit exceeded */
static long pps_errcnt; /* calibration errors */
#endif /* PPS_SYNC */
/*
* End of phase/frequency-lock loop (PLL/FLL) definitions
*/
static void ntp_init(void);
static void hardupdate(long offset);
static void ntp_gettime1(struct ntptimeval *ntvp);
static int ntp_is_time_error(void);
static int
ntp_is_time_error(void)
{
/*
* Status word error decode. If any of these conditions occur,
* an error is returned, instead of the status word. Most
* applications will care only about the fact that the system clock
* may not be trusted, not about the details.
*
* Hardware or software error
*/
if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
/*
* PPS signal lost when either time or frequency synchronization
* requested
*/
(time_status & (STA_PPSFREQ | STA_PPSTIME) &&
!(time_status & STA_PPSSIGNAL)) ||
/*
* PPS jitter exceeded when time synchronization requested
*/
(time_status & STA_PPSTIME &&
time_status & STA_PPSJITTER) ||
/*
* PPS wander exceeded or calibration error when frequency
* synchronization requested
*/
(time_status & STA_PPSFREQ &&
time_status & (STA_PPSWANDER | STA_PPSERROR)))
return (1);
return (0);
}
static void
ntp_gettime1(struct ntptimeval *ntvp)
{
struct timespec atv; /* nanosecond time */
GIANT_REQUIRED;
nanotime(&atv);
ntvp->time.tv_sec = atv.tv_sec;
ntvp->time.tv_nsec = atv.tv_nsec;
ntvp->maxerror = time_maxerror;
ntvp->esterror = time_esterror;
ntvp->tai = time_tai;
ntvp->time_state = time_state;
if (ntp_is_time_error())
ntvp->time_state = TIME_ERROR;
}
/*
* ntp_gettime() - NTP user application interface
*
* See the timex.h header file for synopsis and API description. Note that
* the TAI offset is returned in the ntptimeval.tai structure member.
*/
#ifndef _SYS_SYSPROTO_H_
struct ntp_gettime_args {
struct ntptimeval *ntvp;
};
#endif
/* ARGSUSED */
int
-ntp_gettime(struct thread *td, struct ntp_gettime_args *uap)
+sys_ntp_gettime(struct thread *td, struct ntp_gettime_args *uap)
{
struct ntptimeval ntv;
mtx_lock(&Giant);
ntp_gettime1(&ntv);
mtx_unlock(&Giant);
td->td_retval[0] = ntv.time_state;
return (copyout(&ntv, uap->ntvp, sizeof(ntv)));
}
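/*
 * Editor's illustrative sketch (not part of the original source): the
 * matching userland call, assuming the ntp_gettime() declaration from
 * <sys/timex.h>.  The return value is the clock state, mirroring
 * td_retval[0] above.
 *
 *	#include <sys/timex.h>
 *
 *	struct ntptimeval ntv;
 *	int state = ntp_gettime(&ntv);
 *	// ntv.maxerror and ntv.esterror are in microseconds;
 *	// ntv.tai is the current TAI-UTC offset in seconds.
 */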
static int
ntp_sysctl(SYSCTL_HANDLER_ARGS)
{
struct ntptimeval ntv; /* temporary structure */
ntp_gettime1(&ntv);
return (sysctl_handle_opaque(oidp, &ntv, sizeof(ntv), req));
}
SYSCTL_NODE(_kern, OID_AUTO, ntp_pll, CTLFLAG_RW, 0, "");
SYSCTL_PROC(_kern_ntp_pll, OID_AUTO, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
0, sizeof(struct ntptimeval), ntp_sysctl, "S,ntptimeval", "");
#ifdef PPS_SYNC
SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shiftmax, CTLFLAG_RW, &pps_shiftmax, 0, "");
SYSCTL_INT(_kern_ntp_pll, OID_AUTO, pps_shift, CTLFLAG_RW, &pps_shift, 0, "");
SYSCTL_LONG(_kern_ntp_pll, OID_AUTO, time_monitor, CTLFLAG_RD,
&time_monitor, 0, "");
SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", "");
SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", "");
#endif
/*
* ntp_adjtime() - NTP daemon application interface
*
* See the timex.h header file for synopsis and API description. Note that
* the timex.constant structure member has a dual purpose to set the time
* constant and to set the TAI offset.
*/
#ifndef _SYS_SYSPROTO_H_
struct ntp_adjtime_args {
struct timex *tp;
};
#endif
int
-ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
+sys_ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
{
struct timex ntv; /* temporary structure */
long freq; /* frequency (ns/s) */
int modes; /* mode bits from structure */
int s; /* caller priority */
int error;
error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
if (error)
return(error);
/*
* Update selected clock variables - only the superuser can
* change anything. Note that there is no error checking here on
* the assumption the superuser should know what it is doing.
* Note that either the time constant or TAI offset are loaded
* from the ntv.constant member, depending on the mode bits. If
* the STA_PLL bit in the status word is cleared, the state and
* status words are reset to the initial values at boot.
*/
mtx_lock(&Giant);
modes = ntv.modes;
if (modes)
error = priv_check(td, PRIV_NTP_ADJTIME);
if (error)
goto done2;
s = splclock();
if (modes & MOD_MAXERROR)
time_maxerror = ntv.maxerror;
if (modes & MOD_ESTERROR)
time_esterror = ntv.esterror;
if (modes & MOD_STATUS) {
if (time_status & STA_PLL && !(ntv.status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
#ifdef PPS_SYNC
pps_shift = PPS_FAVG;
#endif /* PPS_SYNC */
}
time_status &= STA_RONLY;
time_status |= ntv.status & ~STA_RONLY;
}
if (modes & MOD_TIMECONST) {
if (ntv.constant < 0)
time_constant = 0;
else if (ntv.constant > MAXTC)
time_constant = MAXTC;
else
time_constant = ntv.constant;
}
if (modes & MOD_TAI) {
if (ntv.constant > 0) /* XXX zero & negative numbers ? */
time_tai = ntv.constant;
}
#ifdef PPS_SYNC
if (modes & MOD_PPSMAX) {
if (ntv.shift < PPS_FAVG)
pps_shiftmax = PPS_FAVG;
else if (ntv.shift > PPS_FAVGMAX)
pps_shiftmax = PPS_FAVGMAX;
else
pps_shiftmax = ntv.shift;
}
#endif /* PPS_SYNC */
if (modes & MOD_NANO)
time_status |= STA_NANO;
if (modes & MOD_MICRO)
time_status &= ~STA_NANO;
if (modes & MOD_CLKB)
time_status |= STA_CLK;
if (modes & MOD_CLKA)
time_status &= ~STA_CLK;
if (modes & MOD_FREQUENCY) {
freq = (ntv.freq * 1000LL) >> 16;
if (freq > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (freq < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
else {
/*
* ntv.freq is [PPM * 2^16] = [us/s * 2^16]
* time_freq is [ns/s * 2^32]
*/
time_freq = ntv.freq * 1000LL * 65536LL;
}
#ifdef PPS_SYNC
pps_freq = time_freq;
#endif /* PPS_SYNC */
}
if (modes & MOD_OFFSET) {
if (time_status & STA_NANO)
hardupdate(ntv.offset);
else
hardupdate(ntv.offset * 1000);
}
/*
* Retrieve all clock variables. Note that the TAI offset is
* returned only by ntp_gettime().
*/
if (time_status & STA_NANO)
ntv.offset = L_GINT(time_offset);
else
ntv.offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
ntv.freq = L_GINT((time_freq / 1000LL) << 16);
ntv.maxerror = time_maxerror;
ntv.esterror = time_esterror;
ntv.status = time_status;
ntv.constant = time_constant;
if (time_status & STA_NANO)
ntv.precision = time_precision;
else
ntv.precision = time_precision / 1000;
ntv.tolerance = MAXFREQ * SCALE_PPM;
#ifdef PPS_SYNC
ntv.shift = pps_shift;
ntv.ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
if (time_status & STA_NANO)
ntv.jitter = pps_jitter;
else
ntv.jitter = pps_jitter / 1000;
ntv.stabil = pps_stabil;
ntv.calcnt = pps_calcnt;
ntv.errcnt = pps_errcnt;
ntv.jitcnt = pps_jitcnt;
ntv.stbcnt = pps_stbcnt;
#endif /* PPS_SYNC */
splx(s);
error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
if (error)
goto done2;
if (ntp_is_time_error())
td->td_retval[0] = TIME_ERROR;
else
td->td_retval[0] = time_state;
done2:
mtx_unlock(&Giant);
return (error);
}
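/*
 * Editor's worked example of the MOD_FREQUENCY scaling above, illustrative
 * only.  ntv.freq arrives as PPM scaled by 2^16 (us/s * 2^16) while
 * time_freq is kept as ns/s * 2^32:
 *
 *	requested drift: +10 PPM
 *	ntv.freq  = 10 * 65536            = 655360
 *	time_freq = 655360 * 1000 * 65536 = 10000 ns/s * 2^32
 *
 * Multiplying by 1000 converts us to ns and the extra 65536 completes the
 * move from 2^16 to 2^32 scaling; reading the value back reverses both
 * steps via L_GINT((time_freq / 1000LL) << 16).
 */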
/*
* second_overflow() - called after ntp_tick_adjust()
*
* This routine is ordinarily called immediately following the above
* routine ntp_tick_adjust(). While these two routines are normally
* combined, they are separated here only for the purposes of
* simulation.
*/
void
ntp_update_second(int64_t *adjustment, time_t *newsec)
{
int tickrate;
l_fp ftemp; /* 32/64-bit temporary */
/*
* On rollover of the second both the nanosecond and microsecond
* clocks are updated and the state machine cranked as
* necessary. The phase adjustment to be used for the next
* second is calculated and the maximum error is increased by
* the tolerance.
*/
time_maxerror += MAXFREQ / 1000;
/*
* Leap second processing. If in leap-insert state at
* the end of the day, the system clock is set back one
* second; if in leap-delete state, the system clock is
* set ahead one second. The nano_time() routine or
* external clock driver will ensure that reported time
* is always monotonic.
*/
switch (time_state) {
/*
* No warning.
*/
case TIME_OK:
if (time_status & STA_INS)
time_state = TIME_INS;
else if (time_status & STA_DEL)
time_state = TIME_DEL;
break;
/*
* Insert second 23:59:60 following second
* 23:59:59.
*/
case TIME_INS:
if (!(time_status & STA_INS))
time_state = TIME_OK;
else if ((*newsec) % 86400 == 0) {
(*newsec)--;
time_state = TIME_OOP;
time_tai++;
}
break;
/*
* Delete second 23:59:59.
*/
case TIME_DEL:
if (!(time_status & STA_DEL))
time_state = TIME_OK;
else if (((*newsec) + 1) % 86400 == 0) {
(*newsec)++;
time_tai--;
time_state = TIME_WAIT;
}
break;
/*
* Insert second in progress.
*/
case TIME_OOP:
time_state = TIME_WAIT;
break;
/*
* Wait for status bits to clear.
*/
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
}
/*
* Compute the total time adjustment for the next second
* in ns. The offset is reduced by a factor depending on
* whether the PPS signal is operating. Note that the
* value is in effect scaled by the clock frequency,
* since the adjustment is added at each tick interrupt.
*/
ftemp = time_offset;
#ifdef PPS_SYNC
/* XXX even if PPS signal dies we should finish adjustment ? */
if (time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)
L_RSHIFT(ftemp, pps_shift);
else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#endif /* PPS_SYNC */
time_adj = ftemp;
L_SUB(time_offset, ftemp);
L_ADD(time_adj, time_freq);
/*
* Apply any correction from adjtime(2).  If we are more than one second
* off, slew at a rate of 5ms/s (5000 PPM); otherwise at 500us/s (500 PPM),
* until the last second is slewed the final < 500 usecs.
*/
if (time_adjtime != 0) {
if (time_adjtime > 1000000)
tickrate = 5000;
else if (time_adjtime < -1000000)
tickrate = -5000;
else if (time_adjtime > 500)
tickrate = 500;
else if (time_adjtime < -500)
tickrate = -500;
else
tickrate = time_adjtime;
time_adjtime -= tickrate;
L_LINT(ftemp, tickrate * 1000);
L_ADD(time_adj, ftemp);
}
*adjustment = time_adj;
#ifdef PPS_SYNC
if (pps_valid > 0)
pps_valid--;
else
time_status &= ~STA_PPSSIGNAL;
#endif /* PPS_SYNC */
}
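/*
 * Editor's worked example of the adjtime(2) slewing above, illustrative
 * only.  With a pending correction of +2.5 s:
 *
 *	time_adjtime = 2500000 us -> tickrate 5000 us this second (5000 PPM)
 *	... (repeated until the residue drops under one second) ...
 *	time_adjtime =  800000 us -> tickrate 500 us per second (500 PPM)
 *	time_adjtime =     300 us -> the final second slews the last 300 us
 *
 * Each second subtracts tickrate from time_adjtime and folds
 * tickrate * 1000 ns into time_adj, so the correction is amortized without
 * ever stepping the clock.
 */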
/*
* ntp_init() - initialize variables and structures
*
* This routine must be called after the kernel variables hz and tick
* are set or changed and before the next tick interrupt. In this
* particular implementation, these values are assumed set elsewhere in
* the kernel. The design allows the clock frequency and tick interval
* to be changed while the system is running. So, this routine should
* probably be integrated with the code that does that.
*/
static void
ntp_init()
{
/*
* The following variables are initialized only at startup. Only
* those structures not cleared by the compiler need to be
* initialized, and these only in the simulator. In the actual
* kernel, any nonzero values here will quickly evaporate.
*/
L_CLR(time_offset);
L_CLR(time_freq);
#ifdef PPS_SYNC
pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
pps_fcount = 0;
L_CLR(pps_freq);
#endif /* PPS_SYNC */
}
SYSINIT(ntpclocks, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, ntp_init, NULL);
/*
* hardupdate() - local clock update
*
* This routine is called by ntp_adjtime() to update the local clock
* phase and frequency. The implementation is of an adaptive-parameter,
* hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
* time and frequency offset estimates for each call. If the kernel PPS
* discipline code is configured (PPS_SYNC), the PPS signal itself
* determines the new time offset, instead of the calling argument.
* Presumably, calls to ntp_adjtime() occur only when the caller
* believes the local clock is valid within some bound (+-128 ms with
* NTP). If the caller's time is far different than the PPS time, an
* argument will ensue, and it's not clear who will lose.
*
* For uncompensated quartz crystal oscillators and nominal update
* intervals less than 256 s, operation should be in phase-lock mode,
* where the loop is disciplined to phase. For update intervals greater
* than 1024 s, operation should be in frequency-lock mode, where the
* loop is disciplined to frequency. Between 256 s and 1024 s, the mode
* is selected by the STA_MODE status bit.
*/
static void
hardupdate(offset)
long offset; /* clock offset (ns) */
{
long mtemp;
l_fp ftemp;
/*
* Select how the phase is to be controlled and from which
* source. If the PPS signal is present and enabled to
* discipline the time, the PPS offset is used; otherwise, the
* argument offset is used.
*/
if (!(time_status & STA_PLL))
return;
if (!(time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)) {
if (offset > MAXPHASE)
time_monitor = MAXPHASE;
else if (offset < -MAXPHASE)
time_monitor = -MAXPHASE;
else
time_monitor = offset;
L_LINT(time_offset, time_monitor);
}
/*
* Select how the frequency is to be controlled and in which
* mode (PLL or FLL). If the PPS signal is present and enabled
* to discipline the frequency, the PPS frequency is used;
* otherwise, the argument offset is used to compute it.
*/
if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
time_reftime = time_second;
return;
}
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = time_second;
mtemp = time_second - time_reftime;
L_LINT(ftemp, time_monitor);
L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
L_MPY(ftemp, mtemp);
L_ADD(time_freq, ftemp);
time_status &= ~STA_MODE;
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
MAXSEC)) {
L_LINT(ftemp, (time_monitor << 4) / mtemp);
L_RSHIFT(ftemp, SHIFT_FLL + 4);
L_ADD(time_freq, ftemp);
time_status |= STA_MODE;
}
time_reftime = time_second;
if (L_GINT(time_freq) > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (L_GINT(time_freq) < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
}
#ifdef PPS_SYNC
/*
* hardpps() - discipline CPU clock oscillator to external PPS signal
*
* This routine is called at each PPS interrupt in order to discipline
* the CPU clock oscillator to the PPS signal. There are two independent
* first-order feedback loops, one for the phase, the other for the
* frequency. The phase loop measures and grooms the PPS phase offset
* and leaves it in a handy spot for the seconds overflow routine. The
* frequency loop averages successive PPS phase differences and
* calculates the PPS frequency offset, which is also processed by the
* seconds overflow routine. The code requires the caller to capture the
* time and architecture-dependent hardware counter values in
* nanoseconds at the on-time PPS signal transition.
*
* Note that, on some Unix systems this routine runs at an interrupt
* priority level higher than the timer interrupt routine hardclock().
* Therefore, the variables used are distinct from the hardclock()
* variables, except for the actual time and frequency variables, which
* are determined by this routine and updated atomically.
*/
void
hardpps(tsp, nsec)
struct timespec *tsp; /* time at PPS */
long nsec; /* hardware counter at PPS */
{
long u_sec, u_nsec, v_nsec; /* temps */
l_fp ftemp;
/*
* The signal is first processed by a range gate and frequency
* discriminator. The range gate rejects noise spikes outside
* the range +-500 us. The frequency discriminator rejects input
* signals with apparent frequency outside the range 1 +-500
* PPM. If two hits occur in the same second, we ignore the
* later hit; if not and a hit occurs outside the range gate,
* keep the later hit for later comparison, but do not process
* it.
*/
time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
pps_valid = PPS_VALID;
u_sec = tsp->tv_sec;
u_nsec = tsp->tv_nsec;
if (u_nsec >= (NANOSECOND >> 1)) {
u_nsec -= NANOSECOND;
u_sec++;
}
v_nsec = u_nsec - pps_tf[0].tv_nsec;
if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
MAXFREQ)
return;
pps_tf[2] = pps_tf[1];
pps_tf[1] = pps_tf[0];
pps_tf[0].tv_sec = u_sec;
pps_tf[0].tv_nsec = u_nsec;
/*
* Compute the difference between the current and previous
* counter values. If the difference exceeds 0.5 s, assume it
* has wrapped around, so correct 1.0 s. If the result exceeds
* the tick interval, the sample point has crossed a tick
* boundary during the last second, so correct the tick. Very
* intricate.
*/
u_nsec = nsec;
if (u_nsec > (NANOSECOND >> 1))
u_nsec -= NANOSECOND;
else if (u_nsec < -(NANOSECOND >> 1))
u_nsec += NANOSECOND;
pps_fcount += u_nsec;
if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
return;
time_status &= ~STA_PPSJITTER;
/*
* A three-stage median filter is used to help denoise the PPS
* time. The median sample becomes the time offset estimate; the
* difference between the other two samples becomes the time
* dispersion (jitter) estimate.
*/
if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
} else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
}
} else {
if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
} else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
}
}
/*
* Nominal jitter is due to PPS signal noise and interrupt
* latency. If it exceeds the popcorn threshold, the sample is
* discarded; otherwise, if so enabled, the time offset is
* updated. We can tolerate a modest loss of data here without
* much degrading time accuracy.
*/
if (u_nsec > (pps_jitter << PPS_POPCORN)) {
time_status |= STA_PPSJITTER;
pps_jitcnt++;
} else if (time_status & STA_PPSTIME) {
time_monitor = -v_nsec;
L_LINT(time_offset, time_monitor);
}
pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
u_sec = pps_tf[0].tv_sec - pps_lastsec;
if (u_sec < (1 << pps_shift))
return;
/*
* At the end of the calibration interval the difference between
* the first and last counter values becomes the scaled
* frequency. It will later be divided by the length of the
* interval to determine the frequency update. If the frequency
* exceeds a sanity threshold, or if the actual calibration
* interval is not equal to the expected length, the data are
* discarded. We can tolerate a modest loss of data here without
* much degrading frequency accuracy.
*/
pps_calcnt++;
v_nsec = -pps_fcount;
pps_lastsec = pps_tf[0].tv_sec;
pps_fcount = 0;
u_nsec = MAXFREQ << pps_shift;
if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
pps_shift)) {
time_status |= STA_PPSERROR;
pps_errcnt++;
return;
}
/*
* Here the raw frequency offset and wander (stability) are
* calculated. If the wander is less than the wander threshold
* for four consecutive averaging intervals, the interval is
* doubled; if it is greater than the threshold for four
* consecutive intervals, the interval is halved. The scaled
* frequency offset is converted to frequency offset. The
* stability metric is calculated as the average of recent
* frequency changes, but is used only for performance
* monitoring.
*/
L_LINT(ftemp, v_nsec);
L_RSHIFT(ftemp, pps_shift);
L_SUB(ftemp, pps_freq);
u_nsec = L_GINT(ftemp);
if (u_nsec > PPS_MAXWANDER) {
L_LINT(ftemp, PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else if (u_nsec < -PPS_MAXWANDER) {
L_LINT(ftemp, -PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else {
pps_intcnt++;
}
if (pps_intcnt >= 4) {
pps_intcnt = 4;
if (pps_shift < pps_shiftmax) {
pps_shift++;
pps_intcnt = 0;
}
} else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
pps_intcnt = -4;
if (pps_shift > PPS_FAVG) {
pps_shift--;
pps_intcnt = 0;
}
}
if (u_nsec < 0)
u_nsec = -u_nsec;
pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
/*
* The PPS frequency is recalculated and clamped to the maximum
* MAXFREQ. If enabled, the system clock frequency is updated as
* well.
*/
L_ADD(pps_freq, ftemp);
u_nsec = L_GINT(pps_freq);
if (u_nsec > MAXFREQ)
L_LINT(pps_freq, MAXFREQ);
else if (u_nsec < -MAXFREQ)
L_LINT(pps_freq, -MAXFREQ);
if (time_status & STA_PPSFREQ)
time_freq = pps_freq;
}
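/*
 * Editor's worked example of the three-stage median filter above,
 * illustrative only.  Suppose the last three PPS phase samples are
 *
 *	pps_tf[0].tv_nsec = 120 (newest)
 *	pps_tf[1].tv_nsec =  80
 *	pps_tf[2].tv_nsec = 100
 *
 * pps_tf[0] > pps_tf[1], but neither pps_tf[1] > pps_tf[2] nor
 * pps_tf[2] > pps_tf[0] holds, so the "0 2 1" branch selects
 * v_nsec = 100 (the median) and u_nsec = 120 - 80 = 40 (the spread that
 * feeds the jitter estimate and the popcorn-spike test).
 */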
#endif /* PPS_SYNC */
#ifndef _SYS_SYSPROTO_H_
struct adjtime_args {
struct timeval *delta;
struct timeval *olddelta;
};
#endif
/* ARGSUSED */
int
-adjtime(struct thread *td, struct adjtime_args *uap)
+sys_adjtime(struct thread *td, struct adjtime_args *uap)
{
struct timeval delta, olddelta, *deltap;
int error;
if (uap->delta) {
error = copyin(uap->delta, &delta, sizeof(delta));
if (error)
return (error);
deltap = &delta;
} else
deltap = NULL;
error = kern_adjtime(td, deltap, &olddelta);
if (uap->olddelta && error == 0)
error = copyout(&olddelta, uap->olddelta, sizeof(olddelta));
return (error);
}
int
kern_adjtime(struct thread *td, struct timeval *delta, struct timeval *olddelta)
{
struct timeval atv;
int error;
mtx_lock(&Giant);
if (olddelta) {
atv.tv_sec = time_adjtime / 1000000;
atv.tv_usec = time_adjtime % 1000000;
if (atv.tv_usec < 0) {
atv.tv_usec += 1000000;
atv.tv_sec--;
}
*olddelta = atv;
}
if (delta) {
if ((error = priv_check(td, PRIV_ADJTIME))) {
mtx_unlock(&Giant);
return (error);
}
time_adjtime = (int64_t)delta->tv_sec * 1000000 +
delta->tv_usec;
}
mtx_unlock(&Giant);
return (0);
}
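/*
 * Editor's illustrative sketch (not part of the original source): the
 * userland side of the path above, assuming the adjtime(2) wrapper from
 * <sys/time.h> and sufficient privilege for the PRIV_ADJTIME check.
 *
 *	#include <sys/time.h>
 *	#include <err.h>
 *
 *	struct timeval delta = { 0, 500000 };	// +0.5 s, slewed not stepped
 *	struct timeval olddelta;
 *
 *	if (adjtime(&delta, &olddelta) != 0)
 *		err(1, "adjtime");
 *
 * olddelta reports whatever correction was still pending; the new delta is
 * stored in time_adjtime as (int64_t)tv_sec * 1000000 + tv_usec.
 */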
static struct callout resettodr_callout;
static int resettodr_period = 1800;
static void
periodic_resettodr(void *arg __unused)
{
if (!ntp_is_time_error()) {
mtx_lock(&Giant);
resettodr();
mtx_unlock(&Giant);
}
if (resettodr_period > 0)
callout_schedule(&resettodr_callout, resettodr_period * hz);
}
static void
shutdown_resettodr(void *arg __unused, int howto __unused)
{
callout_drain(&resettodr_callout);
if (resettodr_period > 0 && !ntp_is_time_error()) {
mtx_lock(&Giant);
resettodr();
mtx_unlock(&Giant);
}
}
static int
sysctl_resettodr_period(SYSCTL_HANDLER_ARGS)
{
int error;
error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req);
if (error || !req->newptr)
return (error);
if (resettodr_period == 0)
callout_stop(&resettodr_callout);
else
callout_reset(&resettodr_callout, resettodr_period * hz,
periodic_resettodr, NULL);
return (0);
}
SYSCTL_PROC(_machdep, OID_AUTO, rtc_save_period, CTLTYPE_INT|CTLFLAG_RW,
&resettodr_period, 1800, sysctl_resettodr_period, "I",
"Save system time to RTC with this period (in seconds)");
TUNABLE_INT("machdep.rtc_save_period", &resettodr_period);
static void
start_periodic_resettodr(void *arg __unused)
{
EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_resettodr, NULL,
SHUTDOWN_PRI_FIRST);
callout_init(&resettodr_callout, 1);
if (resettodr_period == 0)
return;
callout_reset(&resettodr_callout, resettodr_period * hz,
periodic_resettodr, NULL);
}
SYSINIT(periodic_resettodr, SI_SUB_RUN_SCHEDULER, SI_ORDER_MIDDLE,
start_periodic_resettodr, NULL);
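/*
 * Editor's usage note, illustrative only: the RTC save period above is
 * exposed both as a loader tunable and as a read/write sysctl, so it can
 * be set at boot or at run time, e.g.
 *
 *	machdep.rtc_save_period="900"		(in /boot/loader.conf)
 *	sysctl machdep.rtc_save_period=900	(at run time)
 *
 * Setting it to 0 stops the periodic callout via sysctl_resettodr_period().
 */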
Index: head/sys/kern/kern_proc.c
===================================================================
--- head/sys/kern/kern_proc.c (revision 225616)
+++ head/sys/kern/kern_proc.c (revision 225617)
@@ -1,2078 +1,2078 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_kstack_pages.h"
#include "opt_stack.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/sbuf.h>
#include <sys/sysent.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/stack.h>
#include <sys/sysctl.h>
#include <sys/filedesc.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/sdt.h>
#include <sys/sx.h>
#include <sys/user.h>
#include <sys/jail.h>
#include <sys/vnode.h>
#include <sys/eventhandler.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_util.h>
#endif
SDT_PROVIDER_DEFINE(proc);
SDT_PROBE_DEFINE(proc, kernel, ctor, entry, entry);
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 2, "void *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, entry, 3, "int");
SDT_PROBE_DEFINE(proc, kernel, ctor, return, return);
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 2, "void *");
SDT_PROBE_ARGTYPE(proc, kernel, ctor, return, 3, "int");
SDT_PROBE_DEFINE(proc, kernel, dtor, entry, entry);
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 2, "void *");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, entry, 3, "struct thread *");
SDT_PROBE_DEFINE(proc, kernel, dtor, return, return);
SDT_PROBE_ARGTYPE(proc, kernel, dtor, return, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, return, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, dtor, return, 2, "void *");
SDT_PROBE_DEFINE(proc, kernel, init, entry, entry);
SDT_PROBE_ARGTYPE(proc, kernel, init, entry, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, init, entry, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, init, entry, 2, "int");
SDT_PROBE_DEFINE(proc, kernel, init, return, return);
SDT_PROBE_ARGTYPE(proc, kernel, init, return, 0, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, init, return, 1, "int");
SDT_PROBE_ARGTYPE(proc, kernel, init, return, 2, "int");
MALLOC_DEFINE(M_PGRP, "pgrp", "process group header");
MALLOC_DEFINE(M_SESSION, "session", "session header");
static MALLOC_DEFINE(M_PROC, "proc", "Proc structures");
MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures");
static void doenterpgrp(struct proc *, struct pgrp *);
static void orphanpg(struct pgrp *pg);
static void fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp);
static void fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp);
static void fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp,
int preferthread);
static void pgadjustjobc(struct pgrp *pgrp, int entering);
static void pgdelete(struct pgrp *);
static int proc_ctor(void *mem, int size, void *arg, int flags);
static void proc_dtor(void *mem, int size, void *arg);
static int proc_init(void *mem, int size, int flags);
static void proc_fini(void *mem, int size);
static void pargs_free(struct pargs *pa);
/*
* Other process lists
*/
struct pidhashhead *pidhashtbl;
u_long pidhash;
struct pgrphashhead *pgrphashtbl;
u_long pgrphash;
struct proclist allproc;
struct proclist zombproc;
struct sx allproc_lock;
struct sx proctree_lock;
struct mtx ppeers_lock;
uma_zone_t proc_zone;
int kstack_pages = KSTACK_PAGES;
SYSCTL_INT(_kern, OID_AUTO, kstack_pages, CTLFLAG_RD, &kstack_pages, 0,
"Kernel stack size in pages");
CTASSERT(sizeof(struct kinfo_proc) == KINFO_PROC_SIZE);
#ifdef COMPAT_FREEBSD32
CTASSERT(sizeof(struct kinfo_proc32) == KINFO_PROC32_SIZE);
#endif
/*
* Initialize global process hashing structures.
*/
void
procinit()
{
sx_init(&allproc_lock, "allproc");
sx_init(&proctree_lock, "proctree");
mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
LIST_INIT(&allproc);
LIST_INIT(&zombproc);
pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash);
pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash);
proc_zone = uma_zcreate("PROC", sched_sizeof_proc(),
proc_ctor, proc_dtor, proc_init, proc_fini,
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uihashinit();
}
/*
* Prepare a proc for use.
*/
static int
proc_ctor(void *mem, int size, void *arg, int flags)
{
struct proc *p;
p = (struct proc *)mem;
SDT_PROBE(proc, kernel, ctor , entry, p, size, arg, flags, 0);
EVENTHANDLER_INVOKE(process_ctor, p);
SDT_PROBE(proc, kernel, ctor , return, p, size, arg, flags, 0);
return (0);
}
/*
* Reclaim a proc after use.
*/
static void
proc_dtor(void *mem, int size, void *arg)
{
struct proc *p;
struct thread *td;
/* INVARIANTS checks go here */
p = (struct proc *)mem;
td = FIRST_THREAD_IN_PROC(p);
SDT_PROBE(proc, kernel, dtor, entry, p, size, arg, td, 0);
if (td != NULL) {
#ifdef INVARIANTS
KASSERT((p->p_numthreads == 1),
("bad number of threads in exiting process"));
KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
#endif
/* Free all OSD associated to this thread. */
osd_thread_exit(td);
}
EVENTHANDLER_INVOKE(process_dtor, p);
if (p->p_ksi != NULL)
KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
SDT_PROBE(proc, kernel, dtor, return, p, size, arg, 0, 0);
}
/*
* Initialize type-stable parts of a proc (when newly created).
*/
static int
proc_init(void *mem, int size, int flags)
{
struct proc *p;
p = (struct proc *)mem;
SDT_PROBE(proc, kernel, init, entry, p, size, flags, 0, 0);
p->p_sched = (struct p_sched *)&p[1];
bzero(&p->p_mtx, sizeof(struct mtx));
mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
cv_init(&p->p_pwait, "ppwait");
cv_init(&p->p_dbgwait, "dbgwait");
TAILQ_INIT(&p->p_threads); /* all threads in proc */
EVENTHANDLER_INVOKE(process_init, p);
p->p_stats = pstats_alloc();
SDT_PROBE(proc, kernel, init, return, p, size, flags, 0, 0);
return (0);
}
/*
* UMA should ensure that this function is never called.
* Freeing a proc structure would violate type stability.
*/
static void
proc_fini(void *mem, int size)
{
#ifdef notnow
struct proc *p;
p = (struct proc *)mem;
EVENTHANDLER_INVOKE(process_fini, p);
pstats_free(p->p_stats);
thread_free(FIRST_THREAD_IN_PROC(p));
mtx_destroy(&p->p_mtx);
if (p->p_ksi != NULL)
ksiginfo_free(p->p_ksi);
#else
panic("proc reclaimed");
#endif
}
/*
* Is p an inferior of the current process?
*/
int
inferior(p)
register struct proc *p;
{
sx_assert(&proctree_lock, SX_LOCKED);
for (; p != curproc; p = p->p_pptr)
if (p->p_pid == 0)
return (0);
return (1);
}
/*
* Locate a process by number; return only "live" processes -- i.e., neither
* zombies nor newly born but incompletely initialized processes. By not
* returning processes in the PRS_NEW state, we allow callers to avoid
* testing for that condition to avoid dereferencing p_ucred, et al.
*/
struct proc *
pfind(pid)
register pid_t pid;
{
register struct proc *p;
sx_slock(&allproc_lock);
LIST_FOREACH(p, PIDHASH(pid), p_hash)
if (p->p_pid == pid) {
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
p = NULL;
}
break;
}
sx_sunlock(&allproc_lock);
return (p);
}
/*
* Locate a process group by number.
* The caller must hold proctree_lock.
*/
struct pgrp *
pgfind(pgid)
register pid_t pgid;
{
register struct pgrp *pgrp;
sx_assert(&proctree_lock, SX_LOCKED);
LIST_FOREACH(pgrp, PGRPHASH(pgid), pg_hash) {
if (pgrp->pg_id == pgid) {
PGRP_LOCK(pgrp);
return (pgrp);
}
}
return (NULL);
}
/*
* Create a new process group.
* pgid must be equal to the pid of p.
* Begin a new session if required.
*/
int
enterpgrp(p, pgid, pgrp, sess)
register struct proc *p;
pid_t pgid;
struct pgrp *pgrp;
struct session *sess;
{
struct pgrp *pgrp2;
sx_assert(&proctree_lock, SX_XLOCKED);
KASSERT(pgrp != NULL, ("enterpgrp: pgrp == NULL"));
KASSERT(p->p_pid == pgid,
("enterpgrp: new pgrp and pid != pgid"));
pgrp2 = pgfind(pgid);
KASSERT(pgrp2 == NULL,
("enterpgrp: pgrp with pgid exists"));
KASSERT(!SESS_LEADER(p),
("enterpgrp: session leader attempted setpgrp"));
mtx_init(&pgrp->pg_mtx, "process group", NULL, MTX_DEF | MTX_DUPOK);
if (sess != NULL) {
/*
* new session
*/
mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
PROC_LOCK(p);
p->p_flag &= ~P_CONTROLT;
PROC_UNLOCK(p);
PGRP_LOCK(pgrp);
sess->s_leader = p;
sess->s_sid = p->p_pid;
refcount_init(&sess->s_count, 1);
sess->s_ttyvp = NULL;
sess->s_ttydp = NULL;
sess->s_ttyp = NULL;
bcopy(p->p_session->s_login, sess->s_login,
sizeof(sess->s_login));
pgrp->pg_session = sess;
KASSERT(p == curproc,
("enterpgrp: mksession and p != curproc"));
} else {
pgrp->pg_session = p->p_session;
sess_hold(pgrp->pg_session);
PGRP_LOCK(pgrp);
}
pgrp->pg_id = pgid;
LIST_INIT(&pgrp->pg_members);
/*
* As we have an exclusive lock of proctree_lock,
* this should not deadlock.
*/
LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash);
pgrp->pg_jobc = 0;
SLIST_INIT(&pgrp->pg_sigiolst);
PGRP_UNLOCK(pgrp);
doenterpgrp(p, pgrp);
return (0);
}
/*
* Move p to an existing process group
*/
int
enterthispgrp(p, pgrp)
register struct proc *p;
struct pgrp *pgrp;
{
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
KASSERT(pgrp->pg_session == p->p_session,
("%s: pgrp's session %p, p->p_session %p.\n",
__func__,
pgrp->pg_session,
p->p_session));
KASSERT(pgrp != p->p_pgrp,
("%s: p belongs to pgrp.", __func__));
doenterpgrp(p, pgrp);
return (0);
}
/*
* Move p to a process group
*/
static void
doenterpgrp(p, pgrp)
struct proc *p;
struct pgrp *pgrp;
{
struct pgrp *savepgrp;
sx_assert(&proctree_lock, SX_XLOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
PGRP_LOCK_ASSERT(p->p_pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(p->p_session, MA_NOTOWNED);
savepgrp = p->p_pgrp;
/*
* Adjust eligibility of affected pgrps to participate in job control.
* Increment eligibility counts before decrementing, otherwise we
* could reach 0 spuriously during the first call.
*/
fixjobc(p, pgrp, 1);
fixjobc(p, p->p_pgrp, 0);
PGRP_LOCK(pgrp);
PGRP_LOCK(savepgrp);
PROC_LOCK(p);
LIST_REMOVE(p, p_pglist);
p->p_pgrp = pgrp;
PROC_UNLOCK(p);
LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
PGRP_UNLOCK(savepgrp);
PGRP_UNLOCK(pgrp);
if (LIST_EMPTY(&savepgrp->pg_members))
pgdelete(savepgrp);
}
/*
* remove process from process group
*/
int
leavepgrp(p)
register struct proc *p;
{
struct pgrp *savepgrp;
sx_assert(&proctree_lock, SX_XLOCKED);
savepgrp = p->p_pgrp;
PGRP_LOCK(savepgrp);
PROC_LOCK(p);
LIST_REMOVE(p, p_pglist);
p->p_pgrp = NULL;
PROC_UNLOCK(p);
PGRP_UNLOCK(savepgrp);
if (LIST_EMPTY(&savepgrp->pg_members))
pgdelete(savepgrp);
return (0);
}
/*
* delete a process group
*/
static void
pgdelete(pgrp)
register struct pgrp *pgrp;
{
struct session *savesess;
struct tty *tp;
sx_assert(&proctree_lock, SX_XLOCKED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
/*
* Reset any sigio structures pointing to us as a result of
* F_SETOWN with our pgid.
*/
funsetownlst(&pgrp->pg_sigiolst);
PGRP_LOCK(pgrp);
tp = pgrp->pg_session->s_ttyp;
LIST_REMOVE(pgrp, pg_hash);
savesess = pgrp->pg_session;
PGRP_UNLOCK(pgrp);
/* Remove the reference to the pgrp before deallocating it. */
if (tp != NULL) {
tty_lock(tp);
tty_rel_pgrp(tp, pgrp);
}
mtx_destroy(&pgrp->pg_mtx);
free(pgrp, M_PGRP);
sess_release(savesess);
}
static void
pgadjustjobc(pgrp, entering)
struct pgrp *pgrp;
int entering;
{
PGRP_LOCK(pgrp);
if (entering)
pgrp->pg_jobc++;
else {
--pgrp->pg_jobc;
if (pgrp->pg_jobc == 0)
orphanpg(pgrp);
}
PGRP_UNLOCK(pgrp);
}
/*
* Adjust pgrp jobc counters when specified process changes process group.
* We count the number of processes in each process group that "qualify"
* the group for terminal job control (those with a parent in a different
* process group of the same session). If that count reaches zero, the
* process group becomes orphaned. Check both the specified process'
* process group and that of its children.
* entering == 0 => p is leaving specified group.
* entering == 1 => p is entering specified group.
*/
void
fixjobc(p, pgrp, entering)
register struct proc *p;
register struct pgrp *pgrp;
int entering;
{
register struct pgrp *hispgrp;
register struct session *mysession;
sx_assert(&proctree_lock, SX_LOCKED);
PROC_LOCK_ASSERT(p, MA_NOTOWNED);
PGRP_LOCK_ASSERT(pgrp, MA_NOTOWNED);
SESS_LOCK_ASSERT(pgrp->pg_session, MA_NOTOWNED);
/*
* Check p's parent to see whether p qualifies its own process
* group; if so, adjust count for p's process group.
*/
mysession = pgrp->pg_session;
if ((hispgrp = p->p_pptr->p_pgrp) != pgrp &&
hispgrp->pg_session == mysession)
pgadjustjobc(pgrp, entering);
/*
* Check this process' children to see whether they qualify
* their process groups; if so, adjust counts for children's
* process groups.
*/
LIST_FOREACH(p, &p->p_children, p_sibling) {
hispgrp = p->p_pgrp;
if (hispgrp == pgrp ||
hispgrp->pg_session != mysession)
continue;
PROC_LOCK(p);
if (p->p_state == PRS_ZOMBIE) {
PROC_UNLOCK(p);
continue;
}
PROC_UNLOCK(p);
pgadjustjobc(hispgrp, entering);
}
}
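/*
 * Editor's illustrative note on the jobc accounting above.  Example: shell
 * S in pgrp A forks child C into a new pgrp B within the same session.  On
 * entry, fixjobc(C, B, 1) sees C's parent in pgrp A of the same session,
 * so B's pg_jobc becomes 1 and B stays eligible for job control.  If S
 * later exits, the corresponding entering == 0 pass drops B's count to 0
 * and, if any member of B is stopped, orphanpg() sends SIGHUP and SIGCONT
 * to the whole group.
 */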
/*
* A process group has become orphaned;
* if there are any stopped processes in the group,
* hang up all processes in that group.
*/
static void
orphanpg(pg)
struct pgrp *pg;
{
register struct proc *p;
PGRP_LOCK_ASSERT(pg, MA_OWNED);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (P_SHOULDSTOP(p)) {
PROC_UNLOCK(p);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
- psignal(p, SIGHUP);
- psignal(p, SIGCONT);
+ kern_psignal(p, SIGHUP);
+ kern_psignal(p, SIGCONT);
PROC_UNLOCK(p);
}
return;
}
PROC_UNLOCK(p);
}
}
void
sess_hold(struct session *s)
{
refcount_acquire(&s->s_count);
}
void
sess_release(struct session *s)
{
if (refcount_release(&s->s_count)) {
if (s->s_ttyp != NULL) {
tty_lock(s->s_ttyp);
tty_rel_sess(s->s_ttyp, s);
}
mtx_destroy(&s->s_mtx);
free(s, M_SESSION);
}
}
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
DB_SHOW_COMMAND(pgrpdump, pgrpdump)
{
register struct pgrp *pgrp;
register struct proc *p;
register int i;
for (i = 0; i <= pgrphash; i++) {
if (!LIST_EMPTY(&pgrphashtbl[i])) {
printf("\tindx %d\n", i);
LIST_FOREACH(pgrp, &pgrphashtbl[i], pg_hash) {
printf(
"\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n",
(void *)pgrp, (long)pgrp->pg_id,
(void *)pgrp->pg_session,
pgrp->pg_session->s_count,
(void *)LIST_FIRST(&pgrp->pg_members));
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
printf("\t\tpid %ld addr %p pgrp %p\n",
(long)p->p_pid, (void *)p,
(void *)p->p_pgrp);
}
}
}
}
}
#endif /* DDB */
/*
* Calculate the kinfo_proc members which contain process-wide
* information.
* Must be called with the target process locked.
*/
static void
fill_kinfo_aggregate(struct proc *p, struct kinfo_proc *kp)
{
struct thread *td;
PROC_LOCK_ASSERT(p, MA_OWNED);
kp->ki_estcpu = 0;
kp->ki_pctcpu = 0;
FOREACH_THREAD_IN_PROC(p, td) {
thread_lock(td);
kp->ki_pctcpu += sched_pctcpu(td);
kp->ki_estcpu += td->td_estcpu;
thread_unlock(td);
}
}
/*
* Clear kinfo_proc and fill in any information that is common
* to all threads in the process.
* Must be called with the target process locked.
*/
static void
fill_kinfo_proc_only(struct proc *p, struct kinfo_proc *kp)
{
struct thread *td0;
struct tty *tp;
struct session *sp;
struct ucred *cred;
struct sigacts *ps;
PROC_LOCK_ASSERT(p, MA_OWNED);
bzero(kp, sizeof(*kp));
kp->ki_structsize = sizeof(*kp);
kp->ki_paddr = p;
kp->ki_addr =/* p->p_addr; */0; /* XXX */
kp->ki_args = p->p_args;
kp->ki_textvp = p->p_textvp;
#ifdef KTRACE
kp->ki_tracep = p->p_tracevp;
kp->ki_traceflag = p->p_traceflag;
#endif
kp->ki_fd = p->p_fd;
kp->ki_vmspace = p->p_vmspace;
kp->ki_flag = p->p_flag;
cred = p->p_ucred;
if (cred) {
kp->ki_uid = cred->cr_uid;
kp->ki_ruid = cred->cr_ruid;
kp->ki_svuid = cred->cr_svuid;
kp->ki_cr_flags = 0;
if (cred->cr_flags & CRED_FLAG_CAPMODE)
kp->ki_cr_flags |= KI_CRF_CAPABILITY_MODE;
/* XXX bde doesn't like KI_NGROUPS */
if (cred->cr_ngroups > KI_NGROUPS) {
kp->ki_ngroups = KI_NGROUPS;
kp->ki_cr_flags |= KI_CRF_GRP_OVERFLOW;
} else
kp->ki_ngroups = cred->cr_ngroups;
bcopy(cred->cr_groups, kp->ki_groups,
kp->ki_ngroups * sizeof(gid_t));
kp->ki_rgid = cred->cr_rgid;
kp->ki_svgid = cred->cr_svgid;
/* If jailed(cred), emulate the old P_JAILED flag. */
if (jailed(cred)) {
kp->ki_flag |= P_JAILED;
/* If inside the jail, use 0 as a jail ID. */
if (cred->cr_prison != curthread->td_ucred->cr_prison)
kp->ki_jid = cred->cr_prison->pr_id;
}
strlcpy(kp->ki_loginclass, cred->cr_loginclass->lc_name,
sizeof(kp->ki_loginclass));
}
ps = p->p_sigacts;
if (ps) {
mtx_lock(&ps->ps_mtx);
kp->ki_sigignore = ps->ps_sigignore;
kp->ki_sigcatch = ps->ps_sigcatch;
mtx_unlock(&ps->ps_mtx);
}
if (p->p_state != PRS_NEW &&
p->p_state != PRS_ZOMBIE &&
p->p_vmspace != NULL) {
struct vmspace *vm = p->p_vmspace;
kp->ki_size = vm->vm_map.size;
kp->ki_rssize = vmspace_resident_count(vm); /*XXX*/
FOREACH_THREAD_IN_PROC(p, td0) {
if (!TD_IS_SWAPPED(td0))
kp->ki_rssize += td0->td_kstack_pages;
}
kp->ki_swrss = vm->vm_swrss;
kp->ki_tsize = vm->vm_tsize;
kp->ki_dsize = vm->vm_dsize;
kp->ki_ssize = vm->vm_ssize;
} else if (p->p_state == PRS_ZOMBIE)
kp->ki_stat = SZOMB;
if (kp->ki_flag & P_INMEM)
kp->ki_sflag = PS_INMEM;
else
kp->ki_sflag = 0;
/* Calculate legacy swtime as seconds since 'swtick'. */
kp->ki_swtime = (ticks - p->p_swtick) / hz;
kp->ki_pid = p->p_pid;
kp->ki_nice = p->p_nice;
kp->ki_start = p->p_stats->p_start;
timevaladd(&kp->ki_start, &boottime);
PROC_SLOCK(p);
rufetch(p, &kp->ki_rusage);
kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
PROC_SUNLOCK(p);
calccru(p, &kp->ki_childutime, &kp->ki_childstime);
/* Some callers want child times in a single value. */
kp->ki_childtime = kp->ki_childstime;
timevaladd(&kp->ki_childtime, &kp->ki_childutime);
tp = NULL;
if (p->p_pgrp) {
kp->ki_pgid = p->p_pgrp->pg_id;
kp->ki_jobc = p->p_pgrp->pg_jobc;
sp = p->p_pgrp->pg_session;
if (sp != NULL) {
kp->ki_sid = sp->s_sid;
SESS_LOCK(sp);
strlcpy(kp->ki_login, sp->s_login,
sizeof(kp->ki_login));
if (sp->s_ttyvp)
kp->ki_kiflag |= KI_CTTY;
if (SESS_LEADER(p))
kp->ki_kiflag |= KI_SLEADER;
/* XXX proctree_lock */
tp = sp->s_ttyp;
SESS_UNLOCK(sp);
}
}
if ((p->p_flag & P_CONTROLT) && tp != NULL) {
kp->ki_tdev = tty_udev(tp);
kp->ki_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
if (tp->t_session)
kp->ki_tsid = tp->t_session->s_sid;
} else
kp->ki_tdev = NODEV;
if (p->p_comm[0] != '\0')
strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
if (p->p_sysent && p->p_sysent->sv_name != NULL &&
p->p_sysent->sv_name[0] != '\0')
strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
kp->ki_siglist = p->p_siglist;
kp->ki_xstat = p->p_xstat;
kp->ki_acflag = p->p_acflag;
kp->ki_lock = p->p_lock;
if (p->p_pptr)
kp->ki_ppid = p->p_pptr->p_pid;
}
/*
* Fill in information that is thread specific. Must be called with
* target process locked. If 'preferthread' is set, overwrite certain
* process-related fields that are maintained for both threads and
* processes.
*/
static void
fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp, int preferthread)
{
struct proc *p;
p = td->td_proc;
kp->ki_tdaddr = td;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (preferthread)
PROC_SLOCK(p);
thread_lock(td);
if (td->td_wmesg != NULL)
strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
else
bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
strlcpy(kp->ki_tdname, td->td_name, sizeof(kp->ki_tdname));
if (TD_ON_LOCK(td)) {
kp->ki_kiflag |= KI_LOCKBLOCK;
strlcpy(kp->ki_lockname, td->td_lockname,
sizeof(kp->ki_lockname));
} else {
kp->ki_kiflag &= ~KI_LOCKBLOCK;
bzero(kp->ki_lockname, sizeof(kp->ki_lockname));
}
if (p->p_state == PRS_NORMAL) { /* approximate. */
if (TD_ON_RUNQ(td) ||
TD_CAN_RUN(td) ||
TD_IS_RUNNING(td)) {
kp->ki_stat = SRUN;
} else if (P_SHOULDSTOP(p)) {
kp->ki_stat = SSTOP;
} else if (TD_IS_SLEEPING(td)) {
kp->ki_stat = SSLEEP;
} else if (TD_ON_LOCK(td)) {
kp->ki_stat = SLOCK;
} else {
kp->ki_stat = SWAIT;
}
} else if (p->p_state == PRS_ZOMBIE) {
kp->ki_stat = SZOMB;
} else {
kp->ki_stat = SIDL;
}
/* Things in the thread */
kp->ki_wchan = td->td_wchan;
kp->ki_pri.pri_level = td->td_priority;
kp->ki_pri.pri_native = td->td_base_pri;
kp->ki_lastcpu = td->td_lastcpu;
kp->ki_oncpu = td->td_oncpu;
kp->ki_tdflags = td->td_flags;
kp->ki_tid = td->td_tid;
kp->ki_numthreads = p->p_numthreads;
kp->ki_pcb = td->td_pcb;
kp->ki_kstack = (void *)td->td_kstack;
kp->ki_slptime = (ticks - td->td_slptick) / hz;
kp->ki_pri.pri_class = td->td_pri_class;
kp->ki_pri.pri_user = td->td_user_pri;
if (preferthread) {
rufetchtd(td, &kp->ki_rusage);
kp->ki_runtime = cputick2usec(td->td_rux.rux_runtime);
kp->ki_pctcpu = sched_pctcpu(td);
kp->ki_estcpu = td->td_estcpu;
}
/* We can't get this anymore but ps etc never used it anyway. */
kp->ki_rqindex = 0;
if (preferthread)
kp->ki_siglist = td->td_siglist;
kp->ki_sigmask = td->td_sigmask;
thread_unlock(td);
if (preferthread)
PROC_SUNLOCK(p);
}
/*
* Fill in a kinfo_proc structure for the specified process.
* Must be called with the target process locked.
*/
void
fill_kinfo_proc(struct proc *p, struct kinfo_proc *kp)
{
MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
fill_kinfo_proc_only(p, kp);
fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp, 0);
fill_kinfo_aggregate(p, kp);
}
struct pstats *
pstats_alloc(void)
{
return (malloc(sizeof(struct pstats), M_SUBPROC, M_ZERO|M_WAITOK));
}
/*
* Copy parts of p_stats; zero the rest of p_stats (statistics).
*/
void
pstats_fork(struct pstats *src, struct pstats *dst)
{
bzero(&dst->pstat_startzero,
__rangeof(struct pstats, pstat_startzero, pstat_endzero));
bcopy(&src->pstat_startcopy, &dst->pstat_startcopy,
__rangeof(struct pstats, pstat_startcopy, pstat_endcopy));
}
void
pstats_free(struct pstats *ps)
{
free(ps, M_SUBPROC);
}
/*
* Locate a zombie process by number
*/
struct proc *
zpfind(pid_t pid)
{
struct proc *p;
sx_slock(&allproc_lock);
LIST_FOREACH(p, &zombproc, p_list)
if (p->p_pid == pid) {
PROC_LOCK(p);
break;
}
sx_sunlock(&allproc_lock);
return (p);
}
#define KERN_PROC_ZOMBMASK 0x3
#define KERN_PROC_NOTHREADS 0x4
#ifdef COMPAT_FREEBSD32
/*
* This function is typically used to copy out the kernel address, so
* it can be replaced by assignment of zero.
*/
static inline uint32_t
ptr32_trim(void *ptr)
{
uintptr_t uptr;
uptr = (uintptr_t)ptr;
return ((uptr > UINT_MAX) ? 0 : uptr);
}
#define PTRTRIM_CP(src,dst,fld) \
do { (dst).fld = ptr32_trim((src).fld); } while (0)
static void
freebsd32_kinfo_proc_out(const struct kinfo_proc *ki, struct kinfo_proc32 *ki32)
{
int i;
bzero(ki32, sizeof(struct kinfo_proc32));
ki32->ki_structsize = sizeof(struct kinfo_proc32);
CP(*ki, *ki32, ki_layout);
PTRTRIM_CP(*ki, *ki32, ki_args);
PTRTRIM_CP(*ki, *ki32, ki_paddr);
PTRTRIM_CP(*ki, *ki32, ki_addr);
PTRTRIM_CP(*ki, *ki32, ki_tracep);
PTRTRIM_CP(*ki, *ki32, ki_textvp);
PTRTRIM_CP(*ki, *ki32, ki_fd);
PTRTRIM_CP(*ki, *ki32, ki_vmspace);
PTRTRIM_CP(*ki, *ki32, ki_wchan);
CP(*ki, *ki32, ki_pid);
CP(*ki, *ki32, ki_ppid);
CP(*ki, *ki32, ki_pgid);
CP(*ki, *ki32, ki_tpgid);
CP(*ki, *ki32, ki_sid);
CP(*ki, *ki32, ki_tsid);
CP(*ki, *ki32, ki_jobc);
CP(*ki, *ki32, ki_tdev);
CP(*ki, *ki32, ki_siglist);
CP(*ki, *ki32, ki_sigmask);
CP(*ki, *ki32, ki_sigignore);
CP(*ki, *ki32, ki_sigcatch);
CP(*ki, *ki32, ki_uid);
CP(*ki, *ki32, ki_ruid);
CP(*ki, *ki32, ki_svuid);
CP(*ki, *ki32, ki_rgid);
CP(*ki, *ki32, ki_svgid);
CP(*ki, *ki32, ki_ngroups);
for (i = 0; i < KI_NGROUPS; i++)
CP(*ki, *ki32, ki_groups[i]);
CP(*ki, *ki32, ki_size);
CP(*ki, *ki32, ki_rssize);
CP(*ki, *ki32, ki_swrss);
CP(*ki, *ki32, ki_tsize);
CP(*ki, *ki32, ki_dsize);
CP(*ki, *ki32, ki_ssize);
CP(*ki, *ki32, ki_xstat);
CP(*ki, *ki32, ki_acflag);
CP(*ki, *ki32, ki_pctcpu);
CP(*ki, *ki32, ki_estcpu);
CP(*ki, *ki32, ki_slptime);
CP(*ki, *ki32, ki_swtime);
CP(*ki, *ki32, ki_runtime);
TV_CP(*ki, *ki32, ki_start);
TV_CP(*ki, *ki32, ki_childtime);
CP(*ki, *ki32, ki_flag);
CP(*ki, *ki32, ki_kiflag);
CP(*ki, *ki32, ki_traceflag);
CP(*ki, *ki32, ki_stat);
CP(*ki, *ki32, ki_nice);
CP(*ki, *ki32, ki_lock);
CP(*ki, *ki32, ki_rqindex);
CP(*ki, *ki32, ki_oncpu);
CP(*ki, *ki32, ki_lastcpu);
bcopy(ki->ki_tdname, ki32->ki_tdname, TDNAMLEN + 1);
bcopy(ki->ki_wmesg, ki32->ki_wmesg, WMESGLEN + 1);
bcopy(ki->ki_login, ki32->ki_login, LOGNAMELEN + 1);
bcopy(ki->ki_lockname, ki32->ki_lockname, LOCKNAMELEN + 1);
bcopy(ki->ki_comm, ki32->ki_comm, COMMLEN + 1);
bcopy(ki->ki_emul, ki32->ki_emul, KI_EMULNAMELEN + 1);
bcopy(ki->ki_loginclass, ki32->ki_loginclass, LOGINCLASSLEN + 1);
CP(*ki, *ki32, ki_cr_flags);
CP(*ki, *ki32, ki_jid);
CP(*ki, *ki32, ki_numthreads);
CP(*ki, *ki32, ki_tid);
CP(*ki, *ki32, ki_pri);
freebsd32_rusage_out(&ki->ki_rusage, &ki32->ki_rusage);
freebsd32_rusage_out(&ki->ki_rusage_ch, &ki32->ki_rusage_ch);
PTRTRIM_CP(*ki, *ki32, ki_pcb);
PTRTRIM_CP(*ki, *ki32, ki_kstack);
PTRTRIM_CP(*ki, *ki32, ki_udata);
CP(*ki, *ki32, ki_sflag);
CP(*ki, *ki32, ki_tdflags);
}
static int
sysctl_out_proc_copyout(struct kinfo_proc *ki, struct sysctl_req *req)
{
struct kinfo_proc32 ki32;
int error;
if (req->flags & SCTL_MASK32) {
freebsd32_kinfo_proc_out(ki, &ki32);
error = SYSCTL_OUT(req, &ki32, sizeof(struct kinfo_proc32));
} else
error = SYSCTL_OUT(req, ki, sizeof(struct kinfo_proc));
return (error);
}
#else
static int
sysctl_out_proc_copyout(struct kinfo_proc *ki, struct sysctl_req *req)
{
return (SYSCTL_OUT(req, ki, sizeof(struct kinfo_proc)));
}
#endif
/*
* Must be called with the process locked and will return with it unlocked.
*/
static int
sysctl_out_proc(struct proc *p, struct sysctl_req *req, int flags)
{
struct thread *td;
struct kinfo_proc kinfo_proc;
int error = 0;
struct proc *np;
pid_t pid = p->p_pid;
PROC_LOCK_ASSERT(p, MA_OWNED);
MPASS(FIRST_THREAD_IN_PROC(p) != NULL);
fill_kinfo_proc(p, &kinfo_proc);
if (flags & KERN_PROC_NOTHREADS)
error = sysctl_out_proc_copyout(&kinfo_proc, req);
else {
FOREACH_THREAD_IN_PROC(p, td) {
fill_kinfo_thread(td, &kinfo_proc, 1);
error = sysctl_out_proc_copyout(&kinfo_proc, req);
if (error)
break;
}
}
PROC_UNLOCK(p);
if (error)
return (error);
if (flags & KERN_PROC_ZOMBMASK)
np = zpfind(pid);
else {
if (pid == 0)
return (0);
np = pfind(pid);
}
if (np == NULL)
return (ESRCH);
if (np != p) {
PROC_UNLOCK(np);
return (ESRCH);
}
PROC_UNLOCK(np);
return (0);
}
static int
sysctl_kern_proc(SYSCTL_HANDLER_ARGS)
{
int *name = (int*) arg1;
u_int namelen = arg2;
struct proc *p;
int flags, doingzomb, oid_number;
int error = 0;
oid_number = oidp->oid_number;
if (oid_number != KERN_PROC_ALL &&
(oid_number & KERN_PROC_INC_THREAD) == 0)
flags = KERN_PROC_NOTHREADS;
else {
flags = 0;
oid_number &= ~KERN_PROC_INC_THREAD;
}
if (oid_number == KERN_PROC_PID) {
if (namelen != 1)
return (EINVAL);
error = sysctl_wire_old_buffer(req, 0);
if (error)
return (error);
p = pfind((pid_t)name[0]);
if (!p)
return (ESRCH);
if ((error = p_cansee(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
error = sysctl_out_proc(p, req, flags);
return (error);
}
switch (oid_number) {
case KERN_PROC_ALL:
if (namelen != 0)
return (EINVAL);
break;
case KERN_PROC_PROC:
if (namelen != 0 && namelen != 1)
return (EINVAL);
break;
default:
if (namelen != 1)
return (EINVAL);
break;
}
if (!req->oldptr) {
/* overestimate by 5 procs */
error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
if (error)
return (error);
}
error = sysctl_wire_old_buffer(req, 0);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
if (!doingzomb)
p = LIST_FIRST(&allproc);
else
p = LIST_FIRST(&zombproc);
for (; p != 0; p = LIST_NEXT(p, p_list)) {
/*
* Skip embryonic processes.
*/
PROC_LOCK(p);
if (p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
KASSERT(p->p_ucred != NULL,
("process credential is NULL for non-NEW proc"));
/*
* Show a user only appropriate processes.
*/
if (p_cansee(curthread, p)) {
PROC_UNLOCK(p);
continue;
}
/*
* TODO - make more efficient (see notes below).
* do by session.
*/
switch (oid_number) {
case KERN_PROC_GID:
if (p->p_ucred->cr_gid != (gid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_PGRP:
/* could do this by traversing pgrp */
if (p->p_pgrp == NULL ||
p->p_pgrp->pg_id != (pid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_RGID:
if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_SESSION:
if (p->p_session == NULL ||
p->p_session->s_sid != (pid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_TTY:
if ((p->p_flag & P_CONTROLT) == 0 ||
p->p_session == NULL) {
PROC_UNLOCK(p);
continue;
}
/* XXX proctree_lock */
SESS_LOCK(p->p_session);
if (p->p_session->s_ttyp == NULL ||
tty_udev(p->p_session->s_ttyp) !=
(dev_t)name[0]) {
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
continue;
}
SESS_UNLOCK(p->p_session);
break;
case KERN_PROC_UID:
if (p->p_ucred->cr_uid != (uid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_RUID:
if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
PROC_UNLOCK(p);
continue;
}
break;
case KERN_PROC_PROC:
break;
default:
break;
}
error = sysctl_out_proc(p, req, flags | doingzomb);
if (error) {
sx_sunlock(&allproc_lock);
return (error);
}
}
}
sx_sunlock(&allproc_lock);
return (0);
}
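/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): reading a single kinfo_proc record through the handler above.
 * Assumes <sys/types.h>, <sys/sysctl.h> and <sys/user.h> are included and
 * "pid" is the target process ID.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, pid };
 *	struct kinfo_proc kp;
 *	size_t len = sizeof(kp);
 *
 *	if (sysctl(mib, 4, &kp, &len, NULL, 0) == 0)
 *		printf("%s: state %d\n", kp.ki_comm, kp.ki_stat);
 *
 * The *_td variants (KERN_PROC_* | KERN_PROC_INC_THREAD) return one record
 * per thread instead of one per process.
 */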
struct pargs *
pargs_alloc(int len)
{
struct pargs *pa;
pa = malloc(sizeof(struct pargs) + len, M_PARGS,
M_WAITOK);
refcount_init(&pa->ar_ref, 1);
pa->ar_length = len;
return (pa);
}
static void
pargs_free(struct pargs *pa)
{
free(pa, M_PARGS);
}
void
pargs_hold(struct pargs *pa)
{
if (pa == NULL)
return;
refcount_acquire(&pa->ar_ref);
}
void
pargs_drop(struct pargs *pa)
{
if (pa == NULL)
return;
if (refcount_release(&pa->ar_ref))
pargs_free(pa);
}
/*
* This sysctl allows a process to retrieve the argument list or process
* title for another process without groping around in the address space
* of the other process. It also allows a process to set its own "process
* title" to a string of its own choice.
*/
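/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): this is essentially what setproctitle(3) does with the MIB served
 * by the handler below; "title" is an arbitrary example string.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_ARGS, getpid() };
 *	char title[] = "myprog: idle";
 *
 *	(void)sysctl(mib, 4, NULL, NULL, title, sizeof(title));
 *
 * Reading another process's arguments uses the same MIB with an output
 * buffer in place of the new data.
 */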
static int
sysctl_kern_proc_args(SYSCTL_HANDLER_ARGS)
{
int *name = (int*) arg1;
u_int namelen = arg2;
struct pargs *newpa, *pa;
struct proc *p;
int error = 0;
if (namelen != 1)
return (EINVAL);
p = pfind((pid_t)name[0]);
if (!p)
return (ESRCH);
if ((error = p_cansee(curthread, p)) != 0) {
PROC_UNLOCK(p);
return (error);
}
if (req->newptr && curproc != p) {
PROC_UNLOCK(p);
return (EPERM);
}
pa = p->p_args;
pargs_hold(pa);
PROC_UNLOCK(p);
if (pa != NULL)
error = SYSCTL_OUT(req, pa->ar_args, pa->ar_length);
pargs_drop(pa);
if (error != 0 || req->newptr == NULL)
return (error);
if (req->newlen + sizeof(struct pargs) > ps_arg_cache_limit)
return (ENOMEM);
newpa = pargs_alloc(req->newlen);
error = SYSCTL_IN(req, newpa->ar_args, req->newlen);
if (error != 0) {
pargs_free(newpa);
return (error);
}
PROC_LOCK(p);
pa = p->p_args;
p->p_args = newpa;
PROC_UNLOCK(p);
pargs_drop(pa);
return (0);
}
/*
* This sysctl allows a process to retrieve the path of the executable for
* itself or another process.
*/
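/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): resolving the calling process's own executable path; -1 selects
 * the current process, as handled below.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1 };
 *	char path[PATH_MAX];
 *	size_t len = sizeof(path);
 *
 *	if (sysctl(mib, 4, path, &len, NULL, 0) == 0)
 *		printf("%s\n", path);
 *
 * Note the handler returns success with no output when p_textvp is NULL.
 */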
static int
sysctl_kern_proc_pathname(SYSCTL_HANDLER_ARGS)
{
pid_t *pidp = (pid_t *)arg1;
unsigned int arglen = arg2;
struct proc *p;
struct vnode *vp;
char *retbuf, *freebuf;
int error, vfslocked;
if (arglen != 1)
return (EINVAL);
if (*pidp == -1) { /* -1 means this process */
p = req->td->td_proc;
} else {
p = pfind(*pidp);
if (p == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p)) != 0) {
PROC_UNLOCK(p);
return (error);
}
}
vp = p->p_textvp;
if (vp == NULL) {
if (*pidp != -1)
PROC_UNLOCK(p);
return (0);
}
vref(vp);
if (*pidp != -1)
PROC_UNLOCK(p);
error = vn_fullpath(req->td, vp, &retbuf, &freebuf);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = SYSCTL_OUT(req, retbuf, strlen(retbuf) + 1);
free(freebuf, M_TEMP);
return (error);
}
static int
sysctl_kern_proc_sv_name(SYSCTL_HANDLER_ARGS)
{
struct proc *p;
char *sv_name;
int *name;
int namelen;
int error;
namelen = arg2;
if (namelen != 1)
return (EINVAL);
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
sv_name = p->p_sysent->sv_name;
PROC_UNLOCK(p);
return (sysctl_handle_string(oidp, sv_name, 0, req));
}
#ifdef KINFO_OVMENTRY_SIZE
CTASSERT(sizeof(struct kinfo_ovmentry) == KINFO_OVMENTRY_SIZE);
#endif
#ifdef COMPAT_FREEBSD7
static int
sysctl_kern_proc_ovmmap(SYSCTL_HANDLER_ARGS)
{
vm_map_entry_t entry, tmp_entry;
unsigned int last_timestamp;
char *fullpath, *freepath;
struct kinfo_ovmentry *kve;
struct vattr va;
struct ucred *cred;
int error, *name;
struct vnode *vp;
struct proc *p;
vm_map_t map;
struct vmspace *vm;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if (p->p_flag & P_WEXIT) {
PROC_UNLOCK(p);
return (ESRCH);
}
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
_PHOLD(p);
PROC_UNLOCK(p);
vm = vmspace_acquire_ref(p);
if (vm == NULL) {
PRELE(p);
return (ESRCH);
}
kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
map = &p->p_vmspace->vm_map; /* XXXRW: More locking required? */
vm_map_lock_read(map);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
vm_object_t obj, tobj, lobj;
vm_offset_t addr;
int vfslocked;
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
bzero(kve, sizeof(*kve));
kve->kve_structsize = sizeof(*kve);
kve->kve_private_resident = 0;
obj = entry->object.vm_object;
if (obj != NULL) {
VM_OBJECT_LOCK(obj);
if (obj->shadow_count == 1)
kve->kve_private_resident =
obj->resident_page_count;
}
kve->kve_resident = 0;
addr = entry->start;
while (addr < entry->end) {
if (pmap_extract(map->pmap, addr))
kve->kve_resident++;
addr += PAGE_SIZE;
}
for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
}
kve->kve_start = (void*)entry->start;
kve->kve_end = (void*)entry->end;
kve->kve_offset = (off_t)entry->offset;
if (entry->protection & VM_PROT_READ)
kve->kve_protection |= KVME_PROT_READ;
if (entry->protection & VM_PROT_WRITE)
kve->kve_protection |= KVME_PROT_WRITE;
if (entry->protection & VM_PROT_EXECUTE)
kve->kve_protection |= KVME_PROT_EXEC;
if (entry->eflags & MAP_ENTRY_COW)
kve->kve_flags |= KVME_FLAG_COW;
if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
kve->kve_fileid = 0;
kve->kve_fsid = 0;
freepath = NULL;
fullpath = "";
if (lobj) {
vp = NULL;
switch (lobj->type) {
case OBJT_DEFAULT:
kve->kve_type = KVME_TYPE_DEFAULT;
break;
case OBJT_VNODE:
kve->kve_type = KVME_TYPE_VNODE;
vp = lobj->handle;
vref(vp);
break;
case OBJT_SWAP:
kve->kve_type = KVME_TYPE_SWAP;
break;
case OBJT_DEVICE:
kve->kve_type = KVME_TYPE_DEVICE;
break;
case OBJT_PHYS:
kve->kve_type = KVME_TYPE_PHYS;
break;
case OBJT_DEAD:
kve->kve_type = KVME_TYPE_DEAD;
break;
case OBJT_SG:
kve->kve_type = KVME_TYPE_SG;
break;
default:
kve->kve_type = KVME_TYPE_UNKNOWN;
break;
}
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
kve->kve_ref_count = obj->ref_count;
kve->kve_shadow_count = obj->shadow_count;
VM_OBJECT_UNLOCK(obj);
if (vp != NULL) {
vn_fullpath(curthread, vp, &fullpath,
&freepath);
cred = curthread->td_ucred;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(vp, &va, cred) == 0) {
kve->kve_fileid = va.va_fileid;
kve->kve_fsid = va.va_fsid;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
} else {
kve->kve_type = KVME_TYPE_NONE;
kve->kve_ref_count = 0;
kve->kve_shadow_count = 0;
}
strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
if (freepath != NULL)
free(freepath, M_TEMP);
error = SYSCTL_OUT(req, kve, sizeof(*kve));
vm_map_lock_read(map);
if (error)
break;
if (last_timestamp != map->timestamp) {
vm_map_lookup_entry(map, addr - 1, &tmp_entry);
entry = tmp_entry;
}
}
vm_map_unlock_read(map);
vmspace_free(vm);
PRELE(p);
free(kve, M_TEMP);
return (error);
}
#endif /* COMPAT_FREEBSD7 */
#ifdef KINFO_VMENTRY_SIZE
CTASSERT(sizeof(struct kinfo_vmentry) == KINFO_VMENTRY_SIZE);
#endif
static int
sysctl_kern_proc_vmmap(SYSCTL_HANDLER_ARGS)
{
vm_map_entry_t entry, tmp_entry;
unsigned int last_timestamp;
char *fullpath, *freepath;
struct kinfo_vmentry *kve;
struct vattr va;
struct ucred *cred;
int error, *name;
struct vnode *vp;
struct proc *p;
struct vmspace *vm;
vm_map_t map;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
if (p->p_flag & P_WEXIT) {
PROC_UNLOCK(p);
return (ESRCH);
}
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
_PHOLD(p);
PROC_UNLOCK(p);
vm = vmspace_acquire_ref(p);
if (vm == NULL) {
PRELE(p);
return (ESRCH);
}
kve = malloc(sizeof(*kve), M_TEMP, M_WAITOK);
map = &vm->vm_map; /* XXXRW: More locking required? */
vm_map_lock_read(map);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
vm_object_t obj, tobj, lobj;
vm_offset_t addr;
int vfslocked;
if (entry->eflags & MAP_ENTRY_IS_SUB_MAP)
continue;
bzero(kve, sizeof(*kve));
kve->kve_private_resident = 0;
obj = entry->object.vm_object;
if (obj != NULL) {
VM_OBJECT_LOCK(obj);
if (obj->shadow_count == 1)
kve->kve_private_resident =
obj->resident_page_count;
}
kve->kve_resident = 0;
addr = entry->start;
while (addr < entry->end) {
if (pmap_extract(map->pmap, addr))
kve->kve_resident++;
addr += PAGE_SIZE;
}
for (lobj = tobj = obj; tobj; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
}
kve->kve_start = entry->start;
kve->kve_end = entry->end;
kve->kve_offset = entry->offset;
if (entry->protection & VM_PROT_READ)
kve->kve_protection |= KVME_PROT_READ;
if (entry->protection & VM_PROT_WRITE)
kve->kve_protection |= KVME_PROT_WRITE;
if (entry->protection & VM_PROT_EXECUTE)
kve->kve_protection |= KVME_PROT_EXEC;
if (entry->eflags & MAP_ENTRY_COW)
kve->kve_flags |= KVME_FLAG_COW;
if (entry->eflags & MAP_ENTRY_NEEDS_COPY)
kve->kve_flags |= KVME_FLAG_NEEDS_COPY;
if (entry->eflags & MAP_ENTRY_NOCOREDUMP)
kve->kve_flags |= KVME_FLAG_NOCOREDUMP;
last_timestamp = map->timestamp;
vm_map_unlock_read(map);
freepath = NULL;
fullpath = "";
if (lobj) {
vp = NULL;
switch (lobj->type) {
case OBJT_DEFAULT:
kve->kve_type = KVME_TYPE_DEFAULT;
break;
case OBJT_VNODE:
kve->kve_type = KVME_TYPE_VNODE;
vp = lobj->handle;
vref(vp);
break;
case OBJT_SWAP:
kve->kve_type = KVME_TYPE_SWAP;
break;
case OBJT_DEVICE:
kve->kve_type = KVME_TYPE_DEVICE;
break;
case OBJT_PHYS:
kve->kve_type = KVME_TYPE_PHYS;
break;
case OBJT_DEAD:
kve->kve_type = KVME_TYPE_DEAD;
break;
case OBJT_SG:
kve->kve_type = KVME_TYPE_SG;
break;
default:
kve->kve_type = KVME_TYPE_UNKNOWN;
break;
}
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
kve->kve_ref_count = obj->ref_count;
kve->kve_shadow_count = obj->shadow_count;
VM_OBJECT_UNLOCK(obj);
if (vp != NULL) {
vn_fullpath(curthread, vp, &fullpath,
&freepath);
kve->kve_vn_type = vntype_to_kinfo(vp->v_type);
cred = curthread->td_ucred;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(vp, &va, cred) == 0) {
kve->kve_vn_fileid = va.va_fileid;
kve->kve_vn_fsid = va.va_fsid;
kve->kve_vn_mode =
MAKEIMODE(va.va_type, va.va_mode);
kve->kve_vn_size = va.va_size;
kve->kve_vn_rdev = va.va_rdev;
kve->kve_status = KF_ATTR_VALID;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
} else {
kve->kve_type = KVME_TYPE_NONE;
kve->kve_ref_count = 0;
kve->kve_shadow_count = 0;
}
strlcpy(kve->kve_path, fullpath, sizeof(kve->kve_path));
if (freepath != NULL)
free(freepath, M_TEMP);
/* Pack record size down */
kve->kve_structsize = offsetof(struct kinfo_vmentry, kve_path) +
strlen(kve->kve_path) + 1;
kve->kve_structsize = roundup(kve->kve_structsize,
sizeof(uint64_t));
error = SYSCTL_OUT(req, kve, kve->kve_structsize);
vm_map_lock_read(map);
if (error)
break;
if (last_timestamp != map->timestamp) {
vm_map_lookup_entry(map, addr - 1, &tmp_entry);
entry = tmp_entry;
}
}
vm_map_unlock_read(map);
vmspace_free(vm);
PRELE(p);
free(kve, M_TEMP);
return (error);
}
#if defined(STACK) || defined(DDB)
static int
sysctl_kern_proc_kstack(SYSCTL_HANDLER_ARGS)
{
struct kinfo_kstack *kkstp;
int error, i, *name, numthreads;
lwpid_t *lwpidarray;
struct thread *td;
struct stack *st;
struct sbuf sb;
struct proc *p;
name = (int *)arg1;
if ((p = pfind((pid_t)name[0])) == NULL)
return (ESRCH);
/* XXXRW: Not clear ESRCH is the right error during proc execve(). */
if (p->p_flag & P_WEXIT || p->p_flag & P_INEXEC) {
PROC_UNLOCK(p);
return (ESRCH);
}
if ((error = p_candebug(curthread, p))) {
PROC_UNLOCK(p);
return (error);
}
_PHOLD(p);
PROC_UNLOCK(p);
kkstp = malloc(sizeof(*kkstp), M_TEMP, M_WAITOK);
st = stack_create();
lwpidarray = NULL;
numthreads = 0;
PROC_LOCK(p);
repeat:
if (numthreads < p->p_numthreads) {
if (lwpidarray != NULL) {
free(lwpidarray, M_TEMP);
lwpidarray = NULL;
}
numthreads = p->p_numthreads;
PROC_UNLOCK(p);
lwpidarray = malloc(sizeof(*lwpidarray) * numthreads, M_TEMP,
M_WAITOK | M_ZERO);
PROC_LOCK(p);
goto repeat;
}
i = 0;
/*
* XXXRW: During the below loop, execve(2) and countless other sorts
* of changes could have taken place. Should we check to see if the
* vmspace has been replaced, or the like, in order to prevent
* giving a snapshot that spans, say, execve(2), with some threads
* before and some after? Among other things, the credentials could
* have changed, in which case the right to extract debug info might
* no longer be assured.
*/
FOREACH_THREAD_IN_PROC(p, td) {
KASSERT(i < numthreads,
("sysctl_kern_proc_kstack: numthreads"));
lwpidarray[i] = td->td_tid;
i++;
}
numthreads = i;
for (i = 0; i < numthreads; i++) {
td = thread_find(p, lwpidarray[i]);
if (td == NULL) {
continue;
}
bzero(kkstp, sizeof(*kkstp));
(void)sbuf_new(&sb, kkstp->kkst_trace,
sizeof(kkstp->kkst_trace), SBUF_FIXEDLEN);
thread_lock(td);
kkstp->kkst_tid = td->td_tid;
if (TD_IS_SWAPPED(td))
kkstp->kkst_state = KKST_STATE_SWAPPED;
else if (TD_IS_RUNNING(td))
kkstp->kkst_state = KKST_STATE_RUNNING;
else {
kkstp->kkst_state = KKST_STATE_STACKOK;
stack_save_td(st, td);
}
thread_unlock(td);
PROC_UNLOCK(p);
stack_sbuf_print(&sb, st);
sbuf_finish(&sb);
sbuf_delete(&sb);
error = SYSCTL_OUT(req, kkstp, sizeof(*kkstp));
PROC_LOCK(p);
if (error)
break;
}
_PRELE(p);
PROC_UNLOCK(p);
if (lwpidarray != NULL)
free(lwpidarray, M_TEMP);
stack_destroy(st);
free(kkstp, M_TEMP);
return (error);
}
#endif
/*
* This sysctl allows a process to retrieve the full list of groups from
* itself or another process.
*/
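/*
 * Illustrative userland sketch (editor's addition, not part of the original
 * source): fetching another process's group set; "pid" is the target process
 * ID and the array size is an arbitrary upper bound.
 *
 *	int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_GROUPS, pid };
 *	gid_t gids[NGROUPS_MAX];
 *	size_t len = sizeof(gids);
 *
 *	if (sysctl(mib, 4, gids, &len, NULL, 0) == 0)
 *		ngroups = len / sizeof(gid_t);
 */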
static int
sysctl_kern_proc_groups(SYSCTL_HANDLER_ARGS)
{
pid_t *pidp = (pid_t *)arg1;
unsigned int arglen = arg2;
struct proc *p;
struct ucred *cred;
int error;
if (arglen != 1)
return (EINVAL);
if (*pidp == -1) { /* -1 means this process */
p = req->td->td_proc;
} else {
p = pfind(*pidp);
if (p == NULL)
return (ESRCH);
if ((error = p_cansee(curthread, p)) != 0) {
PROC_UNLOCK(p);
return (error);
}
}
cred = crhold(p->p_ucred);
if (*pidp != -1)
PROC_UNLOCK(p);
error = SYSCTL_OUT(req, cred->cr_groups,
cred->cr_ngroups * sizeof(gid_t));
crfree(cred);
return (error);
}
SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT|
CTLFLAG_MPSAFE, 0, 0, sysctl_kern_proc, "S,proc",
"Return entire process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_GID, gid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_RGID, rgid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_SESSION, sid, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PROC, proc, CTLFLAG_RD | CTLFLAG_MPSAFE,
sysctl_kern_proc, "Return process table, no threads");
static SYSCTL_NODE(_kern_proc, KERN_PROC_ARGS, args,
CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE,
sysctl_kern_proc_args, "Process argument list");
static SYSCTL_NODE(_kern_proc, KERN_PROC_PATHNAME, pathname, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_pathname, "Process executable path");
static SYSCTL_NODE(_kern_proc, KERN_PROC_SV_NAME, sv_name, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_sv_name,
"Process syscall vector name (ABI type)");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_GID | KERN_PROC_INC_THREAD), gid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_PGRP | KERN_PROC_INC_THREAD), pgrp_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_RGID | KERN_PROC_INC_THREAD), rgid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_SESSION | KERN_PROC_INC_THREAD),
sid_td, CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_TTY | KERN_PROC_INC_THREAD), tty_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_UID | KERN_PROC_INC_THREAD), uid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_RUID | KERN_PROC_INC_THREAD), ruid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_PID | KERN_PROC_INC_THREAD), pid_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc, "Process table");
static SYSCTL_NODE(_kern_proc, (KERN_PROC_PROC | KERN_PROC_INC_THREAD), proc_td,
CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_kern_proc,
"Return process table, no threads");
#ifdef COMPAT_FREEBSD7
static SYSCTL_NODE(_kern_proc, KERN_PROC_OVMMAP, ovmmap, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_ovmmap, "Old Process vm map entries");
#endif
static SYSCTL_NODE(_kern_proc, KERN_PROC_VMMAP, vmmap, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_vmmap, "Process vm map entries");
#if defined(STACK) || defined(DDB)
static SYSCTL_NODE(_kern_proc, KERN_PROC_KSTACK, kstack, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_kstack, "Process kernel stacks");
#endif
static SYSCTL_NODE(_kern_proc, KERN_PROC_GROUPS, groups, CTLFLAG_RD |
CTLFLAG_MPSAFE, sysctl_kern_proc_groups, "Process groups");
Index: head/sys/kern/kern_prot.c
===================================================================
--- head/sys/kern/kern_prot.c (revision 225616)
+++ head/sys/kern/kern_prot.c (revision 225617)
@@ -1,2220 +1,2220 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
* The Regents of the University of California.
* (c) UNIX System Laboratories, Inc.
* Copyright (c) 2000-2001 Robert N. M. Watson.
* All rights reserved.
*
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_prot.c 8.6 (Berkeley) 1/21/94
*/
/*
* System calls related to processes and protection
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/acct.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/loginclass.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/refcount.h>
#include <sys/sx.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#ifdef REGRESSION
FEATURE(regression,
"Kernel support for interfaces nessesary for regression testing (SECURITY RISK!)");
#endif
#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#endif
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
static MALLOC_DEFINE(M_CRED, "cred", "credentials");
SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy");
static void crextend(struct ucred *cr, int n);
static void crsetgroups_locked(struct ucred *cr, int ngrp,
gid_t *groups);
#ifndef _SYS_SYSPROTO_H_
struct getpid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getpid(struct thread *td, struct getpid_args *uap)
+sys_getpid(struct thread *td, struct getpid_args *uap)
{
struct proc *p = td->td_proc;
td->td_retval[0] = p->p_pid;
#if defined(COMPAT_43)
PROC_LOCK(p);
td->td_retval[1] = p->p_pptr->p_pid;
PROC_UNLOCK(p);
#endif
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getppid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getppid(struct thread *td, struct getppid_args *uap)
+sys_getppid(struct thread *td, struct getppid_args *uap)
{
struct proc *p = td->td_proc;
PROC_LOCK(p);
td->td_retval[0] = p->p_pptr->p_pid;
PROC_UNLOCK(p);
return (0);
}
/*
* Get process group ID; note that POSIX getpgrp takes no parameter.
*/
#ifndef _SYS_SYSPROTO_H_
struct getpgrp_args {
int dummy;
};
#endif
int
-getpgrp(struct thread *td, struct getpgrp_args *uap)
+sys_getpgrp(struct thread *td, struct getpgrp_args *uap)
{
struct proc *p = td->td_proc;
PROC_LOCK(p);
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return (0);
}
/* Get an arbitrary pid's process group id */
#ifndef _SYS_SYSPROTO_H_
struct getpgid_args {
pid_t pid;
};
#endif
int
-getpgid(struct thread *td, struct getpgid_args *uap)
+sys_getpgid(struct thread *td, struct getpgid_args *uap)
{
struct proc *p;
int error;
if (uap->pid == 0) {
p = td->td_proc;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
error = p_cansee(td, p);
if (error) {
PROC_UNLOCK(p);
return (error);
}
}
td->td_retval[0] = p->p_pgrp->pg_id;
PROC_UNLOCK(p);
return (0);
}
/*
* Get an arbitrary pid's session id.
*/
#ifndef _SYS_SYSPROTO_H_
struct getsid_args {
pid_t pid;
};
#endif
int
-getsid(struct thread *td, struct getsid_args *uap)
+sys_getsid(struct thread *td, struct getsid_args *uap)
{
struct proc *p;
int error;
if (uap->pid == 0) {
p = td->td_proc;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
error = p_cansee(td, p);
if (error) {
PROC_UNLOCK(p);
return (error);
}
}
td->td_retval[0] = p->p_session->s_sid;
PROC_UNLOCK(p);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getuid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getuid(struct thread *td, struct getuid_args *uap)
+sys_getuid(struct thread *td, struct getuid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_ruid;
#if defined(COMPAT_43)
td->td_retval[1] = td->td_ucred->cr_uid;
#endif
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct geteuid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-geteuid(struct thread *td, struct geteuid_args *uap)
+sys_geteuid(struct thread *td, struct geteuid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_uid;
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getgid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getgid(struct thread *td, struct getgid_args *uap)
+sys_getgid(struct thread *td, struct getgid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_rgid;
#if defined(COMPAT_43)
td->td_retval[1] = td->td_ucred->cr_groups[0];
#endif
return (0);
}
/*
* Get effective group ID. The "egid" is groups[0], and could be obtained
* via getgroups. This syscall exists because it is somewhat painful to do
* correctly in a library function.
*/
#ifndef _SYS_SYSPROTO_H_
struct getegid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-getegid(struct thread *td, struct getegid_args *uap)
+sys_getegid(struct thread *td, struct getegid_args *uap)
{
td->td_retval[0] = td->td_ucred->cr_groups[0];
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct getgroups_args {
u_int gidsetsize;
gid_t *gidset;
};
#endif
int
-getgroups(struct thread *td, register struct getgroups_args *uap)
+sys_getgroups(struct thread *td, register struct getgroups_args *uap)
{
gid_t *groups;
u_int ngrp;
int error;
if (uap->gidsetsize < td->td_ucred->cr_ngroups) {
if (uap->gidsetsize == 0)
ngrp = 0;
else
return (EINVAL);
} else
ngrp = td->td_ucred->cr_ngroups;
groups = malloc(ngrp * sizeof(*groups), M_TEMP, M_WAITOK);
error = kern_getgroups(td, &ngrp, groups);
if (error)
goto out;
if (uap->gidsetsize > 0)
error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t));
if (error == 0)
td->td_retval[0] = ngrp;
out:
free(groups, M_TEMP);
return (error);
}
int
kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups)
{
struct ucred *cred;
cred = td->td_ucred;
if (*ngrp == 0) {
*ngrp = cred->cr_ngroups;
return (0);
}
if (*ngrp < cred->cr_ngroups)
return (EINVAL);
*ngrp = cred->cr_ngroups;
bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t));
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct setsid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-setsid(register struct thread *td, struct setsid_args *uap)
+sys_setsid(register struct thread *td, struct setsid_args *uap)
{
struct pgrp *pgrp;
int error;
struct proc *p = td->td_proc;
struct pgrp *newpgrp;
struct session *newsess;
error = 0;
pgrp = NULL;
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
newsess = malloc(sizeof(struct session), M_SESSION, M_WAITOK | M_ZERO);
sx_xlock(&proctree_lock);
if (p->p_pgid == p->p_pid || (pgrp = pgfind(p->p_pid)) != NULL) {
if (pgrp != NULL)
PGRP_UNLOCK(pgrp);
error = EPERM;
} else {
(void)enterpgrp(p, p->p_pid, newpgrp, newsess);
td->td_retval[0] = p->p_pid;
newpgrp = NULL;
newsess = NULL;
}
sx_xunlock(&proctree_lock);
if (newpgrp != NULL)
free(newpgrp, M_PGRP);
if (newsess != NULL)
free(newsess, M_SESSION);
return (error);
}
/*
* set process group (setpgid/old setpgrp)
*
* caller does setpgid(targpid, targpgid)
*
* pid must be caller or child of caller (ESRCH)
* if a child
* pid must be in same session (EPERM)
* pid can't have done an exec (EACCES)
* if pgid != pid
* there must exist some pid in same session having pgid (EPERM)
* pid must not be session leader (EPERM)
*/
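/*
 * Illustrative example (editor's addition, not part of the original source):
 * a job-control shell typically calls setpgid(child, child) from both the
 * parent and the child when launching a pipeline, so the first process in
 * the job becomes the group leader no matter which side runs first;
 * setpgid(0, 0) is the shorthand a child may use for itself.
 */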
#ifndef _SYS_SYSPROTO_H_
struct setpgid_args {
int pid; /* target process id */
int pgid; /* target pgrp id */
};
#endif
/* ARGSUSED */
int
-setpgid(struct thread *td, register struct setpgid_args *uap)
+sys_setpgid(struct thread *td, register struct setpgid_args *uap)
{
struct proc *curp = td->td_proc;
register struct proc *targp; /* target process */
register struct pgrp *pgrp; /* target pgrp */
int error;
struct pgrp *newpgrp;
if (uap->pgid < 0)
return (EINVAL);
error = 0;
newpgrp = malloc(sizeof(struct pgrp), M_PGRP, M_WAITOK | M_ZERO);
sx_xlock(&proctree_lock);
if (uap->pid != 0 && uap->pid != curp->p_pid) {
if ((targp = pfind(uap->pid)) == NULL) {
error = ESRCH;
goto done;
}
if (!inferior(targp)) {
PROC_UNLOCK(targp);
error = ESRCH;
goto done;
}
if ((error = p_cansee(td, targp))) {
PROC_UNLOCK(targp);
goto done;
}
if (targp->p_pgrp == NULL ||
targp->p_session != curp->p_session) {
PROC_UNLOCK(targp);
error = EPERM;
goto done;
}
if (targp->p_flag & P_EXEC) {
PROC_UNLOCK(targp);
error = EACCES;
goto done;
}
PROC_UNLOCK(targp);
} else
targp = curp;
if (SESS_LEADER(targp)) {
error = EPERM;
goto done;
}
if (uap->pgid == 0)
uap->pgid = targp->p_pid;
if ((pgrp = pgfind(uap->pgid)) == NULL) {
if (uap->pgid == targp->p_pid) {
error = enterpgrp(targp, uap->pgid, newpgrp,
NULL);
if (error == 0)
newpgrp = NULL;
} else
error = EPERM;
} else {
if (pgrp == targp->p_pgrp) {
PGRP_UNLOCK(pgrp);
goto done;
}
if (pgrp->pg_id != targp->p_pid &&
pgrp->pg_session != curp->p_session) {
PGRP_UNLOCK(pgrp);
error = EPERM;
goto done;
}
PGRP_UNLOCK(pgrp);
error = enterthispgrp(targp, pgrp);
}
done:
sx_xunlock(&proctree_lock);
KASSERT((error == 0) || (newpgrp != NULL),
("setpgid failed and newpgrp is NULL"));
if (newpgrp != NULL)
free(newpgrp, M_PGRP);
return (error);
}
/*
* Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD
* compatible. It says that setting the uid/gid to euid/egid is a special
* case of "appropriate privilege". Once the rules are expanded out, this
* basically means that setuid(nnn) sets all three id's, in all permitted
* cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid())
* does not set the saved id - this is dangerous for traditional BSD
* programs. For this reason, we *really* do not want to set
* _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2.
*/
#define POSIX_APPENDIX_B_4_2_2
#ifndef _SYS_SYSPROTO_H_
struct setuid_args {
uid_t uid;
};
#endif
/* ARGSUSED */
int
-setuid(struct thread *td, struct setuid_args *uap)
+sys_setuid(struct thread *td, struct setuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t uid;
struct uidinfo *uip;
int error;
uid = uap->uid;
AUDIT_ARG_UID(uid);
newcred = crget();
uip = uifind(uid);
PROC_LOCK(p);
/*
* Copy credentials so other references do not see our changes.
*/
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setuid(oldcred, uid);
if (error)
goto fail;
#endif
/*
* See if we have "permission" by POSIX 1003.1 rules.
*
* Note that setuid(geteuid()) is a special case of
* "appropriate privileges" in appendix B.4.2.2. We need
* to use this clause to be compatible with traditional BSD
* semantics. Basically, it means that "setuid(xx)" sets all
* three id's (assuming you have privs).
*
* Notes on the logic. We do things in three steps.
* 1: We determine if the euid is going to change, and do EPERM
* right away. We unconditionally change the euid later if this
* test is satisfied, simplifying that part of the logic.
* 2: We determine if the real and/or saved uids are going to
* change. Determined by compile options.
* 3: Change euid last. (after tests in #2 for "appropriate privs")
*/
if (uid != oldcred->cr_ruid && /* allow setuid(getuid()) */
#ifdef _POSIX_SAVED_IDS
uid != oldcred->cr_svuid && /* allow setuid(saved uid) */
#endif
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
uid != oldcred->cr_uid && /* allow setuid(geteuid()) */
#endif
(error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0)
goto fail;
#ifdef _POSIX_SAVED_IDS
/*
* Do we have "appropriate privileges" (are we root or uid == euid)
* If so, we are changing the real uid and/or saved uid.
*/
if (
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */
uid == oldcred->cr_uid ||
#endif
/* We are using privs. */
priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0)
#endif
{
/*
* Set the real uid and transfer proc count to new user.
*/
if (uid != oldcred->cr_ruid) {
change_ruid(newcred, uip);
setsugid(p);
}
/*
* Set saved uid
*
* XXX always set saved uid even if not _POSIX_SAVED_IDS, as
* the security of seteuid() depends on it. B.4.2.2 says it
* is important that we should do this.
*/
if (uid != oldcred->cr_svuid) {
change_svuid(newcred, uid);
setsugid(p);
}
}
/*
* In all permitted cases, we are changing the euid.
*/
if (uid != oldcred->cr_uid) {
change_euid(newcred, uip);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
uifree(uip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(uip);
crfree(newcred);
return (error);
}
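/*
 * Illustrative example (editor's addition, not part of the original source):
 * for a set-user-ID root program running with ruid=1001, euid=0, svuid=0,
 * a successful setuid(1001) passes the B.4.2.2 check (uid == cr_ruid) and,
 * per the code above, sets all three IDs to 1001; a later seteuid(0) then
 * fails because neither the real nor the saved uid is 0 any more.
 */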
#ifndef _SYS_SYSPROTO_H_
struct seteuid_args {
uid_t euid;
};
#endif
/* ARGSUSED */
int
-seteuid(struct thread *td, struct seteuid_args *uap)
+sys_seteuid(struct thread *td, struct seteuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t euid;
struct uidinfo *euip;
int error;
euid = uap->euid;
AUDIT_ARG_EUID(euid);
newcred = crget();
euip = uifind(euid);
PROC_LOCK(p);
/*
* Copy credentials so other references do not see our changes.
*/
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_seteuid(oldcred, euid);
if (error)
goto fail;
#endif
if (euid != oldcred->cr_ruid && /* allow seteuid(getuid()) */
euid != oldcred->cr_svuid && /* allow seteuid(saved uid) */
(error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0)
goto fail;
/*
* Everything's okay, do it.
*/
if (oldcred->cr_uid != euid) {
change_euid(newcred, euip);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
uifree(euip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(euip);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setgid_args {
gid_t gid;
};
#endif
/* ARGSUSED */
int
-setgid(struct thread *td, struct setgid_args *uap)
+sys_setgid(struct thread *td, struct setgid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t gid;
int error;
gid = uap->gid;
AUDIT_ARG_GID(gid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setgid(oldcred, gid);
if (error)
goto fail;
#endif
/*
* See if we have "permission" by POSIX 1003.1 rules.
*
* Note that setgid(getegid()) is a special case of
* "appropriate privileges" in appendix B.4.2.2. We need
* to use this clause to be compatible with traditional BSD
* semantics. Basically, it means that "setgid(xx)" sets all
* three id's (assuming you have privs).
*
* For notes on the logic here, see setuid() above.
*/
if (gid != oldcred->cr_rgid && /* allow setgid(getgid()) */
#ifdef _POSIX_SAVED_IDS
gid != oldcred->cr_svgid && /* allow setgid(saved gid) */
#endif
#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */
gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
#endif
(error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0)
goto fail;
#ifdef _POSIX_SAVED_IDS
/*
* Do we have "appropriate privileges" (are we root or gid == egid)
* If so, we are changing the real gid and saved gid.
*/
if (
#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */
gid == oldcred->cr_groups[0] ||
#endif
/* We are using privs. */
priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0)
#endif
{
/*
* Set real gid
*/
if (oldcred->cr_rgid != gid) {
change_rgid(newcred, gid);
setsugid(p);
}
/*
* Set saved gid
*
* XXX always set saved gid even if not _POSIX_SAVED_IDS, as
* the security of setegid() depends on it. B.4.2.2 says it
* is important that we should do this.
*/
if (oldcred->cr_svgid != gid) {
change_svgid(newcred, gid);
setsugid(p);
}
}
/*
* In all permitted cases, we are changing the egid.
* Copy credentials so other references do not see our changes.
*/
if (oldcred->cr_groups[0] != gid) {
change_egid(newcred, gid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setegid_args {
gid_t egid;
};
#endif
/* ARGSUSED */
int
-setegid(struct thread *td, struct setegid_args *uap)
+sys_setegid(struct thread *td, struct setegid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t egid;
int error;
egid = uap->egid;
AUDIT_ARG_EGID(egid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setegid(oldcred, egid);
if (error)
goto fail;
#endif
if (egid != oldcred->cr_rgid && /* allow setegid(getgid()) */
egid != oldcred->cr_svgid && /* allow setegid(saved gid) */
(error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0)
goto fail;
if (oldcred->cr_groups[0] != egid) {
change_egid(newcred, egid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setgroups_args {
u_int gidsetsize;
gid_t *gidset;
};
#endif
/* ARGSUSED */
int
-setgroups(struct thread *td, struct setgroups_args *uap)
+sys_setgroups(struct thread *td, struct setgroups_args *uap)
{
gid_t *groups = NULL;
int error;
if (uap->gidsetsize > ngroups_max + 1)
return (EINVAL);
groups = malloc(uap->gidsetsize * sizeof(gid_t), M_TEMP, M_WAITOK);
error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t));
if (error)
goto out;
error = kern_setgroups(td, uap->gidsetsize, groups);
out:
free(groups, M_TEMP);
return (error);
}
int
kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
int error;
if (ngrp > ngroups_max + 1)
return (EINVAL);
AUDIT_ARG_GROUPSET(groups, ngrp);
newcred = crget();
crextend(newcred, ngrp);
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setgroups(oldcred, ngrp, groups);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0);
if (error)
goto fail;
if (ngrp < 1) {
/*
* setgroups(0, NULL) is a legitimate way of clearing the
* groups vector on non-BSD systems (which generally do not
* have the egid in the groups[0]). We risk security holes
* when running non-BSD software if we do not do the same.
*/
newcred->cr_ngroups = 1;
} else {
crsetgroups_locked(newcred, ngrp, groups);
}
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setreuid_args {
uid_t ruid;
uid_t euid;
};
#endif
/* ARGSUSED */
int
-setreuid(register struct thread *td, struct setreuid_args *uap)
+sys_setreuid(register struct thread *td, struct setreuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t euid, ruid;
struct uidinfo *euip, *ruip;
int error;
euid = uap->euid;
ruid = uap->ruid;
AUDIT_ARG_EUID(euid);
AUDIT_ARG_RUID(ruid);
newcred = crget();
euip = uifind(euid);
ruip = uifind(ruid);
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setreuid(oldcred, ruid, euid);
if (error)
goto fail;
#endif
if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
ruid != oldcred->cr_svuid) ||
(euid != (uid_t)-1 && euid != oldcred->cr_uid &&
euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0)
goto fail;
if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
change_euid(newcred, euip);
setsugid(p);
}
if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
change_ruid(newcred, ruip);
setsugid(p);
}
if ((ruid != (uid_t)-1 || newcred->cr_uid != newcred->cr_ruid) &&
newcred->cr_svuid != newcred->cr_uid) {
change_svuid(newcred, newcred->cr_uid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
uifree(ruip);
uifree(euip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(ruip);
uifree(euip);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setregid_args {
gid_t rgid;
gid_t egid;
};
#endif
/* ARGSUSED */
int
-setregid(register struct thread *td, struct setregid_args *uap)
+sys_setregid(register struct thread *td, struct setregid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t egid, rgid;
int error;
egid = uap->egid;
rgid = uap->rgid;
AUDIT_ARG_EGID(egid);
AUDIT_ARG_RGID(rgid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setregid(oldcred, rgid, egid);
if (error)
goto fail;
#endif
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid) ||
(egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0)
goto fail;
if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
change_egid(newcred, egid);
setsugid(p);
}
if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
change_rgid(newcred, rgid);
setsugid(p);
}
if ((rgid != (gid_t)-1 || newcred->cr_groups[0] != newcred->cr_rgid) &&
newcred->cr_svgid != newcred->cr_groups[0]) {
change_svgid(newcred, newcred->cr_groups[0]);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
/*
* setresuid(ruid, euid, suid) is like setreuid except control over the saved
* uid is explicit.
*/
#ifndef _SYS_SYSPROTO_H_
struct setresuid_args {
uid_t ruid;
uid_t euid;
uid_t suid;
};
#endif
/* ARGSUSED */
int
-setresuid(register struct thread *td, struct setresuid_args *uap)
+sys_setresuid(register struct thread *td, struct setresuid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
uid_t euid, ruid, suid;
struct uidinfo *euip, *ruip;
int error;
euid = uap->euid;
ruid = uap->ruid;
suid = uap->suid;
AUDIT_ARG_EUID(euid);
AUDIT_ARG_RUID(ruid);
AUDIT_ARG_SUID(suid);
newcred = crget();
euip = uifind(euid);
ruip = uifind(ruid);
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setresuid(oldcred, ruid, euid, suid);
if (error)
goto fail;
#endif
if (((ruid != (uid_t)-1 && ruid != oldcred->cr_ruid &&
ruid != oldcred->cr_svuid &&
ruid != oldcred->cr_uid) ||
(euid != (uid_t)-1 && euid != oldcred->cr_ruid &&
euid != oldcred->cr_svuid &&
euid != oldcred->cr_uid) ||
(suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
suid != oldcred->cr_svuid &&
suid != oldcred->cr_uid)) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0)
goto fail;
if (euid != (uid_t)-1 && oldcred->cr_uid != euid) {
change_euid(newcred, euip);
setsugid(p);
}
if (ruid != (uid_t)-1 && oldcred->cr_ruid != ruid) {
change_ruid(newcred, ruip);
setsugid(p);
}
if (suid != (uid_t)-1 && oldcred->cr_svuid != suid) {
change_svuid(newcred, suid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
uifree(ruip);
uifree(euip);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
uifree(ruip);
uifree(euip);
crfree(newcred);
return (error);
}
/*
* setresgid(rgid, egid, sgid) is like setregid except control over the saved
* gid is explicit.
*/
#ifndef _SYS_SYSPROTO_H_
struct setresgid_args {
gid_t rgid;
gid_t egid;
gid_t sgid;
};
#endif
/* ARGSUSED */
int
-setresgid(register struct thread *td, struct setresgid_args *uap)
+sys_setresgid(register struct thread *td, struct setresgid_args *uap)
{
struct proc *p = td->td_proc;
struct ucred *newcred, *oldcred;
gid_t egid, rgid, sgid;
int error;
egid = uap->egid;
rgid = uap->rgid;
sgid = uap->sgid;
AUDIT_ARG_EGID(egid);
AUDIT_ARG_RGID(rgid);
AUDIT_ARG_SGID(sgid);
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
#ifdef MAC
error = mac_cred_check_setresgid(oldcred, rgid, egid, sgid);
if (error)
goto fail;
#endif
if (((rgid != (gid_t)-1 && rgid != oldcred->cr_rgid &&
rgid != oldcred->cr_svgid &&
rgid != oldcred->cr_groups[0]) ||
(egid != (gid_t)-1 && egid != oldcred->cr_rgid &&
egid != oldcred->cr_svgid &&
egid != oldcred->cr_groups[0]) ||
(sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
sgid != oldcred->cr_svgid &&
sgid != oldcred->cr_groups[0])) &&
(error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0)
goto fail;
if (egid != (gid_t)-1 && oldcred->cr_groups[0] != egid) {
change_egid(newcred, egid);
setsugid(p);
}
if (rgid != (gid_t)-1 && oldcred->cr_rgid != rgid) {
change_rgid(newcred, rgid);
setsugid(p);
}
if (sgid != (gid_t)-1 && oldcred->cr_svgid != sgid) {
change_svgid(newcred, sgid);
setsugid(p);
}
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(p);
crfree(newcred);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct getresuid_args {
uid_t *ruid;
uid_t *euid;
uid_t *suid;
};
#endif
/* ARGSUSED */
int
-getresuid(register struct thread *td, struct getresuid_args *uap)
+sys_getresuid(register struct thread *td, struct getresuid_args *uap)
{
struct ucred *cred;
int error1 = 0, error2 = 0, error3 = 0;
cred = td->td_ucred;
if (uap->ruid)
error1 = copyout(&cred->cr_ruid,
uap->ruid, sizeof(cred->cr_ruid));
if (uap->euid)
error2 = copyout(&cred->cr_uid,
uap->euid, sizeof(cred->cr_uid));
if (uap->suid)
error3 = copyout(&cred->cr_svuid,
uap->suid, sizeof(cred->cr_svuid));
return (error1 ? error1 : error2 ? error2 : error3);
}
#ifndef _SYS_SYSPROTO_H_
struct getresgid_args {
gid_t *rgid;
gid_t *egid;
gid_t *sgid;
};
#endif
/* ARGSUSED */
int
-getresgid(register struct thread *td, struct getresgid_args *uap)
+sys_getresgid(register struct thread *td, struct getresgid_args *uap)
{
struct ucred *cred;
int error1 = 0, error2 = 0, error3 = 0;
cred = td->td_ucred;
if (uap->rgid)
error1 = copyout(&cred->cr_rgid,
uap->rgid, sizeof(cred->cr_rgid));
if (uap->egid)
error2 = copyout(&cred->cr_groups[0],
uap->egid, sizeof(cred->cr_groups[0]));
if (uap->sgid)
error3 = copyout(&cred->cr_svgid,
uap->sgid, sizeof(cred->cr_svgid));
return (error1 ? error1 : error2 ? error2 : error3);
}
#ifndef _SYS_SYSPROTO_H_
struct issetugid_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-issetugid(register struct thread *td, struct issetugid_args *uap)
+sys_issetugid(register struct thread *td, struct issetugid_args *uap)
{
struct proc *p = td->td_proc;
/*
* Note: OpenBSD sets a P_SUGIDEXEC flag at execve() time; we use
* P_SUGID because we consider changing the owners to be "tainting"
* as well.
* This is significant for procs that start as root and "become"
* a user without an exec - programs cannot know *everything*
* that libc *might* have put in their data segment.
*/
PROC_LOCK(p);
td->td_retval[0] = (p->p_flag & P_SUGID) ? 1 : 0;
PROC_UNLOCK(p);
return (0);
}
int
-__setugid(struct thread *td, struct __setugid_args *uap)
+sys___setugid(struct thread *td, struct __setugid_args *uap)
{
#ifdef REGRESSION
struct proc *p;
p = td->td_proc;
switch (uap->flag) {
case 0:
PROC_LOCK(p);
p->p_flag &= ~P_SUGID;
PROC_UNLOCK(p);
return (0);
case 1:
PROC_LOCK(p);
p->p_flag |= P_SUGID;
PROC_UNLOCK(p);
return (0);
default:
return (EINVAL);
}
#else /* !REGRESSION */
return (ENOSYS);
#endif /* REGRESSION */
}
/*
* Check if gid is a member of the group set.
*/
int
groupmember(gid_t gid, struct ucred *cred)
{
int l;
int h;
int m;
if (cred->cr_groups[0] == gid)
return(1);
/*
* If gid was not our primary group, perform a binary search
* of the supplemental groups. This is possible because we
* sort the groups in crsetgroups().
*/
l = 1;
h = cred->cr_ngroups;
while (l < h) {
m = l + ((h - l) / 2);
if (cred->cr_groups[m] < gid)
l = m + 1;
else
h = m;
}
if ((l < cred->cr_ngroups) && (cred->cr_groups[l] == gid))
return (1);
return (0);
}
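/*
 * Illustrative example (editor's addition, not part of the original source):
 * with cr_groups = { 5 (egid), 0, 20, 100 } and cr_ngroups = 4, the
 * supplemental entries at indices 1..3 are kept sorted by crsetgroups(), so
 * groupmember(20, cred) narrows [1, 4) to index 2 and returns 1, while
 * groupmember(7, cred) converges on index 2 as well, finds 20 != 7, and
 * returns 0.
 */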
/*
* Test the active securelevel against a given level. securelevel_gt()
* implements (securelevel > level). securelevel_ge() implements
* (securelevel >= level). Note that the logic is inverted -- these
* functions return EPERM on "success" and 0 on "failure".
*
* Due to care taken when setting the securelevel, we know that no jail will
* be less secure than its parent (or the physical system), so it is sufficient
* to test the current jail only.
*
* XXXRW: Possibly since this has to do with privilege, it should move to
* kern_priv.c.
*/
int
securelevel_gt(struct ucred *cr, int level)
{
return (cr->cr_prison->pr_securelevel > level ? EPERM : 0);
}
int
securelevel_ge(struct ucred *cr, int level)
{
return (cr->cr_prison->pr_securelevel >= level ? EPERM : 0);
}
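/*
 * Illustrative example (editor's addition, not part of the original source):
 * with the prison's pr_securelevel at 1, securelevel_gt(cr, 0) and
 * securelevel_ge(cr, 1) both return EPERM, while securelevel_gt(cr, 1)
 * returns 0 and the caller may proceed.
 */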
/*
* 'see_other_uids' determines whether or not visibility of processes
* and sockets with credentials holding different real uids is possible
* using a variety of system MIBs.
* XXX: data declarations should be together near the beginning of the file.
*/
static int see_other_uids = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, see_other_uids, CTLFLAG_RW,
&see_other_uids, 0,
"Unprivileged processes may see subjects/objects with different real uid");
/*-
* Determine if u1 "can see" the subject specified by u2, according to the
* 'see_other_uids' policy.
* Returns: 0 for permitted, ESRCH otherwise
* Locks: none
* References: *u1 and *u2 must not change during the call
* u1 may equal u2, in which case only one reference is required
*/
static int
cr_seeotheruids(struct ucred *u1, struct ucred *u2)
{
if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0)
return (ESRCH);
}
return (0);
}
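/*
 * Illustrative example (editor's addition, not part of the original source):
 * with the security.bsd.see_other_uids sysctl set to 0, an unprivileged
 * user's ps(1) listing shrinks to processes sharing that user's real uid,
 * because cr_seeotheruids() returns ESRCH for everything else unless the
 * caller holds PRIV_SEEOTHERUIDS.
 */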
/*
* 'see_other_gids' determines whether or not visibility of processes
* and sockets with credentials holding different real gids is possible
* using a variety of system MIBs.
* XXX: data declarations should be together near the beginning of the file.
*/
static int see_other_gids = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, see_other_gids, CTLFLAG_RW,
&see_other_gids, 0,
"Unprivileged processes may see subjects/objects with different real gid");
/*
* Determine if u1 can "see" the subject specified by u2, according to the
* 'see_other_gids' policy.
* Returns: 0 for permitted, ESRCH otherwise
* Locks: none
* References: *u1 and *u2 must not change during the call
* u1 may equal u2, in which case only one reference is required
*/
static int
cr_seeothergids(struct ucred *u1, struct ucred *u2)
{
int i, match;
if (!see_other_gids) {
match = 0;
for (i = 0; i < u1->cr_ngroups; i++) {
if (groupmember(u1->cr_groups[i], u2))
match = 1;
if (match)
break;
}
if (!match) {
if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0)
return (ESRCH);
}
}
return (0);
}
/*-
* Determine if u1 "can see" the subject specified by u2.
* Returns: 0 for permitted, an errno value otherwise
* Locks: none
* References: *u1 and *u2 must not change during the call
* u1 may equal u2, in which case only one reference is required
*/
int
cr_cansee(struct ucred *u1, struct ucred *u2)
{
int error;
if ((error = prison_check(u1, u2)))
return (error);
#ifdef MAC
if ((error = mac_cred_check_visible(u1, u2)))
return (error);
#endif
if ((error = cr_seeotheruids(u1, u2)))
return (error);
if ((error = cr_seeothergids(u1, u2)))
return (error);
return (0);
}
/*-
* Determine if td "can see" the subject specified by p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect p->p_ucred must be held. td really
* should be curthread.
* References: td and p must be valid for the lifetime of the call
*/
int
p_cansee(struct thread *td, struct proc *p)
{
/* Wrap cr_cansee() for all functionality. */
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
return (cr_cansee(td->td_ucred, p->p_ucred));
}
/*
* 'conservative_signals' prevents the delivery of a broad class of
* signals by unprivileged processes to processes that have changed their
* credentials since the last invocation of execve(). This can prevent
* the leakage of cached information or retained privileges as a result
* of a common class of signal-related vulnerabilities. However, this
* may interfere with some applications that expect to be able to
* deliver these signals to peer processes after having given up
* privilege.
*/
static int conservative_signals = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, conservative_signals, CTLFLAG_RW,
&conservative_signals, 0, "Unprivileged processes prevented from "
"sending certain signals to processes whose credentials have changed");
/*-
* Determine whether cred may deliver the specified signal to proc.
* Returns: 0 for permitted, an errno value otherwise.
* Locks: A lock must be held for proc.
* References: cred and proc must be valid for the lifetime of the call.
*/
int
cr_cansignal(struct ucred *cred, struct proc *proc, int signum)
{
int error;
PROC_LOCK_ASSERT(proc, MA_OWNED);
/*
* Jail semantics limit the scope of signalling to proc in the
* same jail as cred, if cred is in jail.
*/
error = prison_check(cred, proc->p_ucred);
if (error)
return (error);
#ifdef MAC
if ((error = mac_proc_check_signal(cred, proc, signum)))
return (error);
#endif
if ((error = cr_seeotheruids(cred, proc->p_ucred)))
return (error);
if ((error = cr_seeothergids(cred, proc->p_ucred)))
return (error);
/*
* UNIX signal semantics depend on the status of the P_SUGID
* bit on the target process. If the bit is set, then additional
* restrictions are placed on the set of available signals.
*/
if (conservative_signals && (proc->p_flag & P_SUGID)) {
switch (signum) {
case 0:
case SIGKILL:
case SIGINT:
case SIGTERM:
case SIGALRM:
case SIGSTOP:
case SIGTTIN:
case SIGTTOU:
case SIGTSTP:
case SIGHUP:
case SIGUSR1:
case SIGUSR2:
/*
* Generally, permit job and terminal control
* signals.
*/
break;
default:
/* Not permitted without privilege. */
error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0);
if (error)
return (error);
}
}
/*
* Generally, the target credential's ruid or svuid must match the
* subject credential's ruid or euid.
*/
if (cred->cr_ruid != proc->p_ucred->cr_ruid &&
cred->cr_ruid != proc->p_ucred->cr_svuid &&
cred->cr_uid != proc->p_ucred->cr_ruid &&
cred->cr_uid != proc->p_ucred->cr_svuid) {
error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0);
if (error)
return (error);
}
return (0);
}
/*-
* Determine whether td may deliver the specified signal to p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must be
* held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_cansignal(struct thread *td, struct proc *p, int signum)
{
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if (td->td_proc == p)
return (0);
/*
* UNIX signalling semantics require that processes in the same
* session always be able to deliver SIGCONT to one another,
* overriding the remaining protections.
*/
/* XXX: This will require an additional lock of some sort. */
if (signum == SIGCONT && td->td_proc->p_session == p->p_session)
return (0);
/*
* Some compat layers use SIGTHR and higher signals for
* communication between different kernel threads of the same
* process, and expect that delivering them is always possible,
* even for suid applications where cr_cansignal() would deny it
* for security reasons. Allowing this should be safe, since the
* only way to create two processes with the same p_leader is via
* rfork(2).
*/
if (td->td_proc->p_leader != NULL && signum >= SIGTHR &&
signum < SIGTHR + 4 && td->td_proc->p_leader == p->p_leader)
return (0);
return (cr_cansignal(td->td_ucred, p, signum));
}
/*-
* Determine whether td may reschedule p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must
* be held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_cansched(struct thread *td, struct proc *p)
{
int error;
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if (td->td_proc == p)
return (0);
if ((error = prison_check(td->td_ucred, p->p_ucred)))
return (error);
#ifdef MAC
if ((error = mac_proc_check_sched(td->td_ucred, p)))
return (error);
#endif
if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
return (error);
if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
return (error);
if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
error = priv_check(td, PRIV_SCHED_DIFFCRED);
if (error)
return (error);
}
return (0);
}
/*
* The 'unprivileged_proc_debug' flag may be used to disable a variety of
* unprivileged inter-process debugging services, including some procfs
* functionality, ptrace(), and ktrace(). In the past, inter-process
* debugging has been involved in a variety of security problems, and sites
* not requiring the service might choose to disable it when hardening
* systems.
*
* XXX: Should modifying and reading this variable require locking?
* XXX: data declarations should be together near the beginning of the file.
*/
static int unprivileged_proc_debug = 1;
SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_proc_debug, CTLFLAG_RW,
&unprivileged_proc_debug, 0,
"Unprivileged processes may use process debugging facilities");
/*-
* Determine whether td may debug p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must
* be held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_candebug(struct thread *td, struct proc *p)
{
int credentialchanged, error, grpsubset, i, uidsubset;
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!unprivileged_proc_debug) {
error = priv_check(td, PRIV_DEBUG_UNPRIV);
if (error)
return (error);
}
if (td->td_proc == p)
return (0);
if ((error = prison_check(td->td_ucred, p->p_ucred)))
return (error);
#ifdef MAC
if ((error = mac_proc_check_debug(td->td_ucred, p)))
return (error);
#endif
if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
return (error);
if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
return (error);
/*
* Is p's group set a subset of td's effective group set? This
* includes p's egid, group access list, rgid, and svgid.
*/
grpsubset = 1;
for (i = 0; i < p->p_ucred->cr_ngroups; i++) {
if (!groupmember(p->p_ucred->cr_groups[i], td->td_ucred)) {
grpsubset = 0;
break;
}
}
grpsubset = grpsubset &&
groupmember(p->p_ucred->cr_rgid, td->td_ucred) &&
groupmember(p->p_ucred->cr_svgid, td->td_ucred);
/*
* Are the uids present in p's credential equal to td's
* effective uid? This includes p's euid, svuid, and ruid.
*/
uidsubset = (td->td_ucred->cr_uid == p->p_ucred->cr_uid &&
td->td_ucred->cr_uid == p->p_ucred->cr_svuid &&
td->td_ucred->cr_uid == p->p_ucred->cr_ruid);
/*
* Has the credential of the process changed since the last exec()?
*/
credentialchanged = (p->p_flag & P_SUGID);
/*
* If p's gids aren't a subset, or the uids aren't a subset,
* or the credential has changed, require appropriate privilege
* for td to debug p.
*/
if (!grpsubset || !uidsubset) {
error = priv_check(td, PRIV_DEBUG_DIFFCRED);
if (error)
return (error);
}
if (credentialchanged) {
error = priv_check(td, PRIV_DEBUG_SUGID);
if (error)
return (error);
}
/* Can't trace init when securelevel > 0. */
if (p == initproc) {
error = securelevel_gt(td->td_ucred, 0);
if (error)
return (error);
}
/*
* Can't trace a process that's currently exec'ing.
*
* XXX: Note, this is not a security policy decision, it's a
* basic correctness/functionality decision. Therefore, this check
* should be moved to the callers of p_candebug().
*/
if ((p->p_flag & P_INEXEC) != 0)
return (EBUSY);
return (0);
}
/*-
* Determine whether the subject represented by cred can "see" a socket.
* Returns: 0 for permitted, ENOENT otherwise.
*/
int
cr_canseesocket(struct ucred *cred, struct socket *so)
{
int error;
error = prison_check(cred, so->so_cred);
if (error)
return (ENOENT);
#ifdef MAC
error = mac_socket_check_visible(cred, so);
if (error)
return (error);
#endif
if (cr_seeotheruids(cred, so->so_cred))
return (ENOENT);
if (cr_seeothergids(cred, so->so_cred))
return (ENOENT);
return (0);
}
#if defined(INET) || defined(INET6)
/*-
* Determine whether the subject represented by cred can "see" a socket.
* Returns: 0 for permitted, ENOENT otherwise.
*/
int
cr_canseeinpcb(struct ucred *cred, struct inpcb *inp)
{
int error;
error = prison_check(cred, inp->inp_cred);
if (error)
return (ENOENT);
#ifdef MAC
INP_LOCK_ASSERT(inp);
error = mac_inpcb_check_visible(cred, inp);
if (error)
return (error);
#endif
if (cr_seeotheruids(cred, inp->inp_cred))
return (ENOENT);
if (cr_seeothergids(cred, inp->inp_cred))
return (ENOENT);
return (0);
}
#endif
/*-
* Determine whether td can wait for the exit of p.
* Returns: 0 for permitted, an errno value otherwise
* Locks: Sufficient locks to protect various components of td and p
* must be held. td must be curthread, and a lock must
* be held for p.
* References: td and p must be valid for the lifetime of the call
*/
int
p_canwait(struct thread *td, struct proc *p)
{
int error;
KASSERT(td == curthread, ("%s: td not curthread", __func__));
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((error = prison_check(td->td_ucred, p->p_ucred)))
return (error);
#ifdef MAC
if ((error = mac_proc_check_wait(td->td_ucred, p)))
return (error);
#endif
#if 0
/* XXXMAC: This could have odd effects on some shells. */
if ((error = cr_seeotheruids(td->td_ucred, p->p_ucred)))
return (error);
#endif
return (0);
}
/*
* Allocate a zeroed cred structure.
*/
struct ucred *
crget(void)
{
register struct ucred *cr;
cr = malloc(sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
refcount_init(&cr->cr_ref, 1);
#ifdef AUDIT
audit_cred_init(cr);
#endif
#ifdef MAC
mac_cred_init(cr);
#endif
crextend(cr, XU_NGROUPS);
return (cr);
}
/*
* Claim another reference to a ucred structure.
*/
struct ucred *
crhold(struct ucred *cr)
{
refcount_acquire(&cr->cr_ref);
return (cr);
}
/*
* Free a cred structure. Throws away space when ref count gets to 0.
*/
void
crfree(struct ucred *cr)
{
KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
if (refcount_release(&cr->cr_ref)) {
/*
* Some callers of crget(), such as nfs_statfs(),
* allocate a temporary credential, but don't
* allocate a uidinfo structure.
*/
if (cr->cr_uidinfo != NULL)
uifree(cr->cr_uidinfo);
if (cr->cr_ruidinfo != NULL)
uifree(cr->cr_ruidinfo);
/*
* Free a prison, if any.
*/
if (cr->cr_prison != NULL)
prison_free(cr->cr_prison);
if (cr->cr_loginclass != NULL)
loginclass_free(cr->cr_loginclass);
#ifdef AUDIT
audit_cred_destroy(cr);
#endif
#ifdef MAC
mac_cred_destroy(cr);
#endif
free(cr->cr_groups, M_CRED);
free(cr, M_CRED);
}
}
/*
* Check to see if this ucred is shared.
*/
int
crshared(struct ucred *cr)
{
return (cr->cr_ref > 1);
}
/*
* Copy a ucred's contents from a template. Does not block.
*/
void
crcopy(struct ucred *dest, struct ucred *src)
{
KASSERT(crshared(dest) == 0, ("crcopy of shared ucred"));
bcopy(&src->cr_startcopy, &dest->cr_startcopy,
(unsigned)((caddr_t)&src->cr_endcopy -
(caddr_t)&src->cr_startcopy));
crsetgroups(dest, src->cr_ngroups, src->cr_groups);
uihold(dest->cr_uidinfo);
uihold(dest->cr_ruidinfo);
prison_hold(dest->cr_prison);
loginclass_hold(dest->cr_loginclass);
#ifdef AUDIT
audit_cred_copy(src, dest);
#endif
#ifdef MAC
mac_cred_copy(src, dest);
#endif
}
/*
* Dup cred struct to a new held one.
*/
struct ucred *
crdup(struct ucred *cr)
{
struct ucred *newcr;
newcr = crget();
crcopy(newcr, cr);
return (newcr);
}
/*
* Fill in a struct xucred based on a struct ucred.
*/
void
cru2x(struct ucred *cr, struct xucred *xcr)
{
int ngroups;
bzero(xcr, sizeof(*xcr));
xcr->cr_version = XUCRED_VERSION;
xcr->cr_uid = cr->cr_uid;
ngroups = MIN(cr->cr_ngroups, XU_NGROUPS);
xcr->cr_ngroups = ngroups;
bcopy(cr->cr_groups, xcr->cr_groups,
ngroups * sizeof(*cr->cr_groups));
}
/*
* Small routine to swap a thread's current ucred for the correct one taken
* from the process.
*/
void
cred_update_thread(struct thread *td)
{
struct proc *p;
struct ucred *cred;
p = td->td_proc;
cred = td->td_ucred;
PROC_LOCK(p);
td->td_ucred = crhold(p->p_ucred);
PROC_UNLOCK(p);
if (cred != NULL)
crfree(cred);
}
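/*
* Copy the process credential into a preallocated one, growing the
* preallocated credential's group array first if required. The process
* lock is dropped around crextend(), which may sleep, and p_ucred is
* re-read afterwards, so the old credential returned is the one that
* was current when the copy was finally made.
*/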
struct ucred *
crcopysafe(struct proc *p, struct ucred *cr)
{
struct ucred *oldcred;
int groups;
PROC_LOCK_ASSERT(p, MA_OWNED);
oldcred = p->p_ucred;
while (cr->cr_agroups < oldcred->cr_agroups) {
groups = oldcred->cr_agroups;
PROC_UNLOCK(p);
crextend(cr, groups);
PROC_LOCK(p);
oldcred = p->p_ucred;
}
crcopy(cr, oldcred);
return (oldcred);
}
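/*
* Illustrative sketch, not part of this revision: the usual shape of a
* credential update built from the helpers above and below. A fresh
* ucred is allocated while unlocked, filled from the current one with
* crcopysafe(), modified, installed, and the old reference dropped.
* "example_uip" stands in for a uidinfo looked up by the caller and is
* hypothetical.
*/
#if 0	/* example only, never compiled */
static int
example_set_euid(struct proc *p, struct uidinfo *example_uip)
{
struct ucred *newcred, *oldcred;
newcred = crget();
PROC_LOCK(p);
oldcred = crcopysafe(p, newcred);
change_euid(newcred, example_uip);	/* hypothetical uidinfo */
setsugid(p);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
}
#endif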
/*
* Extend the passed in credential to hold n items.
*/
static void
crextend(struct ucred *cr, int n)
{
int cnt;
/* Truncate? */
if (n <= cr->cr_agroups)
return;
/*
* We double the allocation each time, since we're using a power
* of two allocator, until we need enough groups to fill a page.
* Once we're allocating multiple pages, only allocate as many
* as we actually need. The case of processes needing a
* non-power-of-two number of pages seems more likely than a
* real-world process adding thousands of groups one at a time.
*/
if (n < PAGE_SIZE / sizeof(gid_t)) {
if (cr->cr_agroups == 0)
cnt = MINALLOCSIZE / sizeof(gid_t);
else
cnt = cr->cr_agroups * 2;
while (cnt < n)
cnt *= 2;
} else
cnt = roundup2(n, PAGE_SIZE / sizeof(gid_t));
/* Free the old array. */
if (cr->cr_groups)
free(cr->cr_groups, M_CRED);
cr->cr_groups = malloc(cnt * sizeof(gid_t), M_CRED, M_WAITOK | M_ZERO);
cr->cr_agroups = cnt;
}
/*
* Copy groups into a credential, preserving any necessary invariants.
* Currently this includes the sorting of all supplemental gids.
* crextend() must have been called beforehand to ensure sufficient
* space is available.
*/
static void
crsetgroups_locked(struct ucred *cr, int ngrp, gid_t *groups)
{
int i;
int j;
gid_t g;
KASSERT(cr->cr_agroups >= ngrp, ("cr_ngroups is too small"));
bcopy(groups, cr->cr_groups, ngrp * sizeof(gid_t));
cr->cr_ngroups = ngrp;
/*
* Sort all groups except cr_groups[0] to allow groupmember to
* perform a binary search.
*
* XXX: If large numbers of groups become common this should
* be replaced with shell sort like linux uses or possibly
* heap sort.
*/
for (i = 2; i < ngrp; i++) {
g = cr->cr_groups[i];
for (j = i-1; j >= 1 && g < cr->cr_groups[j]; j--)
cr->cr_groups[j + 1] = cr->cr_groups[j];
cr->cr_groups[j + 1] = g;
}
}
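/*
* Illustrative example, not part of this revision: given groups passed
* in as { 20, 5, 100, 3 }, the result is { 20, 3, 5, 100 } -- index 0
* (the effective gid) is left in place and only the supplemental gids
* behind it are sorted, which is what groupmember() relies on.
*/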
/*
* Copy groups into a credential after expanding it if required.
* Truncate the list to (ngroups_max + 1) if it is too large.
*/
void
crsetgroups(struct ucred *cr, int ngrp, gid_t *groups)
{
if (ngrp > ngroups_max + 1)
ngrp = ngroups_max + 1;
crextend(cr, ngrp);
crsetgroups_locked(cr, ngrp, groups);
}
/*
* Get login name, if available.
*/
#ifndef _SYS_SYSPROTO_H_
struct getlogin_args {
char *namebuf;
u_int namelen;
};
#endif
/* ARGSUSED */
int
-getlogin(struct thread *td, struct getlogin_args *uap)
+sys_getlogin(struct thread *td, struct getlogin_args *uap)
{
int error;
char login[MAXLOGNAME];
struct proc *p = td->td_proc;
if (uap->namelen > MAXLOGNAME)
uap->namelen = MAXLOGNAME;
PROC_LOCK(p);
SESS_LOCK(p->p_session);
bcopy(p->p_session->s_login, login, uap->namelen);
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
error = copyout(login, uap->namebuf, uap->namelen);
return(error);
}
/*
* Set login name.
*/
#ifndef _SYS_SYSPROTO_H_
struct setlogin_args {
char *namebuf;
};
#endif
/* ARGSUSED */
int
-setlogin(struct thread *td, struct setlogin_args *uap)
+sys_setlogin(struct thread *td, struct setlogin_args *uap)
{
struct proc *p = td->td_proc;
int error;
char logintmp[MAXLOGNAME];
error = priv_check(td, PRIV_PROC_SETLOGIN);
if (error)
return (error);
error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
if (error == ENAMETOOLONG)
error = EINVAL;
else if (!error) {
PROC_LOCK(p);
SESS_LOCK(p->p_session);
(void) memcpy(p->p_session->s_login, logintmp,
sizeof(logintmp));
SESS_UNLOCK(p->p_session);
PROC_UNLOCK(p);
}
return (error);
}
void
setsugid(struct proc *p)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_flag |= P_SUGID;
if (!(p->p_pfsflags & PF_ISUGID))
p->p_stops = 0;
}
/*-
* Change a process's effective uid.
* Side effects: newcred->cr_uid and newcred->cr_uidinfo will be modified.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_euid(struct ucred *newcred, struct uidinfo *euip)
{
newcred->cr_uid = euip->ui_uid;
uihold(euip);
uifree(newcred->cr_uidinfo);
newcred->cr_uidinfo = euip;
}
/*-
* Change a process's effective gid.
* Side effects: newcred->cr_gid will be modified.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_egid(struct ucred *newcred, gid_t egid)
{
newcred->cr_groups[0] = egid;
}
/*-
* Change a process's real uid.
* Side effects: newcred->cr_ruid will be updated, newcred->cr_ruidinfo
* will be updated, and the old and new cr_ruidinfo proc
* counts will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_ruid(struct ucred *newcred, struct uidinfo *ruip)
{
(void)chgproccnt(newcred->cr_ruidinfo, -1, 0);
newcred->cr_ruid = ruip->ui_uid;
uihold(ruip);
uifree(newcred->cr_ruidinfo);
newcred->cr_ruidinfo = ruip;
(void)chgproccnt(newcred->cr_ruidinfo, 1, 0);
}
/*-
* Change a process's real gid.
* Side effects: newcred->cr_rgid will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_rgid(struct ucred *newcred, gid_t rgid)
{
newcred->cr_rgid = rgid;
}
/*-
* Change a process's saved uid.
* Side effects: newcred->cr_svuid will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_svuid(struct ucred *newcred, uid_t svuid)
{
newcred->cr_svuid = svuid;
}
/*-
* Change a process's saved gid.
* Side effects: newcred->cr_svgid will be updated.
* References: newcred must be an exclusive credential reference for the
* duration of the call.
*/
void
change_svgid(struct ucred *newcred, gid_t svgid)
{
newcred->cr_svgid = svgid;
}
Index: head/sys/kern/kern_rctl.c
===================================================================
--- head/sys/kern/kern_rctl.c (revision 225616)
+++ head/sys/kern/kern_rctl.c (revision 225617)
@@ -1,1838 +1,1838 @@
/*-
* Copyright (c) 2010 The FreeBSD Foundation
* All rights reserved.
*
* This software was developed by Edward Tomasz Napierala under sponsorship
* from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/loginclass.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/rctl.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/taskqueue.h>
#include <sys/tree.h>
#include <vm/uma.h>
#ifdef RCTL
#ifndef RACCT
#error "The RCTL option requires the RACCT option"
#endif
FEATURE(rctl, "Resource Limits");
#define HRF_DEFAULT 0
#define HRF_DONT_INHERIT 1
#define HRF_DONT_ACCUMULATE 2
/* Default buffer size for rctl_get_rules(2). */
#define RCTL_DEFAULT_BUFSIZE 4096
#define RCTL_LOG_BUFSIZE 128
/*
* 'rctl_rule_link' connects a rule with every racct it's related to.
* For example, rule 'user:X:openfiles:deny=N/process' is linked
* with uidinfo for user X, and to each process of that user.
*/
struct rctl_rule_link {
LIST_ENTRY(rctl_rule_link) rrl_next;
struct rctl_rule *rrl_rule;
int rrl_exceeded;
};
struct dict {
const char *d_name;
int d_value;
};
static struct dict subjectnames[] = {
{ "process", RCTL_SUBJECT_TYPE_PROCESS },
{ "user", RCTL_SUBJECT_TYPE_USER },
{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
{ "jail", RCTL_SUBJECT_TYPE_JAIL },
{ NULL, -1 }};
static struct dict resourcenames[] = {
{ "cputime", RACCT_CPU },
{ "datasize", RACCT_DATA },
{ "stacksize", RACCT_STACK },
{ "coredumpsize", RACCT_CORE },
{ "memoryuse", RACCT_RSS },
{ "memorylocked", RACCT_MEMLOCK },
{ "maxproc", RACCT_NPROC },
{ "openfiles", RACCT_NOFILE },
{ "vmemoryuse", RACCT_VMEM },
{ "pseudoterminals", RACCT_NPTS },
{ "swapuse", RACCT_SWAP },
{ "nthr", RACCT_NTHR },
{ "msgqqueued", RACCT_MSGQQUEUED },
{ "msgqsize", RACCT_MSGQSIZE },
{ "nmsgq", RACCT_NMSGQ },
{ "nsem", RACCT_NSEM },
{ "nsemop", RACCT_NSEMOP },
{ "nshm", RACCT_NSHM },
{ "shmsize", RACCT_SHMSIZE },
{ "wallclock", RACCT_WALLCLOCK },
{ NULL, -1 }};
static struct dict actionnames[] = {
{ "sighup", RCTL_ACTION_SIGHUP },
{ "sigint", RCTL_ACTION_SIGINT },
{ "sigquit", RCTL_ACTION_SIGQUIT },
{ "sigill", RCTL_ACTION_SIGILL },
{ "sigtrap", RCTL_ACTION_SIGTRAP },
{ "sigabrt", RCTL_ACTION_SIGABRT },
{ "sigemt", RCTL_ACTION_SIGEMT },
{ "sigfpe", RCTL_ACTION_SIGFPE },
{ "sigkill", RCTL_ACTION_SIGKILL },
{ "sigbus", RCTL_ACTION_SIGBUS },
{ "sigsegv", RCTL_ACTION_SIGSEGV },
{ "sigsys", RCTL_ACTION_SIGSYS },
{ "sigpipe", RCTL_ACTION_SIGPIPE },
{ "sigalrm", RCTL_ACTION_SIGALRM },
{ "sigterm", RCTL_ACTION_SIGTERM },
{ "sigurg", RCTL_ACTION_SIGURG },
{ "sigstop", RCTL_ACTION_SIGSTOP },
{ "sigtstp", RCTL_ACTION_SIGTSTP },
{ "sigchld", RCTL_ACTION_SIGCHLD },
{ "sigttin", RCTL_ACTION_SIGTTIN },
{ "sigttou", RCTL_ACTION_SIGTTOU },
{ "sigio", RCTL_ACTION_SIGIO },
{ "sigxcpu", RCTL_ACTION_SIGXCPU },
{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
{ "sigprof", RCTL_ACTION_SIGPROF },
{ "sigwinch", RCTL_ACTION_SIGWINCH },
{ "siginfo", RCTL_ACTION_SIGINFO },
{ "sigusr1", RCTL_ACTION_SIGUSR1 },
{ "sigusr2", RCTL_ACTION_SIGUSR2 },
{ "sigthr", RCTL_ACTION_SIGTHR },
{ "deny", RCTL_ACTION_DENY },
{ "log", RCTL_ACTION_LOG },
{ "devctl", RCTL_ACTION_DEVCTL },
{ NULL, -1 }};
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
static uma_zone_t rctl_rule_link_zone;
static uma_zone_t rctl_rule_zone;
static struct rwlock rctl_lock;
RW_SYSINIT(rctl_lock, &rctl_lock, "RCTL lock");
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
static const char *
rctl_subject_type_name(int subject)
{
int i;
for (i = 0; subjectnames[i].d_name != NULL; i++) {
if (subjectnames[i].d_value == subject)
return (subjectnames[i].d_name);
}
panic("rctl_subject_type_name: unknown subject type %d", subject);
}
static const char *
rctl_action_name(int action)
{
int i;
for (i = 0; actionnames[i].d_name != NULL; i++) {
if (actionnames[i].d_value == action)
return (actionnames[i].d_name);
}
panic("rctl_action_name: unknown action %d", action);
}
const char *
rctl_resource_name(int resource)
{
int i;
for (i = 0; resourcenames[i].d_name != NULL; i++) {
if (resourcenames[i].d_value == resource)
return (resourcenames[i].d_name);
}
panic("rctl_resource_name: unknown resource %d", resource);
}
/*
* Return the amount of resource that can be allocated by 'p' before
* hitting 'rule'.
*/
static int64_t
rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
{
int resource;
int64_t available = INT64_MAX;
struct ucred *cred = p->p_ucred;
rw_assert(&rctl_lock, RA_LOCKED);
resource = rule->rr_resource;
switch (rule->rr_per) {
case RCTL_SUBJECT_TYPE_PROCESS:
available = rule->rr_amount -
p->p_racct->r_resources[resource];
break;
case RCTL_SUBJECT_TYPE_USER:
available = rule->rr_amount -
cred->cr_ruidinfo->ui_racct->r_resources[resource];
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
available = rule->rr_amount -
cred->cr_loginclass->lc_racct->r_resources[resource];
break;
case RCTL_SUBJECT_TYPE_JAIL:
available = rule->rr_amount -
cred->cr_prison->pr_prison_racct->prr_racct->
r_resources[resource];
break;
default:
panic("rctl_compute_available: unknown per %d",
rule->rr_per);
}
return (available);
}
/*
* Return non-zero if allocating 'amount' by proc 'p' would exceed
* resource limit specified by 'rule'.
*/
static int
rctl_would_exceed(const struct proc *p, const struct rctl_rule *rule,
int64_t amount)
{
int64_t available;
rw_assert(&rctl_lock, RA_LOCKED);
available = rctl_available_resource(p, rule);
if (available >= amount)
return (0);
return (1);
}
/*
* Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
* to what it currently has allocated. Returns non-zero if the allocation should
* be denied, 0 otherwise.
*/
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
struct rctl_rule *rule;
struct rctl_rule_link *link;
struct sbuf sb;
int should_deny = 0;
char *buf;
static int curtime = 0;
static struct timeval lasttime;
rw_rlock(&rctl_lock);
/*
* There may be more than one matching rule; go through all of them.
* Denial should be done last, after logging and sending signals.
*/
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
if (!rctl_would_exceed(p, rule, amount)) {
link->rrl_exceeded = 0;
continue;
}
switch (rule->rr_action) {
case RCTL_ACTION_DENY:
should_deny = 1;
continue;
case RCTL_ACTION_LOG:
/*
* If rrl_exceeded != 0, it means we've already
* logged a warning for this process.
*/
if (link->rrl_exceeded != 0)
continue;
if (!ppsratecheck(&lasttime, &curtime, 10))
continue;
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
if (buf == NULL) {
printf("rctl_enforce: out of memory\n");
continue;
}
sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
rctl_rule_to_sbuf(&sb, rule);
sbuf_finish(&sb);
printf("rctl: rule \"%s\" matched by pid %d "
"(%s), uid %d, jail %s\n", sbuf_data(&sb),
p->p_pid, p->p_comm, p->p_ucred->cr_uid,
p->p_ucred->cr_prison->pr_prison_racct->prr_name);
sbuf_delete(&sb);
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
case RCTL_ACTION_DEVCTL:
if (link->rrl_exceeded != 0)
continue;
buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
if (buf == NULL) {
printf("rctl_enforce: out of memory\n");
continue;
}
sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
sbuf_printf(&sb, "rule=");
rctl_rule_to_sbuf(&sb, rule);
sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
p->p_pid, p->p_ucred->cr_ruid,
p->p_ucred->cr_prison->pr_prison_racct->prr_name);
sbuf_finish(&sb);
devctl_notify_f("RCTL", "rule", "matched",
sbuf_data(&sb), M_NOWAIT);
sbuf_delete(&sb);
free(buf, M_RCTL);
link->rrl_exceeded = 1;
continue;
default:
if (link->rrl_exceeded != 0)
continue;
KASSERT(rule->rr_action > 0 &&
rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
("rctl_enforce: unknown action %d",
rule->rr_action));
/*
* We're supposed to send a signal, but the process
* is not fully initialized yet, probably because we
* got called from fork1(). For now just deny the
* allocation instead.
*/
if (p->p_state != PRS_NORMAL) {
should_deny = 1;
continue;
}
/*
* We're using the fact that RCTL_ACTION_SIG* values
* are equal to their counterparts from sys/signal.h.
*/
- psignal(p, rule->rr_action);
+ kern_psignal(p, rule->rr_action);
link->rrl_exceeded = 1;
continue;
}
}
rw_runlock(&rctl_lock);
if (should_deny) {
/*
* Return a fake error code; the caller should change it
* into one proper for the situation -- EFBIG, ENOMEM, etc.
*/
return (EDOOFUS);
}
return (0);
}
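/*
* Illustrative sketch, not part of this revision: how a hypothetical
* caller might consume rctl_enforce(), translating the EDOOFUS
* placeholder into an errno appropriate for the operation being
* limited. The RACCT_NOFILE / EMFILE pairing is only an example.
*/
#if 0	/* example only, never compiled */
static int
example_openfiles_check(struct proc *p)
{
if (rctl_enforce(p, RACCT_NOFILE, 1) != 0)
return (EMFILE);	/* pick an errno that fits the caller */
return (0);
}
#endif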
uint64_t
rctl_get_limit(struct proc *p, int resource)
{
struct rctl_rule *rule;
struct rctl_rule_link *link;
uint64_t amount = UINT64_MAX;
rw_rlock(&rctl_lock);
/*
* There may be more than one matching rule; go through all of them.
* Denial should be done last, after logging and sending signals.
*/
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
if (rule->rr_action != RCTL_ACTION_DENY)
continue;
if (rule->rr_amount < amount)
amount = rule->rr_amount;
}
rw_runlock(&rctl_lock);
return (amount);
}
uint64_t
rctl_get_available(struct proc *p, int resource)
{
struct rctl_rule *rule;
struct rctl_rule_link *link;
int64_t available, minavailable, allocated;
minavailable = INT64_MAX;
rw_rlock(&rctl_lock);
/*
* There may be more than one matching rule; go through all of them.
* Denial should be done last, after logging and sending signals.
*/
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
rule = link->rrl_rule;
if (rule->rr_resource != resource)
continue;
if (rule->rr_action != RCTL_ACTION_DENY)
continue;
available = rctl_available_resource(p, rule);
if (available < minavailable)
minavailable = available;
}
rw_runlock(&rctl_lock);
/*
* XXX: Think about this _hard_.
*/
allocated = p->p_racct->r_resources[resource];
if (minavailable < INT64_MAX - allocated)
minavailable += allocated;
if (minavailable < 0)
minavailable = 0;
return (minavailable);
}
static int
rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
{
if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
if (rule->rr_subject_type != filter->rr_subject_type)
return (0);
switch (filter->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
if (filter->rr_subject.rs_proc != NULL &&
rule->rr_subject.rs_proc !=
filter->rr_subject.rs_proc)
return (0);
break;
case RCTL_SUBJECT_TYPE_USER:
if (filter->rr_subject.rs_uip != NULL &&
rule->rr_subject.rs_uip !=
filter->rr_subject.rs_uip)
return (0);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (filter->rr_subject.rs_loginclass != NULL &&
rule->rr_subject.rs_loginclass !=
filter->rr_subject.rs_loginclass)
return (0);
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (filter->rr_subject.rs_prison_racct != NULL &&
rule->rr_subject.rs_prison_racct !=
filter->rr_subject.rs_prison_racct)
return (0);
break;
default:
panic("rctl_rule_matches: unknown subject type %d",
filter->rr_subject_type);
}
}
if (filter->rr_resource != RACCT_UNDEFINED) {
if (rule->rr_resource != filter->rr_resource)
return (0);
}
if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
if (rule->rr_action != filter->rr_action)
return (0);
}
if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
if (rule->rr_amount != filter->rr_amount)
return (0);
}
if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
if (rule->rr_per != filter->rr_per)
return (0);
}
return (1);
}
static int
str2value(const char *str, int *value, struct dict *table)
{
int i;
if (value == NULL)
return (EINVAL);
for (i = 0; table[i].d_name != NULL; i++) {
if (strcasecmp(table[i].d_name, str) == 0) {
*value = table[i].d_value;
return (0);
}
}
return (EINVAL);
}
static int
str2id(const char *str, id_t *value)
{
char *end;
if (str == NULL)
return (EINVAL);
*value = strtoul(str, &end, 10);
if ((size_t)(end - str) != strlen(str))
return (EINVAL);
return (0);
}
static int
str2int64(const char *str, int64_t *value)
{
char *end;
if (str == NULL)
return (EINVAL);
*value = strtoul(str, &end, 10);
if ((size_t)(end - str) != strlen(str))
return (EINVAL);
return (0);
}
/*
* Connect the rule to the racct, increasing refcount for the rule.
*/
static void
rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
{
struct rctl_rule_link *link;
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
rctl_rule_acquire(rule);
link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
link->rrl_rule = rule;
link->rrl_exceeded = 0;
rw_wlock(&rctl_lock);
LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
rw_wunlock(&rctl_lock);
}
static int
rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
{
struct rctl_rule_link *link;
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
rw_assert(&rctl_lock, RA_WLOCKED);
link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
if (link == NULL)
return (ENOMEM);
rctl_rule_acquire(rule);
link->rrl_rule = rule;
link->rrl_exceeded = 0;
LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
return (0);
}
/*
* Remove limits for all rules matching the filter and release
* the refcounts for the rules, possibly freeing them. Returns
* the number of limit structures removed.
*/
static int
rctl_racct_remove_rules(struct racct *racct,
const struct rctl_rule *filter)
{
int removed = 0;
struct rctl_rule_link *link, *linktmp;
rw_assert(&rctl_lock, RA_WLOCKED);
LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
if (!rctl_rule_matches(link->rrl_rule, filter))
continue;
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
removed++;
}
return (removed);
}
static void
rctl_rule_acquire_subject(struct rctl_rule *rule)
{
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
case RCTL_SUBJECT_TYPE_PROCESS:
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct != NULL)
prison_racct_hold(rule->rr_subject.rs_prison_racct);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip != NULL)
uihold(rule->rr_subject.rs_uip);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass != NULL)
loginclass_hold(rule->rr_subject.rs_loginclass);
break;
default:
panic("rctl_rule_acquire_subject: unknown subject type %d",
rule->rr_subject_type);
}
}
static void
rctl_rule_release_subject(struct rctl_rule *rule)
{
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
case RCTL_SUBJECT_TYPE_PROCESS:
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct != NULL)
prison_racct_free(rule->rr_subject.rs_prison_racct);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip != NULL)
uifree(rule->rr_subject.rs_uip);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass != NULL)
loginclass_free(rule->rr_subject.rs_loginclass);
break;
default:
panic("rctl_rule_release_subject: unknown subject type %d",
rule->rr_subject_type);
}
}
struct rctl_rule *
rctl_rule_alloc(int flags)
{
struct rctl_rule *rule;
rule = uma_zalloc(rctl_rule_zone, flags);
if (rule == NULL)
return (NULL);
rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
rule->rr_subject.rs_proc = NULL;
rule->rr_subject.rs_uip = NULL;
rule->rr_subject.rs_loginclass = NULL;
rule->rr_subject.rs_prison_racct = NULL;
rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
rule->rr_resource = RACCT_UNDEFINED;
rule->rr_action = RCTL_ACTION_UNDEFINED;
rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
refcount_init(&rule->rr_refcount, 1);
return (rule);
}
struct rctl_rule *
rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
{
struct rctl_rule *copy;
copy = uma_zalloc(rctl_rule_zone, flags);
if (copy == NULL)
return (NULL);
copy->rr_subject_type = rule->rr_subject_type;
copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
copy->rr_per = rule->rr_per;
copy->rr_resource = rule->rr_resource;
copy->rr_action = rule->rr_action;
copy->rr_amount = rule->rr_amount;
refcount_init(&copy->rr_refcount, 1);
rctl_rule_acquire_subject(copy);
return (copy);
}
void
rctl_rule_acquire(struct rctl_rule *rule)
{
KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
refcount_acquire(&rule->rr_refcount);
}
static void
rctl_rule_free(void *context, int pending)
{
struct rctl_rule *rule;
rule = (struct rctl_rule *)context;
KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
/*
* We don't need locking here; rule is guaranteed to be inaccessible.
*/
rctl_rule_release_subject(rule);
uma_zfree(rctl_rule_zone, rule);
}
void
rctl_rule_release(struct rctl_rule *rule)
{
KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
if (refcount_release(&rule->rr_refcount)) {
/*
* rctl_rule_release() is often called when iterating
* over all the uidinfo structures in the system,
* holding uihashtbl_lock. Since rctl_rule_free()
* might end up calling uifree(), this would lead
* to lock recursion. Use taskqueue to avoid this.
*/
TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
}
}
static int
rctl_rule_fully_specified(const struct rctl_rule *rule)
{
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
return (0);
case RCTL_SUBJECT_TYPE_PROCESS:
if (rule->rr_subject.rs_proc == NULL)
return (0);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip == NULL)
return (0);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass == NULL)
return (0);
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct == NULL)
return (0);
break;
default:
panic("rctl_rule_fully_specified: unknown subject type %d",
rule->rr_subject_type);
}
if (rule->rr_resource == RACCT_UNDEFINED)
return (0);
if (rule->rr_action == RCTL_ACTION_UNDEFINED)
return (0);
if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
return (0);
if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
return (0);
return (1);
}
static int
rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
{
int error = 0;
char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
*amountstr, *perstr;
struct rctl_rule *rule;
id_t id;
rule = rctl_rule_alloc(M_WAITOK);
subjectstr = strsep(&rulestr, ":");
subject_idstr = strsep(&rulestr, ":");
resourcestr = strsep(&rulestr, ":");
actionstr = strsep(&rulestr, "=/");
amountstr = strsep(&rulestr, "/");
perstr = rulestr;
if (subjectstr == NULL || subjectstr[0] == '\0')
rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
else {
error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
if (error != 0)
goto out;
}
if (subject_idstr == NULL || subject_idstr[0] == '\0') {
rule->rr_subject.rs_proc = NULL;
rule->rr_subject.rs_uip = NULL;
rule->rr_subject.rs_loginclass = NULL;
rule->rr_subject.rs_prison_racct = NULL;
} else {
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_UNDEFINED:
error = EINVAL;
goto out;
case RCTL_SUBJECT_TYPE_PROCESS:
error = str2id(subject_idstr, &id);
if (error != 0)
goto out;
sx_assert(&allproc_lock, SA_LOCKED);
rule->rr_subject.rs_proc = pfind(id);
if (rule->rr_subject.rs_proc == NULL) {
error = ESRCH;
goto out;
}
PROC_UNLOCK(rule->rr_subject.rs_proc);
break;
case RCTL_SUBJECT_TYPE_USER:
error = str2id(subject_idstr, &id);
if (error != 0)
goto out;
rule->rr_subject.rs_uip = uifind(id);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
rule->rr_subject.rs_loginclass =
loginclass_find(subject_idstr);
if (rule->rr_subject.rs_loginclass == NULL) {
error = ENAMETOOLONG;
goto out;
}
break;
case RCTL_SUBJECT_TYPE_JAIL:
rule->rr_subject.rs_prison_racct =
prison_racct_find(subject_idstr);
if (rule->rr_subject.rs_prison_racct == NULL) {
error = ENAMETOOLONG;
goto out;
}
break;
default:
panic("rctl_string_to_rule: unknown subject type %d",
rule->rr_subject_type);
}
}
if (resourcestr == NULL || resourcestr[0] == '\0')
rule->rr_resource = RACCT_UNDEFINED;
else {
error = str2value(resourcestr, &rule->rr_resource,
resourcenames);
if (error != 0)
goto out;
}
if (actionstr == NULL || actionstr[0] == '\0')
rule->rr_action = RCTL_ACTION_UNDEFINED;
else {
error = str2value(actionstr, &rule->rr_action, actionnames);
if (error != 0)
goto out;
}
if (amountstr == NULL || amountstr[0] == '\0')
rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
else {
error = str2int64(amountstr, &rule->rr_amount);
if (error != 0)
goto out;
if (RACCT_IS_IN_MILLIONS(rule->rr_resource))
rule->rr_amount *= 1000000;
}
if (perstr == NULL || perstr[0] == '\0')
rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
else {
error = str2value(perstr, &rule->rr_per, subjectnames);
if (error != 0)
goto out;
}
out:
if (error == 0)
*rulep = rule;
else
rctl_rule_release(rule);
return (error);
}
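/*
* For reference (illustrative, not part of this revision): the string
* parsed above has the form "subject:subject-id:resource:action=amount/per",
* for example:
*
*	user:1001:vmemoryuse:deny=1073741824/user
*	process:712:openfiles:deny=128
*	loginclass:users:maxproc:log=200/loginclass
*
* Empty components are left undefined, which is how partial filters for
* rctl_rule_remove() and the "get" syscalls below are expressed.
*/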
/*
* Link a rule with all the subjects it applies to.
*/
int
rctl_rule_add(struct rctl_rule *rule)
{
struct proc *p;
struct ucred *cred;
struct uidinfo *uip;
struct prison *pr;
struct prison_racct *prr;
struct loginclass *lc;
struct rctl_rule *rule2;
int match;
KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
/*
* Some rules just don't make sense. Note that the one below
* cannot be rewritten using RACCT_IS_DENIABLE(); RACCT_PCTCPU,
* for example, is not deniable in the racct sense, but its
* limit is enforced in a different way, so "deny" rules for %CPU
* do make sense.
*/
if (rule->rr_action == RCTL_ACTION_DENY &&
(rule->rr_resource == RACCT_CPU ||
rule->rr_resource == RACCT_WALLCLOCK))
return (EOPNOTSUPP);
if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
RACCT_IS_SLOPPY(rule->rr_resource))
return (EOPNOTSUPP);
/*
* Make sure there are no duplicated rules. Also, for the "deny"
* rules, remove ones differing only by "amount".
*/
if (rule->rr_action == RCTL_ACTION_DENY) {
rule2 = rctl_rule_duplicate(rule, M_WAITOK);
rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
rctl_rule_remove(rule2);
rctl_rule_release(rule2);
} else
rctl_rule_remove(rule);
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
p = rule->rr_subject.rs_proc;
KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
/*
* No resource limits for system processes.
*/
if (p->p_flag & P_SYSTEM)
return (EPERM);
rctl_racct_add_rule(p->p_racct, rule);
/*
* In case of per-process rule, we don't have anything more
* to do.
*/
return (0);
case RCTL_SUBJECT_TYPE_USER:
uip = rule->rr_subject.rs_uip;
KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
rctl_racct_add_rule(uip->ui_racct, rule);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
lc = rule->rr_subject.rs_loginclass;
KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
rctl_racct_add_rule(lc->lc_racct, rule);
break;
case RCTL_SUBJECT_TYPE_JAIL:
prr = rule->rr_subject.rs_prison_racct;
KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
rctl_racct_add_rule(prr->prr_racct, rule);
break;
default:
panic("rctl_rule_add: unknown subject type %d",
rule->rr_subject_type);
}
/*
* Now go through all the processes and add the new rule to the ones
* it applies to.
*/
sx_assert(&allproc_lock, SA_LOCKED);
FOREACH_PROC_IN_SYSTEM(p) {
if (p->p_flag & P_SYSTEM)
continue;
cred = p->p_ucred;
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_USER:
if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
cred->cr_ruidinfo == rule->rr_subject.rs_uip)
break;
continue;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
break;
continue;
case RCTL_SUBJECT_TYPE_JAIL:
match = 0;
for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
match = 1;
break;
}
}
if (match)
break;
continue;
default:
panic("rctl_rule_add: unknown subject type %d",
rule->rr_subject_type);
}
rctl_racct_add_rule(p->p_racct, rule);
}
return (0);
}
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
struct rctl_rule *filter = (struct rctl_rule *)arg2;
int found = 0;
rw_wlock(&rctl_lock);
found += rctl_racct_remove_rules(racct, filter);
rw_wunlock(&rctl_lock);
*((int *)arg3) += found;
}
/*
* Remove all rules that match the filter.
*/
int
rctl_rule_remove(struct rctl_rule *filter)
{
int found = 0;
struct proc *p;
if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
filter->rr_subject.rs_proc != NULL) {
p = filter->rr_subject.rs_proc;
rw_wlock(&rctl_lock);
found = rctl_racct_remove_rules(p->p_racct, filter);
rw_wunlock(&rctl_lock);
if (found)
return (0);
return (ESRCH);
}
loginclass_racct_foreach(rctl_rule_remove_callback, filter,
(void *)&found);
ui_racct_foreach(rctl_rule_remove_callback, filter,
(void *)&found);
prison_racct_foreach(rctl_rule_remove_callback, filter,
(void *)&found);
sx_assert(&allproc_lock, SA_LOCKED);
rw_wlock(&rctl_lock);
FOREACH_PROC_IN_SYSTEM(p) {
found += rctl_racct_remove_rules(p->p_racct, filter);
}
rw_wunlock(&rctl_lock);
if (found)
return (0);
return (ESRCH);
}
/*
* Appends a rule to the sbuf.
*/
static void
rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
{
int64_t amount;
sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
switch (rule->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
if (rule->rr_subject.rs_proc == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%d:",
rule->rr_subject.rs_proc->p_pid);
break;
case RCTL_SUBJECT_TYPE_USER:
if (rule->rr_subject.rs_uip == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%d:",
rule->rr_subject.rs_uip->ui_uid);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
if (rule->rr_subject.rs_loginclass == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%s:",
rule->rr_subject.rs_loginclass->lc_name);
break;
case RCTL_SUBJECT_TYPE_JAIL:
if (rule->rr_subject.rs_prison_racct == NULL)
sbuf_printf(sb, ":");
else
sbuf_printf(sb, "%s:",
rule->rr_subject.rs_prison_racct->prr_name);
break;
default:
panic("rctl_rule_to_sbuf: unknown subject type %d",
rule->rr_subject_type);
}
amount = rule->rr_amount;
if (amount != RCTL_AMOUNT_UNDEFINED &&
RACCT_IS_IN_MILLIONS(rule->rr_resource))
amount /= 1000000;
sbuf_printf(sb, "%s:%s=%jd",
rctl_resource_name(rule->rr_resource),
rctl_action_name(rule->rr_action),
amount);
if (rule->rr_per != rule->rr_subject_type)
sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
}
/*
* Routine used by RCTL syscalls to read in input string.
*/
static int
rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
{
int error;
char *str;
if (inbuflen <= 0)
return (EINVAL);
str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
error = copyinstr(inbufp, str, inbuflen, NULL);
if (error != 0) {
free(str, M_RCTL);
return (error);
}
*inputstr = str;
return (0);
}
/*
* Routine used by RCTL syscalls to write out output string.
*/
static int
rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
{
int error;
if (outputsbuf == NULL)
return (0);
sbuf_finish(outputsbuf);
if (outbuflen < sbuf_len(outputsbuf) + 1) {
sbuf_delete(outputsbuf);
return (ERANGE);
}
error = copyout(sbuf_data(outputsbuf), outbufp,
sbuf_len(outputsbuf) + 1);
sbuf_delete(outputsbuf);
return (error);
}
static struct sbuf *
rctl_racct_to_sbuf(struct racct *racct, int sloppy)
{
int i;
int64_t amount;
struct sbuf *sb;
sb = sbuf_new_auto();
for (i = 0; i <= RACCT_MAX; i++) {
if (sloppy == 0 && RACCT_IS_SLOPPY(i))
continue;
amount = racct->r_resources[i];
if (RACCT_IS_IN_MILLIONS(i))
amount /= 1000000;
sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
}
sbuf_setpos(sb, sbuf_len(sb) - 1);
return (sb);
}
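/*
* Illustrative output of the routine above (not from this revision):
* a single comma-separated line of "resource=amount" pairs, e.g.
*
*	cputime=0,datasize=2891776,stacksize=131072,coredumpsize=0,...
*
* with per-million resources scaled back down and the trailing comma
* dropped by the sbuf_setpos() call.
*/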
int
-rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{
int error;
char *inputstr;
struct rctl_rule *filter;
struct sbuf *outputsbuf = NULL;
struct proc *p;
struct uidinfo *uip;
struct loginclass *lc;
struct prison_racct *prr;
error = priv_check(td, PRIV_RCTL_GET_RACCT);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
switch (filter->rr_subject_type) {
case RCTL_SUBJECT_TYPE_PROCESS:
p = filter->rr_subject.rs_proc;
if (p == NULL) {
error = EINVAL;
goto out;
}
if (p->p_flag & P_SYSTEM) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
break;
case RCTL_SUBJECT_TYPE_USER:
uip = filter->rr_subject.rs_uip;
if (uip == NULL) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
break;
case RCTL_SUBJECT_TYPE_LOGINCLASS:
lc = filter->rr_subject.rs_loginclass;
if (lc == NULL) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
break;
case RCTL_SUBJECT_TYPE_JAIL:
prr = filter->rr_subject.rs_prison_racct;
if (prr == NULL) {
error = EINVAL;
goto out;
}
outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
break;
default:
error = EINVAL;
}
out:
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
if (error != 0)
return (error);
error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
return (error);
}
static void
rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
{
struct rctl_rule *filter = (struct rctl_rule *)arg2;
struct rctl_rule_link *link;
struct sbuf *sb = (struct sbuf *)arg3;
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
if (!rctl_rule_matches(link->rrl_rule, filter))
continue;
rctl_rule_to_sbuf(sb, link->rrl_rule);
sbuf_printf(sb, ",");
}
rw_runlock(&rctl_lock);
}
int
-rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
int error;
size_t bufsize = RCTL_DEFAULT_BUFSIZE;
char *inputstr, *buf;
struct sbuf *sb;
struct rctl_rule *filter;
struct rctl_rule_link *link;
struct proc *p;
error = priv_check(td, PRIV_RCTL_GET_RULES);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
again:
buf = malloc(bufsize, M_RCTL, M_WAITOK);
sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
KASSERT(sb != NULL, ("sbuf_new failed"));
sx_assert(&allproc_lock, SA_LOCKED);
FOREACH_PROC_IN_SYSTEM(p) {
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
/*
* Non-process rules will be added to the buffer later.
* Adding them here would result in duplicated output.
*/
if (link->rrl_rule->rr_subject_type !=
RCTL_SUBJECT_TYPE_PROCESS)
continue;
if (!rctl_rule_matches(link->rrl_rule, filter))
continue;
rctl_rule_to_sbuf(sb, link->rrl_rule);
sbuf_printf(sb, ",");
}
rw_runlock(&rctl_lock);
}
loginclass_racct_foreach(rctl_get_rules_callback, filter, sb);
ui_racct_foreach(rctl_get_rules_callback, filter, sb);
prison_racct_foreach(rctl_get_rules_callback, filter, sb);
if (sbuf_error(sb) == ENOMEM) {
sbuf_delete(sb);
free(buf, M_RCTL);
bufsize *= 4;
goto again;
}
/*
* Remove trailing ",".
*/
if (sbuf_len(sb) > 0)
sbuf_setpos(sb, sbuf_len(sb) - 1);
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
free(buf, M_RCTL);
return (error);
}
int
-rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{
int error;
size_t bufsize = RCTL_DEFAULT_BUFSIZE;
char *inputstr, *buf;
struct sbuf *sb;
struct rctl_rule *filter;
struct rctl_rule_link *link;
error = priv_check(td, PRIV_RCTL_GET_LIMITS);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (EINVAL);
}
if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (EOPNOTSUPP);
}
if (filter->rr_subject.rs_proc == NULL) {
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (EINVAL);
}
again:
buf = malloc(bufsize, M_RCTL, M_WAITOK);
sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
KASSERT(sb != NULL, ("sbuf_new failed"));
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
rrl_next) {
rctl_rule_to_sbuf(sb, link->rrl_rule);
sbuf_printf(sb, ",");
}
rw_runlock(&rctl_lock);
if (sbuf_error(sb) == ENOMEM) {
sbuf_delete(sb);
free(buf, M_RCTL);
bufsize *= 4;
goto again;
}
/*
* Remove trailing ",".
*/
if (sbuf_len(sb) > 0)
sbuf_setpos(sb, sbuf_len(sb) - 1);
error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
free(buf, M_RCTL);
return (error);
}
int
-rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
int error;
struct rctl_rule *rule;
char *inputstr;
error = priv_check(td, PRIV_RCTL_ADD_RULE);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &rule);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
/*
* The 'per' part of a rule is optional.
*/
if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
rule->rr_per = rule->rr_subject_type;
if (!rctl_rule_fully_specified(rule)) {
error = EINVAL;
goto out;
}
error = rctl_rule_add(rule);
out:
rctl_rule_release(rule);
sx_sunlock(&allproc_lock);
return (error);
}
int
-rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{
int error;
struct rctl_rule *filter;
char *inputstr;
error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
if (error != 0)
return (error);
error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
if (error != 0)
return (error);
sx_slock(&allproc_lock);
error = rctl_string_to_rule(inputstr, &filter);
free(inputstr, M_RCTL);
if (error != 0) {
sx_sunlock(&allproc_lock);
return (error);
}
error = rctl_rule_remove(filter);
rctl_rule_release(filter);
sx_sunlock(&allproc_lock);
return (error);
}
/*
* Update RCTL rule list after credential change.
*/
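/*
* The update follows a count / allocate / verify pattern: under the
* read lock, count how many rule links the process will need with the
* new credentials; drop the lock and allocate that many empty links
* with M_WAITOK; then retake the lock for writing and fill the links
* in, starting over from the count if the rule set changed meanwhile.
*/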
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
int rulecnt, i;
struct rctl_rule_link *link, *newlink;
struct uidinfo *newuip;
struct loginclass *newlc;
struct prison_racct *newprr;
LIST_HEAD(, rctl_rule_link) newrules;
newuip = newcred->cr_ruidinfo;
newlc = newcred->cr_loginclass;
newprr = newcred->cr_prison->pr_prison_racct;
LIST_INIT(&newrules);
again:
/*
* First, count the rules that apply to the process with new
* credentials.
*/
rulecnt = 0;
rw_rlock(&rctl_lock);
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
if (link->rrl_rule->rr_subject_type ==
RCTL_SUBJECT_TYPE_PROCESS)
rulecnt++;
}
LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
rulecnt++;
LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
rulecnt++;
LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
rulecnt++;
rw_runlock(&rctl_lock);
/*
* Create temporary list. We've dropped the rctl_lock in order
* to use M_WAITOK.
*/
for (i = 0; i < rulecnt; i++) {
newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
newlink->rrl_rule = NULL;
LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
}
newlink = LIST_FIRST(&newrules);
/*
* Assign rules to the newly allocated list entries.
*/
rw_wlock(&rctl_lock);
LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
if (link->rrl_rule->rr_subject_type ==
RCTL_SUBJECT_TYPE_PROCESS) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
}
LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
if (newlink == NULL)
goto goaround;
rctl_rule_acquire(link->rrl_rule);
newlink->rrl_rule = link->rrl_rule;
newlink = LIST_NEXT(newlink, rrl_next);
rulecnt--;
}
if (rulecnt == 0) {
/*
* Free the old rule list.
*/
while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
link = LIST_FIRST(&p->p_racct->r_rule_links);
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
}
/*
* Replace lists and we're done.
*
* XXX: Is there any way to switch list heads instead
* of iterating here?
*/
while (!LIST_EMPTY(&newrules)) {
newlink = LIST_FIRST(&newrules);
LIST_REMOVE(newlink, rrl_next);
LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
newlink, rrl_next);
}
rw_wunlock(&rctl_lock);
return;
}
goaround:
rw_wunlock(&rctl_lock);
/*
* Rule list changed while we were not holding the rctl_lock.
* Free the new list and try again.
*/
while (!LIST_EMPTY(&newrules)) {
newlink = LIST_FIRST(&newrules);
LIST_REMOVE(newlink, rrl_next);
if (newlink->rrl_rule != NULL)
rctl_rule_release(newlink->rrl_rule);
uma_zfree(rctl_rule_link_zone, newlink);
}
goto again;
}
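/*
 * Illustrative sketch (not part of this file): the shape of the retry loop in
 * rctl_proc_ucred_changed() above -- count under the lock, allocate with
 * M_WAITOK after dropping it, then refill under the lock and start over if
 * the rule list changed in the meantime.  count_items(), preallocate(),
 * fill_items(), commit() and free_preallocated() are all hypothetical
 * stand-ins for the rule-link walks and list handling.
 */
static void
count_alloc_retry_sketch(void)
{
        int cnt;

        for (;;) {
                rw_rlock(&rctl_lock);
                cnt = count_items();            /* pass 1: count under a read lock */
                rw_runlock(&rctl_lock);

                preallocate(cnt);               /* may sleep; the lock is dropped */

                rw_wlock(&rctl_lock);
                if (fill_items() == cnt) {      /* pass 2: refill, recounting */
                        commit();               /* counts match: swap the lists */
                        rw_wunlock(&rctl_lock);
                        return;
                }
                rw_wunlock(&rctl_lock);
                free_preallocated();            /* list changed meanwhile: retry */
        }
}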
/*
* Assign RCTL rules to the newly created process.
*/
int
rctl_proc_fork(struct proc *parent, struct proc *child)
{
int error;
struct rctl_rule_link *link;
struct rctl_rule *rule;
LIST_INIT(&child->p_racct->r_rule_links);
/*
* No limits for kernel processes.
*/
if (child->p_flag & P_SYSTEM)
return (0);
/*
* Nothing to inherit from P_SYSTEM parents.
*/
if (parent->p_racct == NULL) {
KASSERT(parent->p_flag & P_SYSTEM,
("non-system process without racct; p = %p", parent));
return (0);
}
rw_wlock(&rctl_lock);
/*
* Go through limits applicable to the parent and assign them
* to the child. Rules with 'process' subject have to be duplicated
* in order to make their rr_subject point to the new process.
*/
LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
if (link->rrl_rule->rr_subject_type ==
RCTL_SUBJECT_TYPE_PROCESS) {
rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
if (rule == NULL)
goto fail;
KASSERT(rule->rr_subject.rs_proc == parent,
("rule->rr_subject.rs_proc != parent"));
rule->rr_subject.rs_proc = child;
error = rctl_racct_add_rule_locked(child->p_racct,
rule);
rctl_rule_release(rule);
if (error != 0)
goto fail;
} else {
error = rctl_racct_add_rule_locked(child->p_racct,
link->rrl_rule);
if (error != 0)
goto fail;
}
}
rw_wunlock(&rctl_lock);
return (0);
fail:
while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
link = LIST_FIRST(&child->p_racct->r_rule_links);
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
}
rw_wunlock(&rctl_lock);
return (EAGAIN);
}
/*
* Release rules attached to the racct.
*/
void
rctl_racct_release(struct racct *racct)
{
struct rctl_rule_link *link;
rw_wlock(&rctl_lock);
while (!LIST_EMPTY(&racct->r_rule_links)) {
link = LIST_FIRST(&racct->r_rule_links);
LIST_REMOVE(link, rrl_next);
rctl_rule_release(link->rrl_rule);
uma_zfree(rctl_rule_link_zone, link);
}
rw_wunlock(&rctl_lock);
}
static void
rctl_init(void)
{
rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
#else /* !RCTL */
int
-rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
+sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{
return (ENOSYS);
}
int
-rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
+sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{
return (ENOSYS);
}
int
-rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
+sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{
return (ENOSYS);
}
int
-rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
+sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{
return (ENOSYS);
}
int
-rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
+sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{
return (ENOSYS);
}
#endif /* !RCTL */
Index: head/sys/kern/kern_resource.c
===================================================================
--- head/sys/kern/kern_resource.c (revision 225616)
+++ head/sys/kern/kern_resource.c (revision 225617)
@@ -1,1415 +1,1415 @@
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/time.h>
#include <sys/umtx.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
static MALLOC_DEFINE(M_PLIMIT, "plimit", "plimit structures");
static MALLOC_DEFINE(M_UIDINFO, "uidinfo", "uidinfo structures");
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
static struct rwlock uihashtbl_lock;
static LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash; /* size of hash table - 1 */
static void calcru1(struct proc *p, struct rusage_ext *ruxp,
struct timeval *up, struct timeval *sp);
static int donice(struct thread *td, struct proc *chgp, int n);
static struct uidinfo *uilookup(uid_t uid);
static void ruxagg_locked(struct rusage_ext *rux, struct thread *td);
/*
* Resource controls and accounting.
*/
#ifndef _SYS_SYSPROTO_H_
struct getpriority_args {
int which;
int who;
};
#endif
int
-getpriority(td, uap)
+sys_getpriority(td, uap)
struct thread *td;
register struct getpriority_args *uap;
{
struct proc *p;
struct pgrp *pg;
int error, low;
error = 0;
low = PRIO_MAX + 1;
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0)
low = td->td_proc->p_nice;
else {
p = pfind(uap->who);
if (p == NULL)
break;
if (p_cansee(td, p) == 0)
low = p->p_nice;
PROC_UNLOCK(p);
}
break;
case PRIO_PGRP:
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = td->td_proc->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p_cansee(td, p) == 0) {
if (p->p_nice < low)
low = p->p_nice;
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p_cansee(td, p) == 0 &&
p->p_ucred->cr_uid == uap->who) {
if (p->p_nice < low)
low = p->p_nice;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (low == PRIO_MAX + 1 && error == 0)
error = ESRCH;
td->td_retval[0] = low;
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct setpriority_args {
int which;
int who;
int prio;
};
#endif
int
-setpriority(td, uap)
+sys_setpriority(td, uap)
struct thread *td;
struct setpriority_args *uap;
{
struct proc *curp, *p;
struct pgrp *pg;
int found = 0, error = 0;
curp = td->td_proc;
switch (uap->which) {
case PRIO_PROCESS:
if (uap->who == 0) {
PROC_LOCK(curp);
error = donice(td, curp, uap->prio);
PROC_UNLOCK(curp);
} else {
p = pfind(uap->who);
if (p == NULL)
break;
error = p_cansee(td, p);
if (error == 0)
error = donice(td, p, uap->prio);
PROC_UNLOCK(p);
}
found++;
break;
case PRIO_PGRP:
sx_slock(&proctree_lock);
if (uap->who == 0) {
pg = curp->p_pgrp;
PGRP_LOCK(pg);
} else {
pg = pgfind(uap->who);
if (pg == NULL) {
sx_sunlock(&proctree_lock);
break;
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p_cansee(td, p) == 0) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pg);
break;
case PRIO_USER:
if (uap->who == 0)
uap->who = td->td_ucred->cr_uid;
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
p->p_ucred->cr_uid == uap->who &&
p_cansee(td, p) == 0) {
error = donice(td, p, uap->prio);
found++;
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
break;
default:
error = EINVAL;
break;
}
if (found == 0 && error == 0)
error = ESRCH;
return (error);
}
/*
* Set "nice" for a (whole) process.
*/
static int
donice(struct thread *td, struct proc *p, int n)
{
int error;
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((error = p_cansched(td, p)))
return (error);
if (n > PRIO_MAX)
n = PRIO_MAX;
if (n < PRIO_MIN)
n = PRIO_MIN;
if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
return (EACCES);
sched_nice(p, n);
return (0);
}
/*
* Set realtime priority for LWP.
*/
#ifndef _SYS_SYSPROTO_H_
struct rtprio_thread_args {
int function;
lwpid_t lwpid;
struct rtprio *rtp;
};
#endif
int
-rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
+sys_rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
{
struct proc *p;
struct rtprio rtp;
struct thread *td1;
int cierror, error;
/* Perform copyin before acquiring locks if needed. */
if (uap->function == RTP_SET)
cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
else
cierror = 0;
if (uap->lwpid == 0 || uap->lwpid == td->td_tid) {
p = td->td_proc;
td1 = td;
PROC_LOCK(p);
} else {
/* Only look up thread in current process */
td1 = tdfind(uap->lwpid, curproc->p_pid);
if (td1 == NULL)
return (ESRCH);
p = td1->td_proc;
}
switch (uap->function) {
case RTP_LOOKUP:
if ((error = p_cansee(td, p)))
break;
pri_to_rtp(td1, &rtp);
PROC_UNLOCK(p);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
if ((error = p_cansched(td, p)) || (error = cierror))
break;
/* Disallow setting rtprio in most cases if not superuser. */
/*
* Realtime priority has to be restricted for reasons which should be
* obvious. However, for idle priority, there is a potential for
* system deadlock if an idleprio process gains a lock on a resource
* that other processes need (and the idleprio process can't run
* due to a CPU-bound normal process). Fix me! XXX
*/
#if 0
if (RTP_PRIO_IS_REALTIME(rtp.type)) {
#else
if (rtp.type != RTP_PRIO_NORMAL) {
#endif
error = priv_check(td, PRIV_SCHED_RTPRIO);
if (error)
break;
}
error = rtp_to_pri(&rtp, td1);
break;
default:
error = EINVAL;
break;
}
PROC_UNLOCK(p);
return (error);
}
/*
* Set realtime priority.
*/
#ifndef _SYS_SYSPROTO_H_
struct rtprio_args {
int function;
pid_t pid;
struct rtprio *rtp;
};
#endif
int
-rtprio(td, uap)
+sys_rtprio(td, uap)
struct thread *td; /* curthread */
register struct rtprio_args *uap;
{
struct proc *p;
struct thread *tdp;
struct rtprio rtp;
int cierror, error;
/* Perform copyin before acquiring locks if needed. */
if (uap->function == RTP_SET)
cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
else
cierror = 0;
if (uap->pid == 0) {
p = td->td_proc;
PROC_LOCK(p);
} else {
p = pfind(uap->pid);
if (p == NULL)
return (ESRCH);
}
switch (uap->function) {
case RTP_LOOKUP:
if ((error = p_cansee(td, p)))
break;
/*
* Return OUR priority if no pid specified,
* or if one is, report the highest priority
* in the process. There isn't much more you can do as
* there is only room to return a single priority.
* Note: specifying our own pid is not the same
* as leaving it zero.
*/
if (uap->pid == 0) {
pri_to_rtp(td, &rtp);
} else {
struct rtprio rtp2;
rtp.type = RTP_PRIO_IDLE;
rtp.prio = RTP_PRIO_MAX;
FOREACH_THREAD_IN_PROC(p, tdp) {
pri_to_rtp(tdp, &rtp2);
if (rtp2.type < rtp.type ||
(rtp2.type == rtp.type &&
rtp2.prio < rtp.prio)) {
rtp.type = rtp2.type;
rtp.prio = rtp2.prio;
}
}
}
PROC_UNLOCK(p);
return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
case RTP_SET:
if ((error = p_cansched(td, p)) || (error = cierror))
break;
/* Disallow setting rtprio in most cases if not superuser. */
/*
* Realtime priority has to be restricted for reasons which should be
* obvious. However, for idle priority, there is a potential for
* system deadlock if an idleprio process gains a lock on a resource
* that other processes need (and the idleprio process can't run
* due to a CPU-bound normal process). Fix me! XXX
*/
#if 0
if (RTP_PRIO_IS_REALTIME(rtp.type)) {
#else
if (rtp.type != RTP_PRIO_NORMAL) {
#endif
error = priv_check(td, PRIV_SCHED_RTPRIO);
if (error)
break;
}
/*
* If we are setting our own priority, set just our
* thread; if we are acting on another process,
* do all the threads in that process. If we
* specify our own pid, we do the latter.
*/
if (uap->pid == 0) {
error = rtp_to_pri(&rtp, td);
} else {
FOREACH_THREAD_IN_PROC(p, td) {
if ((error = rtp_to_pri(&rtp, td)) != 0)
break;
}
}
break;
default:
error = EINVAL;
break;
}
PROC_UNLOCK(p);
return (error);
}
int
rtp_to_pri(struct rtprio *rtp, struct thread *td)
{
u_char newpri;
u_char oldpri;
switch (RTP_PRIO_BASE(rtp->type)) {
case RTP_PRIO_REALTIME:
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
newpri = PRI_MIN_REALTIME + rtp->prio;
break;
case RTP_PRIO_NORMAL:
if (rtp->prio > (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE))
return (EINVAL);
newpri = PRI_MIN_TIMESHARE + rtp->prio;
break;
case RTP_PRIO_IDLE:
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
newpri = PRI_MIN_IDLE + rtp->prio;
break;
default:
return (EINVAL);
}
thread_lock(td);
sched_class(td, rtp->type); /* XXX fix */
oldpri = td->td_user_pri;
sched_user_prio(td, newpri);
if (curthread == td)
sched_prio(curthread, td->td_user_pri); /* XXX dubious */
if (TD_ON_UPILOCK(td) && oldpri != newpri) {
critical_enter();
thread_unlock(td);
umtx_pi_adjust(td, oldpri);
critical_exit();
} else
thread_unlock(td);
return (0);
}
void
pri_to_rtp(struct thread *td, struct rtprio *rtp)
{
thread_lock(td);
switch (PRI_BASE(td->td_pri_class)) {
case PRI_REALTIME:
rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
break;
case PRI_TIMESHARE:
rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
break;
case PRI_IDLE:
rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
break;
default:
break;
}
rtp->type = td->td_pri_class;
thread_unlock(td);
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct osetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
int
osetrlimit(td, uap)
struct thread *td;
register struct osetrlimit_args *uap;
{
struct orlimit olim;
struct rlimit lim;
int error;
if ((error = copyin(uap->rlp, &olim, sizeof(struct orlimit))))
return (error);
lim.rlim_cur = olim.rlim_cur;
lim.rlim_max = olim.rlim_max;
error = kern_setrlimit(td, uap->which, &lim);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ogetrlimit_args {
u_int which;
struct orlimit *rlp;
};
#endif
int
ogetrlimit(td, uap)
struct thread *td;
register struct ogetrlimit_args *uap;
{
struct orlimit olim;
struct rlimit rl;
struct proc *p;
int error;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
p = td->td_proc;
PROC_LOCK(p);
lim_rlimit(p, uap->which, &rl);
PROC_UNLOCK(p);
/*
* XXX would be more correct to convert only RLIM_INFINITY to the
* old RLIM_INFINITY and fail with EOVERFLOW for other larger
* values. Most 64->32 and 32->16 conversions, including not
* unimportant ones of uids are even more broken than what we
* do here (they blindly truncate). We don't do this correctly
* here since we have little experience with EOVERFLOW yet.
* Elsewhere, getuid() can't fail...
*/
olim.rlim_cur = rl.rlim_cur > 0x7fffffff ? 0x7fffffff : rl.rlim_cur;
olim.rlim_max = rl.rlim_max > 0x7fffffff ? 0x7fffffff : rl.rlim_max;
error = copyout(&olim, uap->rlp, sizeof(olim));
return (error);
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct __setrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
int
-setrlimit(td, uap)
+sys_setrlimit(td, uap)
struct thread *td;
register struct __setrlimit_args *uap;
{
struct rlimit alim;
int error;
if ((error = copyin(uap->rlp, &alim, sizeof(struct rlimit))))
return (error);
error = kern_setrlimit(td, uap->which, &alim);
return (error);
}
static void
lim_cb(void *arg)
{
struct rlimit rlim;
struct thread *td;
struct proc *p;
p = arg;
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Check if the process exceeds its cpu resource allocation. If
* it reaches the max, arrange to kill the process in ast().
*/
if (p->p_cpulimit == RLIM_INFINITY)
return;
PROC_SLOCK(p);
FOREACH_THREAD_IN_PROC(p, td) {
ruxagg(p, td);
}
PROC_SUNLOCK(p);
if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {
lim_rlimit(p, RLIMIT_CPU, &rlim);
if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
killproc(p, "exceeded maximum CPU limit");
} else {
if (p->p_cpulimit < rlim.rlim_max)
p->p_cpulimit += 5;
- psignal(p, SIGXCPU);
+ kern_psignal(p, SIGXCPU);
}
}
if ((p->p_flag & P_WEXIT) == 0)
callout_reset(&p->p_limco, hz, lim_cb, p);
}
int
kern_setrlimit(td, which, limp)
struct thread *td;
u_int which;
struct rlimit *limp;
{
struct plimit *newlim, *oldlim;
struct proc *p;
register struct rlimit *alimp;
struct rlimit oldssiz;
int error;
if (which >= RLIM_NLIMITS)
return (EINVAL);
/*
* Preserve historical bugs by treating negative limits as unsigned.
*/
if (limp->rlim_cur < 0)
limp->rlim_cur = RLIM_INFINITY;
if (limp->rlim_max < 0)
limp->rlim_max = RLIM_INFINITY;
oldssiz.rlim_cur = 0;
p = td->td_proc;
newlim = lim_alloc();
PROC_LOCK(p);
oldlim = p->p_limit;
alimp = &oldlim->pl_rlimit[which];
if (limp->rlim_cur > alimp->rlim_max ||
limp->rlim_max > alimp->rlim_max)
if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
PROC_UNLOCK(p);
lim_free(newlim);
return (error);
}
if (limp->rlim_cur > limp->rlim_max)
limp->rlim_cur = limp->rlim_max;
lim_copy(newlim, oldlim);
alimp = &newlim->pl_rlimit[which];
switch (which) {
case RLIMIT_CPU:
if (limp->rlim_cur != RLIM_INFINITY &&
p->p_cpulimit == RLIM_INFINITY)
callout_reset(&p->p_limco, hz, lim_cb, p);
p->p_cpulimit = limp->rlim_cur;
break;
case RLIMIT_DATA:
if (limp->rlim_cur > maxdsiz)
limp->rlim_cur = maxdsiz;
if (limp->rlim_max > maxdsiz)
limp->rlim_max = maxdsiz;
break;
case RLIMIT_STACK:
if (limp->rlim_cur > maxssiz)
limp->rlim_cur = maxssiz;
if (limp->rlim_max > maxssiz)
limp->rlim_max = maxssiz;
oldssiz = *alimp;
if (p->p_sysent->sv_fixlimit != NULL)
p->p_sysent->sv_fixlimit(&oldssiz,
RLIMIT_STACK);
break;
case RLIMIT_NOFILE:
if (limp->rlim_cur > maxfilesperproc)
limp->rlim_cur = maxfilesperproc;
if (limp->rlim_max > maxfilesperproc)
limp->rlim_max = maxfilesperproc;
break;
case RLIMIT_NPROC:
if (limp->rlim_cur > maxprocperuid)
limp->rlim_cur = maxprocperuid;
if (limp->rlim_max > maxprocperuid)
limp->rlim_max = maxprocperuid;
if (limp->rlim_cur < 1)
limp->rlim_cur = 1;
if (limp->rlim_max < 1)
limp->rlim_max = 1;
break;
}
if (p->p_sysent->sv_fixlimit != NULL)
p->p_sysent->sv_fixlimit(limp, which);
*alimp = *limp;
p->p_limit = newlim;
PROC_UNLOCK(p);
lim_free(oldlim);
if (which == RLIMIT_STACK) {
/*
* Stack is allocated to the max at exec time with only
* "rlim_cur" bytes accessible. If the stack limit is going
* up, make more of it accessible; if going down, make it inaccessible.
*/
if (limp->rlim_cur != oldssiz.rlim_cur) {
vm_offset_t addr;
vm_size_t size;
vm_prot_t prot;
if (limp->rlim_cur > oldssiz.rlim_cur) {
prot = p->p_sysent->sv_stackprot;
size = limp->rlim_cur - oldssiz.rlim_cur;
addr = p->p_sysent->sv_usrstack -
limp->rlim_cur;
} else {
prot = VM_PROT_NONE;
size = oldssiz.rlim_cur - limp->rlim_cur;
addr = p->p_sysent->sv_usrstack -
oldssiz.rlim_cur;
}
addr = trunc_page(addr);
size = round_page(size);
(void)vm_map_protect(&p->p_vmspace->vm_map,
addr, addr + size, prot, FALSE);
}
}
return (0);
}
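/*
 * Illustrative sketch (not part of this file): how the RLIMIT_STACK case
 * above turns a change of the soft limit into a protection change on the
 * stack region.  'usrstack' stands in for p->p_sysent->sv_usrstack and
 * 'stackprot' for p->p_sysent->sv_stackprot; the function name is
 * hypothetical and the caller is assumed to have checked oldcur != newcur.
 */
static __inline void
stack_limit_adjust_sketch(vm_offset_t usrstack, rlim_t oldcur, rlim_t newcur,
    vm_prot_t stackprot, vm_offset_t *addr, vm_size_t *size, vm_prot_t *prot)
{

        if (newcur > oldcur) {
                /* Limit raised: expose the newly usable pages. */
                *prot = stackprot;
                *size = newcur - oldcur;
                *addr = usrstack - newcur;
        } else {
                /* Limit lowered: take the now-forbidden pages away. */
                *prot = VM_PROT_NONE;
                *size = oldcur - newcur;
                *addr = usrstack - oldcur;
        }
        *addr = trunc_page(*addr);
        *size = round_page(*size);
}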
#ifndef _SYS_SYSPROTO_H_
struct __getrlimit_args {
u_int which;
struct rlimit *rlp;
};
#endif
/* ARGSUSED */
int
-getrlimit(td, uap)
+sys_getrlimit(td, uap)
struct thread *td;
register struct __getrlimit_args *uap;
{
struct rlimit rlim;
struct proc *p;
int error;
if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
p = td->td_proc;
PROC_LOCK(p);
lim_rlimit(p, uap->which, &rlim);
PROC_UNLOCK(p);
error = copyout(&rlim, uap->rlp, sizeof(struct rlimit));
return (error);
}
/*
* Transform the running time and tick information for children of proc p
* into user and system time usage.
*/
void
calccru(p, up, sp)
struct proc *p;
struct timeval *up;
struct timeval *sp;
{
PROC_LOCK_ASSERT(p, MA_OWNED);
calcru1(p, &p->p_crux, up, sp);
}
/*
* Transform the running time and tick information in proc p into user
* and system time usage. If appropriate, include the current time slice
* on this CPU.
*/
void
calcru(struct proc *p, struct timeval *up, struct timeval *sp)
{
struct thread *td;
uint64_t runtime, u;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
/*
* If we are getting stats for the current process, then add in the
* stats that this thread has accumulated in its current time slice.
* We reset the thread and CPU state as if we had performed a context
* switch right here.
*/
td = curthread;
if (td->td_proc == p) {
u = cpu_ticks();
runtime = u - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, u);
}
/* Make sure the per-thread stats are current. */
FOREACH_THREAD_IN_PROC(p, td) {
if (td->td_incruntime == 0)
continue;
ruxagg(p, td);
}
calcru1(p, &p->p_rux, up, sp);
}
/* Collect resource usage for a single thread. */
void
rufetchtd(struct thread *td, struct rusage *ru)
{
struct proc *p;
uint64_t runtime, u;
p = td->td_proc;
PROC_SLOCK_ASSERT(p, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_OWNED);
/*
* If we are getting stats for the current thread, then add in the
* stats that this thread has accumulated in its current time slice.
* We reset the thread and CPU state as if we had performed a context
* switch right here.
*/
if (td == curthread) {
u = cpu_ticks();
runtime = u - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, u);
}
ruxagg(p, td);
*ru = td->td_ru;
calcru1(p, &td->td_rux, &ru->ru_utime, &ru->ru_stime);
}
static void
calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
struct timeval *sp)
{
/* {user, system, interrupt, total} {ticks, usec}: */
uint64_t ut, uu, st, su, it, tt, tu;
ut = ruxp->rux_uticks;
st = ruxp->rux_sticks;
it = ruxp->rux_iticks;
tt = ut + st + it;
if (tt == 0) {
/* Avoid divide by zero */
st = 1;
tt = 1;
}
tu = cputick2usec(ruxp->rux_runtime);
if ((int64_t)tu < 0) {
/* XXX: this should be an assert /phk */
printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
(intmax_t)tu, p->p_pid, p->p_comm);
tu = ruxp->rux_tu;
}
if (tu >= ruxp->rux_tu) {
/*
* The normal case, time increased.
* Enforce monotonicity of bucketed numbers.
*/
uu = (tu * ut) / tt;
if (uu < ruxp->rux_uu)
uu = ruxp->rux_uu;
su = (tu * st) / tt;
if (su < ruxp->rux_su)
su = ruxp->rux_su;
} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
/*
* When we calibrate the cputicker, it is not uncommon to
* see the presumably fixed frequency increase slightly over
* time as a result of thermal stabilization and NTP
* discipline (of the reference clock). We therefore ignore
* a bit of backwards slop because we expect to catch up
* shortly. We use a 3 microsecond limit to catch low
* counts and a 1% limit for high counts.
*/
uu = ruxp->rux_uu;
su = ruxp->rux_su;
tu = ruxp->rux_tu;
} else { /* tu < ruxp->rux_tu */
/*
* What happened here was likely that a laptop, which ran at
* a reduced clock frequency at boot, kicked into high gear.
* The wisdom of spamming this message in that case is
* dubious, but it might also be indicative of something
* serious, so let's keep it and hope laptops can be made
* more truthful about their CPU speed via ACPI.
*/
printf("calcru: runtime went backwards from %ju usec "
"to %ju usec for pid %d (%s)\n",
(uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
p->p_pid, p->p_comm);
uu = (tu * ut) / tt;
su = (tu * st) / tt;
}
ruxp->rux_uu = uu;
ruxp->rux_su = su;
ruxp->rux_tu = tu;
up->tv_sec = uu / 1000000;
up->tv_usec = uu % 1000000;
sp->tv_sec = su / 1000000;
sp->tv_usec = su % 1000000;
}
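/*
 * Illustrative sketch (not part of this file): the acceptance rule calcru1()
 * applies above to a new total-runtime sample 'tu' against the previously
 * reported value 'prev_tu', pulled out into a tiny pure helper.  The 3 us and
 * 1% tolerances are the ones described in the comment; the function name is
 * hypothetical.
 */
static __inline uint64_t
calcru_accept_sketch(uint64_t tu, uint64_t prev_tu)
{

        if (tu >= prev_tu)
                return (tu);            /* normal case: time moved forward */
        if (tu + 3 > prev_tu || 101 * tu > 100 * prev_tu)
                return (prev_tu);       /* small backwards slop: keep old value */
        return (tu);                    /* large step backwards: warn and accept */
}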
#ifndef _SYS_SYSPROTO_H_
struct getrusage_args {
int who;
struct rusage *rusage;
};
#endif
int
-getrusage(td, uap)
+sys_getrusage(td, uap)
register struct thread *td;
register struct getrusage_args *uap;
{
struct rusage ru;
int error;
error = kern_getrusage(td, uap->who, &ru);
if (error == 0)
error = copyout(&ru, uap->rusage, sizeof(struct rusage));
return (error);
}
int
kern_getrusage(struct thread *td, int who, struct rusage *rup)
{
struct proc *p;
int error;
error = 0;
p = td->td_proc;
PROC_LOCK(p);
switch (who) {
case RUSAGE_SELF:
rufetchcalc(p, rup, &rup->ru_utime,
&rup->ru_stime);
break;
case RUSAGE_CHILDREN:
*rup = p->p_stats->p_cru;
calccru(p, &rup->ru_utime, &rup->ru_stime);
break;
case RUSAGE_THREAD:
PROC_SLOCK(p);
thread_lock(td);
rufetchtd(td, rup);
thread_unlock(td);
PROC_SUNLOCK(p);
break;
default:
error = EINVAL;
}
PROC_UNLOCK(p);
return (error);
}
void
rucollect(struct rusage *ru, struct rusage *ru2)
{
long *ip, *ip2;
int i;
if (ru->ru_maxrss < ru2->ru_maxrss)
ru->ru_maxrss = ru2->ru_maxrss;
ip = &ru->ru_first;
ip2 = &ru2->ru_first;
for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--)
*ip++ += *ip2++;
}
void
ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
struct rusage_ext *rux2)
{
rux->rux_runtime += rux2->rux_runtime;
rux->rux_uticks += rux2->rux_uticks;
rux->rux_sticks += rux2->rux_sticks;
rux->rux_iticks += rux2->rux_iticks;
rux->rux_uu += rux2->rux_uu;
rux->rux_su += rux2->rux_su;
rux->rux_tu += rux2->rux_tu;
rucollect(ru, ru2);
}
/*
* Aggregate tick counts into the proc's rusage_ext.
*/
static void
ruxagg_locked(struct rusage_ext *rux, struct thread *td)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
rux->rux_runtime += td->td_incruntime;
rux->rux_uticks += td->td_uticks;
rux->rux_sticks += td->td_sticks;
rux->rux_iticks += td->td_iticks;
}
void
ruxagg(struct proc *p, struct thread *td)
{
thread_lock(td);
ruxagg_locked(&p->p_rux, td);
ruxagg_locked(&td->td_rux, td);
td->td_incruntime = 0;
td->td_uticks = 0;
td->td_iticks = 0;
td->td_sticks = 0;
thread_unlock(td);
}
/*
* Update the rusage_ext structure and fetch a valid aggregate rusage
* for proc p if storage for one is supplied.
*/
void
rufetch(struct proc *p, struct rusage *ru)
{
struct thread *td;
PROC_SLOCK_ASSERT(p, MA_OWNED);
*ru = p->p_ru;
if (p->p_numthreads > 0) {
FOREACH_THREAD_IN_PROC(p, td) {
ruxagg(p, td);
rucollect(ru, &td->td_ru);
}
}
}
/*
* Atomically perform a rufetch and a calcru together.
* Consumers can safely assume that calcru is executed only once
* rufetch has completed.
*/
void
rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
struct timeval *sp)
{
PROC_SLOCK(p);
rufetch(p, ru);
calcru(p, up, sp);
PROC_SUNLOCK(p);
}
/*
* Allocate a new resource limits structure and initialize its
* reference count and mutex pointer.
*/
struct plimit *
lim_alloc()
{
struct plimit *limp;
limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
refcount_init(&limp->pl_refcnt, 1);
return (limp);
}
struct plimit *
lim_hold(limp)
struct plimit *limp;
{
refcount_acquire(&limp->pl_refcnt);
return (limp);
}
void
lim_fork(struct proc *p1, struct proc *p2)
{
p2->p_limit = lim_hold(p1->p_limit);
callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
if (p1->p_cpulimit != RLIM_INFINITY)
callout_reset(&p2->p_limco, hz, lim_cb, p2);
}
void
lim_free(limp)
struct plimit *limp;
{
KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
if (refcount_release(&limp->pl_refcnt))
free((void *)limp, M_PLIMIT);
}
/*
* Make a copy of the plimit structure.
* We share these structures copy-on-write after fork.
*/
void
lim_copy(dst, src)
struct plimit *dst, *src;
{
KASSERT(dst->pl_refcnt == 1, ("lim_copy to shared limit"));
bcopy(src->pl_rlimit, dst->pl_rlimit, sizeof(src->pl_rlimit));
}
/*
* Return the hard limit for a particular system resource. The
* which parameter specifies the index into the rlimit array.
*/
rlim_t
lim_max(struct proc *p, int which)
{
struct rlimit rl;
lim_rlimit(p, which, &rl);
return (rl.rlim_max);
}
/*
* Return the current (soft) limit for a particular system resource.
* The which parameter specifies the index into the rlimit array.
*/
rlim_t
lim_cur(struct proc *p, int which)
{
struct rlimit rl;
lim_rlimit(p, which, &rl);
return (rl.rlim_cur);
}
/*
* Return a copy of the entire rlimit structure for the system limit
* specified by 'which' in the rlimit structure pointed to by 'rlp'.
*/
void
lim_rlimit(struct proc *p, int which, struct rlimit *rlp)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(which >= 0 && which < RLIM_NLIMITS,
("request for invalid resource limit"));
*rlp = p->p_limit->pl_rlimit[which];
if (p->p_sysent->sv_fixlimit != NULL)
p->p_sysent->sv_fixlimit(rlp, which);
}
void
uihashinit()
{
uihashtbl = hashinit(maxproc / 16, M_UIDINFO, &uihash);
rw_init(&uihashtbl_lock, "uidinfo hash");
}
/*
* Look up a uidinfo struct for the parameter uid.
* uihashtbl_lock must be locked.
*/
static struct uidinfo *
uilookup(uid)
uid_t uid;
{
struct uihashhead *uipp;
struct uidinfo *uip;
rw_assert(&uihashtbl_lock, RA_LOCKED);
uipp = UIHASH(uid);
LIST_FOREACH(uip, uipp, ui_hash)
if (uip->ui_uid == uid)
break;
return (uip);
}
/*
* Find or allocate a struct uidinfo for a particular uid.
* Increase the refcount on the uidinfo struct returned.
* uifree() should be called on it when the reference is released.
*/
struct uidinfo *
uifind(uid)
uid_t uid;
{
struct uidinfo *old_uip, *uip;
rw_rlock(&uihashtbl_lock);
uip = uilookup(uid);
if (uip == NULL) {
rw_runlock(&uihashtbl_lock);
uip = malloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
racct_create(&uip->ui_racct);
rw_wlock(&uihashtbl_lock);
/*
* There's a chance someone created our uidinfo while we
* were in malloc and not holding the lock, so we have to
* make sure we don't insert a duplicate uidinfo.
*/
if ((old_uip = uilookup(uid)) != NULL) {
/* Someone else beat us to it. */
racct_destroy(&uip->ui_racct);
free(uip, M_UIDINFO);
uip = old_uip;
} else {
refcount_init(&uip->ui_ref, 0);
uip->ui_uid = uid;
mtx_init(&uip->ui_vmsize_mtx, "ui_vmsize", NULL,
MTX_DEF);
LIST_INSERT_HEAD(UIHASH(uid), uip, ui_hash);
}
}
uihold(uip);
rw_unlock(&uihashtbl_lock);
return (uip);
}
/*
* Place another refcount on a uidinfo struct.
*/
void
uihold(uip)
struct uidinfo *uip;
{
refcount_acquire(&uip->ui_ref);
}
/*-
* Since uidinfo structs have a long lifetime, we use an
* opportunistic refcounting scheme to avoid locking the lookup hash
* for each release.
*
* If the refcount hits 0, we need to free the structure,
* which means we need to lock the hash.
* Optimal case:
* After locking the struct and lowering the refcount, if we find
* that we don't need to free, simply unlock and return.
* Suboptimal case:
* If refcount lowering results in need to free, bump the count
* back up, lose the lock and acquire the locks in the proper
* order to try again.
*/
void
uifree(uip)
struct uidinfo *uip;
{
int old;
/* Prepare for optimal case. */
old = uip->ui_ref;
if (old > 1 && atomic_cmpset_int(&uip->ui_ref, old, old - 1))
return;
/* Prepare for suboptimal case. */
rw_wlock(&uihashtbl_lock);
if (refcount_release(&uip->ui_ref)) {
racct_destroy(&uip->ui_racct);
LIST_REMOVE(uip, ui_hash);
rw_wunlock(&uihashtbl_lock);
if (uip->ui_sbsize != 0)
printf("freeing uidinfo: uid = %d, sbsize = %ld\n",
uip->ui_uid, uip->ui_sbsize);
if (uip->ui_proccnt != 0)
printf("freeing uidinfo: uid = %d, proccnt = %ld\n",
uip->ui_uid, uip->ui_proccnt);
if (uip->ui_vmsize != 0)
printf("freeing uidinfo: uid = %d, swapuse = %lld\n",
uip->ui_uid, (unsigned long long)uip->ui_vmsize);
mtx_destroy(&uip->ui_vmsize_mtx);
free(uip, M_UIDINFO);
return;
}
/*
* Someone added a reference between atomic_cmpset_int() and
* rw_wlock(&uihashtbl_lock).
*/
rw_wunlock(&uihashtbl_lock);
}
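/*
 * Illustrative sketch (not part of this file): the "optimal case" release in
 * uifree() above -- drop a reference without touching uihashtbl_lock as long
 * as the count cannot reach zero.  The function name is hypothetical.
 */
static __inline int
opportunistic_release_sketch(volatile u_int *refcnt)
{
        u_int old;

        old = *refcnt;
        if (old > 1 && atomic_cmpset_int(refcnt, old, old - 1))
                return (1);             /* fast path: no hash lock needed */
        return (0);                     /* count may hit 0: lock the hash and retry */
}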
void
ui_racct_foreach(void (*callback)(struct racct *racct,
void *arg2, void *arg3), void *arg2, void *arg3)
{
struct uidinfo *uip;
struct uihashhead *uih;
rw_rlock(&uihashtbl_lock);
for (uih = &uihashtbl[uihash]; uih >= uihashtbl; uih--) {
LIST_FOREACH(uip, uih, ui_hash) {
(callback)(uip->ui_racct, arg2, arg3);
}
}
rw_runlock(&uihashtbl_lock);
}
/*
* Change the count associated with the number of processes
* a given user is using. When 'max' is 0, don't enforce a limit.
*/
int
chgproccnt(uip, diff, max)
struct uidinfo *uip;
int diff;
rlim_t max;
{
/* Don't allow them to exceed max, but allow subtraction. */
if (diff > 0 && max != 0) {
if (atomic_fetchadd_long(&uip->ui_proccnt, (long)diff) + diff > max) {
atomic_subtract_long(&uip->ui_proccnt, (long)diff);
return (0);
}
} else {
atomic_add_long(&uip->ui_proccnt, (long)diff);
if (uip->ui_proccnt < 0)
printf("negative proccnt for uid = %d\n", uip->ui_uid);
}
return (1);
}
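/*
 * Illustrative sketch (not part of this file): the add-then-undo limit check
 * shared by chgproccnt() above and chgsbsize()/chgptscnt() below, written as
 * a generic helper.  The function name is hypothetical.
 */
static __inline int
chglimit_sketch(volatile long *cnt, int diff, rlim_t max)
{

        /* Don't allow the count to exceed max, but always allow subtraction. */
        if (diff > 0 && max != 0) {
                if (atomic_fetchadd_long(cnt, (long)diff) + diff > max) {
                        atomic_subtract_long(cnt, (long)diff);
                        return (0);     /* over the limit: change undone */
                }
        } else
                atomic_add_long(cnt, (long)diff);
        return (1);
}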
/*
* Change the total socket buffer size a user has used.
*/
int
chgsbsize(uip, hiwat, to, max)
struct uidinfo *uip;
u_int *hiwat;
u_int to;
rlim_t max;
{
int diff;
diff = to - *hiwat;
if (diff > 0) {
if (atomic_fetchadd_long(&uip->ui_sbsize, (long)diff) + diff > max) {
atomic_subtract_long(&uip->ui_sbsize, (long)diff);
return (0);
}
} else {
atomic_add_long(&uip->ui_sbsize, (long)diff);
if (uip->ui_sbsize < 0)
printf("negative sbsize for uid = %d\n", uip->ui_uid);
}
*hiwat = to;
return (1);
}
/*
* Change the count associated with the number of pseudo-terminals
* a given user is using. When 'max' is 0, don't enforce a limit.
*/
int
chgptscnt(uip, diff, max)
struct uidinfo *uip;
int diff;
rlim_t max;
{
/* Don't allow them to exceed max, but allow subtraction. */
if (diff > 0 && max != 0) {
if (atomic_fetchadd_long(&uip->ui_ptscnt, (long)diff) + diff > max) {
atomic_subtract_long(&uip->ui_ptscnt, (long)diff);
return (0);
}
} else {
atomic_add_long(&uip->ui_ptscnt, (long)diff);
if (uip->ui_ptscnt < 0)
printf("negative ptscnt for uid = %d\n", uip->ui_uid);
}
return (1);
}
Index: head/sys/kern/kern_shutdown.c
===================================================================
--- head/sys/kern/kern_shutdown.c (revision 225616)
+++ head/sys/kern/kern_shutdown.c (revision 225617)
@@ -1,735 +1,735 @@
/*-
* Copyright (c) 1986, 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ddb.h"
#include "opt_kdb.h"
#include "opt_panic.h"
#include "opt_sched.h"
#include "opt_watchdog.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/eventhandler.h>
#include <sys/jail.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kerneldump.h>
#include <sys/kthread.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vnode.h>
#ifdef SW_WATCHDOG
#include <sys/watchdog.h>
#endif
#include <ddb/ddb.h>
#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <sys/signalvar.h>
#ifndef PANIC_REBOOT_WAIT_TIME
#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
#endif
/*
* Note that stdarg.h and the ANSI-style va_start macro are used for both
* ANSI and traditional C compilers.
*/
#include <machine/stdarg.h>
#ifdef KDB
#ifdef KDB_UNATTENDED
int debugger_on_panic = 0;
#else
int debugger_on_panic = 1;
#endif
SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
&debugger_on_panic, 0, "Run debugger on kernel panic");
TUNABLE_INT("debug.debugger_on_panic", &debugger_on_panic);
#ifdef KDB_TRACE
static int trace_on_panic = 1;
#else
static int trace_on_panic = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, trace_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
&trace_on_panic, 0, "Print stack trace on kernel panic");
TUNABLE_INT("debug.trace_on_panic", &trace_on_panic);
#endif /* KDB */
static int sync_on_panic = 0;
SYSCTL_INT(_kern, OID_AUTO, sync_on_panic, CTLFLAG_RW | CTLFLAG_TUN,
&sync_on_panic, 0, "Do a sync before rebooting from a panic");
TUNABLE_INT("kern.sync_on_panic", &sync_on_panic);
SYSCTL_NODE(_kern, OID_AUTO, shutdown, CTLFLAG_RW, 0, "Shutdown environment");
#ifndef DIAGNOSTIC
static int show_busybufs;
#else
static int show_busybufs = 1;
#endif
SYSCTL_INT(_kern_shutdown, OID_AUTO, show_busybufs, CTLFLAG_RW,
&show_busybufs, 0, "");
/*
* Variable panicstr contains argument to first call to panic; used as flag
* to indicate that the kernel has already called panic.
*/
const char *panicstr;
int dumping; /* system is dumping */
int rebooting; /* system is rebooting */
static struct dumperinfo dumper; /* our selected dumper */
/* Context information for dump-debuggers. */
static struct pcb dumppcb; /* Registers. */
static lwpid_t dumptid; /* Thread ID. */
static void poweroff_wait(void *, int);
static void shutdown_halt(void *junk, int howto);
static void shutdown_panic(void *junk, int howto);
static void shutdown_reset(void *junk, int howto);
/* register various local shutdown events */
static void
shutdown_conf(void *unused)
{
EVENTHANDLER_REGISTER(shutdown_final, poweroff_wait, NULL,
SHUTDOWN_PRI_FIRST);
EVENTHANDLER_REGISTER(shutdown_final, shutdown_halt, NULL,
SHUTDOWN_PRI_LAST + 100);
EVENTHANDLER_REGISTER(shutdown_final, shutdown_panic, NULL,
SHUTDOWN_PRI_LAST + 100);
EVENTHANDLER_REGISTER(shutdown_final, shutdown_reset, NULL,
SHUTDOWN_PRI_LAST + 200);
}
SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL);
/*
* The system call that results in a reboot.
*/
/* ARGSUSED */
int
-reboot(struct thread *td, struct reboot_args *uap)
+sys_reboot(struct thread *td, struct reboot_args *uap)
{
int error;
error = 0;
#ifdef MAC
error = mac_system_check_reboot(td->td_ucred, uap->opt);
#endif
if (error == 0)
error = priv_check(td, PRIV_REBOOT);
if (error == 0) {
mtx_lock(&Giant);
kern_reboot(uap->opt);
mtx_unlock(&Giant);
}
return (error);
}
/*
* Called by events that want to shut down, e.g. <CTL><ALT><DEL> on a PC.
*/
static int shutdown_howto = 0;
void
shutdown_nice(int howto)
{
shutdown_howto = howto;
/* Send a signal to init(8) and have it shutdown the world */
if (initproc != NULL) {
PROC_LOCK(initproc);
- psignal(initproc, SIGINT);
+ kern_psignal(initproc, SIGINT);
PROC_UNLOCK(initproc);
} else {
/* No init(8) running, so simply reboot */
kern_reboot(RB_NOSYNC);
}
return;
}
static int waittime = -1;
static void
print_uptime(void)
{
int f;
struct timespec ts;
getnanouptime(&ts);
printf("Uptime: ");
f = 0;
if (ts.tv_sec >= 86400) {
printf("%ldd", (long)ts.tv_sec / 86400);
ts.tv_sec %= 86400;
f = 1;
}
if (f || ts.tv_sec >= 3600) {
printf("%ldh", (long)ts.tv_sec / 3600);
ts.tv_sec %= 3600;
f = 1;
}
if (f || ts.tv_sec >= 60) {
printf("%ldm", (long)ts.tv_sec / 60);
ts.tv_sec %= 60;
f = 1;
}
printf("%lds\n", (long)ts.tv_sec);
}
int
doadump(boolean_t textdump)
{
boolean_t coredump;
if (dumping)
return (EBUSY);
if (dumper.dumper == NULL)
return (ENXIO);
savectx(&dumppcb);
dumptid = curthread->td_tid;
dumping++;
coredump = TRUE;
#ifdef DDB
if (textdump && textdump_pending) {
coredump = FALSE;
textdump_dumpsys(&dumper);
}
#endif
if (coredump)
dumpsys(&dumper);
dumping--;
return (0);
}
static int
isbufbusy(struct buf *bp)
{
if (((bp->b_flags & (B_INVAL | B_PERSISTENT)) == 0 &&
BUF_ISLOCKED(bp)) ||
((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI))
return (1);
return (0);
}
/*
* Shutdown the system cleanly to prepare for reboot, halt, or power off.
*/
void
kern_reboot(int howto)
{
static int first_buf_printf = 1;
#if defined(SMP)
/*
* Bind us to CPU 0 so that all shutdown code runs there. Some
* systems don't shutdown properly (i.e., ACPI power off) if we
* run on another processor.
*/
thread_lock(curthread);
sched_bind(curthread, 0);
thread_unlock(curthread);
KASSERT(PCPU_GET(cpuid) == 0, ("%s: not running on cpu 0", __func__));
#endif
/* We're in the process of rebooting. */
rebooting = 1;
/* collect extra flags that shutdown_nice might have set */
howto |= shutdown_howto;
/* We are out of the debugger now. */
kdb_active = 0;
/*
* Do any callouts that should be done BEFORE syncing the filesystems.
*/
EVENTHANDLER_INVOKE(shutdown_pre_sync, howto);
/*
* Now sync filesystems
*/
if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
register struct buf *bp;
int iter, nbusy, pbusy;
#ifndef PREEMPTION
int subiter;
#endif
waittime = 0;
#ifdef SW_WATCHDOG
wdog_kern_pat(WD_LASTVAL);
#endif
- sync(curthread, NULL);
+ sys_sync(curthread, NULL);
/*
* With soft updates, some buffers that are
* written will be remarked as dirty until other
* buffers are written.
*/
for (iter = pbusy = 0; iter < 20; iter++) {
nbusy = 0;
for (bp = &buf[nbuf]; --bp >= buf; )
if (isbufbusy(bp))
nbusy++;
if (nbusy == 0) {
if (first_buf_printf)
printf("All buffers synced.");
break;
}
if (first_buf_printf) {
printf("Syncing disks, buffers remaining... ");
first_buf_printf = 0;
}
printf("%d ", nbusy);
if (nbusy < pbusy)
iter = 0;
pbusy = nbusy;
#ifdef SW_WATCHDOG
wdog_kern_pat(WD_LASTVAL);
#endif
- sync(curthread, NULL);
+ sys_sync(curthread, NULL);
#ifdef PREEMPTION
/*
* Drop Giant and spin for a while to allow
* interrupt threads to run.
*/
DROP_GIANT();
DELAY(50000 * iter);
PICKUP_GIANT();
#else
/*
* Drop Giant and context switch several times to
* allow interrupt threads to run.
*/
DROP_GIANT();
for (subiter = 0; subiter < 50 * iter; subiter++) {
thread_lock(curthread);
mi_switch(SW_VOL, NULL);
thread_unlock(curthread);
DELAY(1000);
}
PICKUP_GIANT();
#endif
}
printf("\n");
/*
* Count only busy local buffers to prevent forcing
* a fsck if we're just a client of a wedged NFS server
*/
nbusy = 0;
for (bp = &buf[nbuf]; --bp >= buf; ) {
if (isbufbusy(bp)) {
#if 0
/* XXX: This is bogus. We should probably have a BO_REMOTE flag instead */
if (bp->b_dev == NULL) {
TAILQ_REMOVE(&mountlist,
bp->b_vp->v_mount, mnt_list);
continue;
}
#endif
nbusy++;
if (show_busybufs > 0) {
printf(
"%d: buf:%p, vnode:%p, flags:%0x, blkno:%jd, lblkno:%jd, buflock:",
nbusy, bp, bp->b_vp, bp->b_flags,
(intmax_t)bp->b_blkno,
(intmax_t)bp->b_lblkno);
BUF_LOCKPRINTINFO(bp);
if (show_busybufs > 1)
vn_printf(bp->b_vp,
"vnode content: ");
}
}
}
if (nbusy) {
/*
* Failed to sync all blocks. Indicate this and don't
* unmount filesystems (thus forcing an fsck on reboot).
*/
printf("Giving up on %d buffers\n", nbusy);
DELAY(5000000); /* 5 seconds */
} else {
if (!first_buf_printf)
printf("Final sync complete\n");
/*
* Unmount filesystems
*/
if (panicstr == 0)
vfs_unmountall();
}
swapoff_all();
DELAY(100000); /* wait for console output to finish */
}
print_uptime();
/*
* Ok, now do things that assume all filesystem activity has
* been completed.
*/
EVENTHANDLER_INVOKE(shutdown_post_sync, howto);
if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold && !dumping)
doadump(TRUE);
/* Now that we're going to really halt the system... */
EVENTHANDLER_INVOKE(shutdown_final, howto);
for(;;) ; /* safety against shutdown_reset not working */
/* NOTREACHED */
}
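/*
 * Illustrative sketch (not part of this file): the progress rule used by the
 * buffer-sync loop in kern_reboot() above -- the iteration counter is reset
 * whenever the number of busy buffers still drops, so the loop only gives up
 * after 20 rounds without forward progress.  count_busy_buffers() is a
 * hypothetical stand-in for the scan over buf[]; the sync call and watchdog
 * pat are omitted.
 */
static int
sync_progress_sketch(void)
{
        int iter, nbusy, pbusy;

        nbusy = 0;
        for (iter = pbusy = 0; iter < 20; iter++) {
                nbusy = count_busy_buffers();   /* hypothetical */
                if (nbusy == 0)
                        return (0);             /* everything flushed */
                if (nbusy < pbusy)
                        iter = 0;               /* still making progress */
                pbusy = nbusy;
        }
        return (nbusy);                         /* gave up with this many busy */
}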
/*
* If the shutdown was a clean halt, behave accordingly.
*/
static void
shutdown_halt(void *junk, int howto)
{
if (howto & RB_HALT) {
printf("\n");
printf("The operating system has halted.\n");
printf("Please press any key to reboot.\n\n");
switch (cngetc()) {
case -1: /* No console, just die */
cpu_halt();
/* NOTREACHED */
default:
howto &= ~RB_HALT;
break;
}
}
}
/*
* Check to see if the system panicked, pause and then reboot
* according to the specified delay.
*/
static void
shutdown_panic(void *junk, int howto)
{
int loop;
if (howto & RB_DUMP) {
if (PANIC_REBOOT_WAIT_TIME != 0) {
if (PANIC_REBOOT_WAIT_TIME != -1) {
printf("Automatic reboot in %d seconds - "
"press a key on the console to abort\n",
PANIC_REBOOT_WAIT_TIME);
for (loop = PANIC_REBOOT_WAIT_TIME * 10;
loop > 0; --loop) {
DELAY(1000 * 100); /* 1/10th second */
/* Did user type a key? */
if (cncheckc() != -1)
break;
}
if (!loop)
return;
}
} else { /* zero time specified - reboot NOW */
return;
}
printf("--> Press a key on the console to reboot,\n");
printf("--> or switch off the system now.\n");
cngetc();
}
}
/*
* Everything done, now reset
*/
static void
shutdown_reset(void *junk, int howto)
{
printf("Rebooting...\n");
DELAY(1000000); /* wait 1 sec for printf's to complete and be read */
/*
* Acquiring smp_ipi_mtx here has a double effect:
* - it disables interrupts avoiding CPU0 preemption
* by fast handlers (thus deadlocking against other CPUs)
* - it avoids deadlocks against smp_rendezvous() or, more
* generally, threads busy-waiting, with this spinlock held,
* and waiting for responses by threads on other CPUs
* (ie. smp_tlb_shootdown()).
*
* For the !SMP case it just needs to handle the former problem.
*/
#ifdef SMP
mtx_lock_spin(&smp_ipi_mtx);
#else
spinlock_enter();
#endif
/* cpu_boot(howto); */ /* doesn't do anything at the moment */
cpu_reset();
/* NOTREACHED */ /* assuming reset worked */
}
/*
* Panic is called on unresolvable fatal errors. It prints "panic: mesg",
* and then reboots. If we are called twice, then we avoid trying to sync
* the disks as this often leads to recursive panics.
*/
void
panic(const char *fmt, ...)
{
#ifdef SMP
static volatile u_int panic_cpu = NOCPU;
#endif
struct thread *td = curthread;
int bootopt, newpanic;
va_list ap;
static char buf[256];
critical_enter();
#ifdef SMP
/*
* We don't want multiple CPU's to panic at the same time, so we
* use panic_cpu as a simple spinlock. We have to keep checking
* panic_cpu if we are spinning in case the panic on the first
* CPU is canceled.
*/
if (panic_cpu != PCPU_GET(cpuid))
while (atomic_cmpset_int(&panic_cpu, NOCPU,
PCPU_GET(cpuid)) == 0)
while (panic_cpu != NOCPU)
; /* nothing */
#endif
bootopt = RB_AUTOBOOT;
newpanic = 0;
if (panicstr)
bootopt |= RB_NOSYNC;
else {
bootopt |= RB_DUMP;
panicstr = fmt;
newpanic = 1;
}
va_start(ap, fmt);
if (newpanic) {
(void)vsnprintf(buf, sizeof(buf), fmt, ap);
panicstr = buf;
printf("panic: %s\n", buf);
} else {
printf("panic: ");
vprintf(fmt, ap);
printf("\n");
}
va_end(ap);
#ifdef SMP
printf("cpuid = %d\n", PCPU_GET(cpuid));
#endif
#ifdef KDB
if (newpanic && trace_on_panic)
kdb_backtrace();
if (debugger_on_panic)
kdb_enter(KDB_WHY_PANIC, "panic");
#endif
/*thread_lock(td); */
td->td_flags |= TDF_INPANIC;
/* thread_unlock(td); */
if (!sync_on_panic)
bootopt |= RB_NOSYNC;
critical_exit();
kern_reboot(bootopt);
}
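/*
 * Illustrative sketch (not part of this file): the panic_cpu "simple
 * spinlock" described in the comment above, pulled out into a standalone
 * helper.  The function name is hypothetical; 'panic_cpu' is passed in so
 * the sketch stays self-contained.
 */
#ifdef SMP
static __inline void
panic_cpu_lock_sketch(volatile u_int *panic_cpu)
{

        if (*panic_cpu != PCPU_GET(cpuid))
                while (atomic_cmpset_int(panic_cpu, NOCPU,
                    PCPU_GET(cpuid)) == 0)
                        while (*panic_cpu != NOCPU)
                                ;       /* spin: another CPU is already panicking */
}
#endif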
/*
* Support for poweroff delay.
*
* Please note that setting this delay too short might power off your machine
* before the write cache on your hard disk has been flushed, leading to
* soft-updates inconsistencies.
*/
#ifndef POWEROFF_DELAY
# define POWEROFF_DELAY 5000
#endif
static int poweroff_delay = POWEROFF_DELAY;
SYSCTL_INT(_kern_shutdown, OID_AUTO, poweroff_delay, CTLFLAG_RW,
&poweroff_delay, 0, "");
static void
poweroff_wait(void *junk, int howto)
{
if (!(howto & RB_POWEROFF) || poweroff_delay <= 0)
return;
DELAY(poweroff_delay * 1000);
}
/*
* Some system processes (e.g. syncer) need to be stopped at appropriate
* points in their main loops prior to a system shutdown, so that they
* won't interfere with the shutdown process (e.g. by holding a disk buf
* to cause sync to fail). For each of these system processes, register
* shutdown_kproc() as a handler for one of the shutdown events.
*/
static int kproc_shutdown_wait = 60;
SYSCTL_INT(_kern_shutdown, OID_AUTO, kproc_shutdown_wait, CTLFLAG_RW,
&kproc_shutdown_wait, 0, "");
void
kproc_shutdown(void *arg, int howto)
{
struct proc *p;
int error;
if (panicstr)
return;
p = (struct proc *)arg;
printf("Waiting (max %d seconds) for system process `%s' to stop...",
kproc_shutdown_wait, p->p_comm);
error = kproc_suspend(p, kproc_shutdown_wait * hz);
if (error == EWOULDBLOCK)
printf("timed out\n");
else
printf("done\n");
}
void
kthread_shutdown(void *arg, int howto)
{
struct thread *td;
int error;
if (panicstr)
return;
td = (struct thread *)arg;
printf("Waiting (max %d seconds) for system thread `%s' to stop...",
kproc_shutdown_wait, td->td_name);
error = kthread_suspend(td, kproc_shutdown_wait * hz);
if (error == EWOULDBLOCK)
printf("timed out\n");
else
printf("done\n");
}
/* Registration of dumpers */
int
set_dumper(struct dumperinfo *di)
{
if (di == NULL) {
bzero(&dumper, sizeof dumper);
return (0);
}
if (dumper.dumper != NULL)
return (EBUSY);
dumper = *di;
return (0);
}
/* Call dumper with bounds checking. */
int
dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
off_t offset, size_t length)
{
if (length != 0 && (offset < di->mediaoffset ||
offset - di->mediaoffset + length > di->mediasize)) {
printf("Attempt to write outside dump device boundaries.\n"
"offset(%jd), mediaoffset(%jd), length(%ju), mediasize(%jd).\n",
(intmax_t)offset, (intmax_t)di->mediaoffset,
(uintmax_t)length, (intmax_t)di->mediasize);
return (ENOSPC);
}
return (di->dumper(di->priv, virtual, physical, offset, length));
}
void
mkdumpheader(struct kerneldumpheader *kdh, char *magic, uint32_t archver,
uint64_t dumplen, uint32_t blksz)
{
bzero(kdh, sizeof(*kdh));
strncpy(kdh->magic, magic, sizeof(kdh->magic));
strncpy(kdh->architecture, MACHINE_ARCH, sizeof(kdh->architecture));
kdh->version = htod32(KERNELDUMPVERSION);
kdh->architectureversion = htod32(archver);
kdh->dumplength = htod64(dumplen);
kdh->dumptime = htod64(time_second);
kdh->blocksize = htod32(blksz);
strncpy(kdh->hostname, prison0.pr_hostname, sizeof(kdh->hostname));
strncpy(kdh->versionstring, version, sizeof(kdh->versionstring));
if (panicstr != NULL)
strncpy(kdh->panicstring, panicstr, sizeof(kdh->panicstring));
kdh->parity = kerneldump_parity(kdh);
}
Index: head/sys/kern/kern_sig.c
===================================================================
--- head/sys/kern/kern_sig.c (revision 225616)
+++ head/sys/kern/kern_sig.c (revision 225617)
@@ -1,3453 +1,3453 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include "opt_core.h"
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/acct.h>
#include <sys/capability.h>
#include <sys/condvar.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/ktrace.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/posix4.h>
#include <sys/pioctl.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sdt.h>
#include <sys/sbuf.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/timers.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <sys/jail.h>
#include <machine/cpu.h>
#include <security/audit/audit.h>
#define ONSIG 32 /* NSIG for osig* syscalls. XXX. */
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE(proc, kernel, , signal_send, signal-send);
SDT_PROBE_ARGTYPE(proc, kernel, , signal_send, 0, "struct thread *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_send, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_send, 2, "int");
SDT_PROBE_DEFINE(proc, kernel, , signal_clear, signal-clear);
SDT_PROBE_ARGTYPE(proc, kernel, , signal_clear, 0, "int");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_clear, 1, "ksiginfo_t *");
SDT_PROBE_DEFINE(proc, kernel, , signal_discard, signal-discard);
SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 0, "struct thread *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 1, "struct proc *");
SDT_PROBE_ARGTYPE(proc, kernel, , signal_discard, 2, "int");
static int coredump(struct thread *);
static char *expand_name(const char *, uid_t, pid_t, struct thread *, int);
static int killpg1(struct thread *td, int sig, int pgid, int all,
ksiginfo_t *ksi);
static int issignal(struct thread *td, int stop_allowed);
static int sigprop(int sig);
static void tdsigwakeup(struct thread *, int, sig_t, int);
static void sig_suspend_threads(struct thread *, struct proc *, int);
static int filt_sigattach(struct knote *kn);
static void filt_sigdetach(struct knote *kn);
static int filt_signal(struct knote *kn, long hint);
static struct thread *sigtd(struct proc *p, int sig, int prop);
static void sigqueue_start(void);
static uma_zone_t ksiginfo_zone = NULL;
struct filterops sig_filtops = {
.f_isfd = 0,
.f_attach = filt_sigattach,
.f_detach = filt_sigdetach,
.f_event = filt_signal,
};
static int kern_logsigexit = 1;
SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
&kern_logsigexit, 0,
"Log processes quitting on abnormal signals to syslog(3)");
static int kern_forcesigexit = 1;
SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
&kern_forcesigexit, 0, "Force trap signal to be handled");
SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal");
static int max_pending_per_proc = 128;
SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
&max_pending_per_proc, 0, "Max pending signals per proc");
static int preallocate_siginfo = 1024;
TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
&preallocate_siginfo, 0, "Preallocated signal memory size");
static int signal_overflow = 0;
SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
&signal_overflow, 0, "Number of signals that overflowed");
static int signal_alloc_fail = 0;
SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
&signal_alloc_fail, 0, "Number of signals that failed to be allocated");
SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
/*
* Policy -- Can ucred cr1 send SIGIO to process cr2?
* Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
* in the right situations.
*/
#define CANSIGIO(cr1, cr2) \
((cr1)->cr_uid == 0 || \
(cr1)->cr_ruid == (cr2)->cr_ruid || \
(cr1)->cr_uid == (cr2)->cr_ruid || \
(cr1)->cr_ruid == (cr2)->cr_uid || \
(cr1)->cr_uid == (cr2)->cr_uid)
static int sugid_coredump;
SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW,
&sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
static int do_coredump = 1;
SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
&do_coredump, 0, "Enable/Disable coredumps");
static int set_core_nodump_flag = 0;
SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
0, "Enable setting the NODUMP flag on coredump files");
/*
* Signal properties and actions.
* The array below categorizes the signals and their default actions
* according to the following properties:
*/
#define SA_KILL 0x01 /* terminates process by default */
#define SA_CORE 0x02 /* ditto and coredumps */
#define SA_STOP 0x04 /* suspend process */
#define SA_TTYSTOP 0x08 /* ditto, from tty */
#define SA_IGNORE 0x10 /* ignore by default */
#define SA_CONT 0x20 /* continue if suspended */
#define SA_CANTMASK 0x40 /* non-maskable, catchable */
#define SA_PROC 0x80 /* deliverable to any thread */
static int sigproptbl[NSIG] = {
SA_KILL|SA_PROC, /* SIGHUP */
SA_KILL|SA_PROC, /* SIGINT */
SA_KILL|SA_CORE|SA_PROC, /* SIGQUIT */
SA_KILL|SA_CORE, /* SIGILL */
SA_KILL|SA_CORE, /* SIGTRAP */
SA_KILL|SA_CORE, /* SIGABRT */
SA_KILL|SA_CORE|SA_PROC, /* SIGEMT */
SA_KILL|SA_CORE, /* SIGFPE */
SA_KILL|SA_PROC, /* SIGKILL */
SA_KILL|SA_CORE, /* SIGBUS */
SA_KILL|SA_CORE, /* SIGSEGV */
SA_KILL|SA_CORE, /* SIGSYS */
SA_KILL|SA_PROC, /* SIGPIPE */
SA_KILL|SA_PROC, /* SIGALRM */
SA_KILL|SA_PROC, /* SIGTERM */
SA_IGNORE|SA_PROC, /* SIGURG */
SA_STOP|SA_PROC, /* SIGSTOP */
SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTSTP */
SA_IGNORE|SA_CONT|SA_PROC, /* SIGCONT */
SA_IGNORE|SA_PROC, /* SIGCHLD */
SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTIN */
SA_STOP|SA_TTYSTOP|SA_PROC, /* SIGTTOU */
SA_IGNORE|SA_PROC, /* SIGIO */
SA_KILL, /* SIGXCPU */
SA_KILL, /* SIGXFSZ */
SA_KILL|SA_PROC, /* SIGVTALRM */
SA_KILL|SA_PROC, /* SIGPROF */
SA_IGNORE|SA_PROC, /* SIGWINCH */
SA_IGNORE|SA_PROC, /* SIGINFO */
SA_KILL|SA_PROC, /* SIGUSR1 */
SA_KILL|SA_PROC, /* SIGUSR2 */
};
static void reschedule_signals(struct proc *p, sigset_t block, int flags);
static void
sigqueue_start(void)
{
ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
uma_prealloc(ksiginfo_zone, preallocate_siginfo);
p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
}
ksiginfo_t *
ksiginfo_alloc(int wait)
{
int flags;
flags = M_ZERO;
if (! wait)
flags |= M_NOWAIT;
if (ksiginfo_zone != NULL)
return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags));
return (NULL);
}
void
ksiginfo_free(ksiginfo_t *ksi)
{
uma_zfree(ksiginfo_zone, ksi);
}
static __inline int
ksiginfo_tryfree(ksiginfo_t *ksi)
{
if (!(ksi->ksi_flags & KSI_EXT)) {
uma_zfree(ksiginfo_zone, ksi);
return (1);
}
return (0);
}
void
sigqueue_init(sigqueue_t *list, struct proc *p)
{
SIGEMPTYSET(list->sq_signals);
SIGEMPTYSET(list->sq_kill);
TAILQ_INIT(&list->sq_list);
list->sq_proc = p;
list->sq_flags = SQ_INIT;
}
/*
* Get a signal's ksiginfo.
* Return:
* 0 - signal not found
* others - signal number
*/
static int
sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
{
struct proc *p = sq->sq_proc;
struct ksiginfo *ksi, *next;
int count = 0;
KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
if (!SIGISMEMBER(sq->sq_signals, signo))
return (0);
if (SIGISMEMBER(sq->sq_kill, signo)) {
count++;
SIGDELSET(sq->sq_kill, signo);
}
TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
if (ksi->ksi_signo == signo) {
if (count == 0) {
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
ksiginfo_copy(ksi, si);
if (ksiginfo_tryfree(ksi) && p != NULL)
p->p_pendingcnt--;
}
if (++count > 1)
break;
}
}
if (count <= 1)
SIGDELSET(sq->sq_signals, signo);
si->ksi_signo = signo;
return (signo);
}
void
sigqueue_take(ksiginfo_t *ksi)
{
struct ksiginfo *kp;
struct proc *p;
sigqueue_t *sq;
if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL)
return;
p = sq->sq_proc;
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
p->p_pendingcnt--;
for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL;
kp = TAILQ_NEXT(kp, ksi_link)) {
if (kp->ksi_signo == ksi->ksi_signo)
break;
}
if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
SIGDELSET(sq->sq_signals, ksi->ksi_signo);
}
static int
sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
{
struct proc *p = sq->sq_proc;
struct ksiginfo *ksi;
int ret = 0;
KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
SIGADDSET(sq->sq_kill, signo);
goto out_set_bit;
}
/* directly insert the ksi, don't copy it */
if (si->ksi_flags & KSI_INS) {
if (si->ksi_flags & KSI_HEAD)
TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link);
else
TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
si->ksi_sigq = sq;
goto out_set_bit;
}
if (__predict_false(ksiginfo_zone == NULL)) {
SIGADDSET(sq->sq_kill, signo);
goto out_set_bit;
}
if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
signal_overflow++;
ret = EAGAIN;
} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
signal_alloc_fail++;
ret = EAGAIN;
} else {
if (p != NULL)
p->p_pendingcnt++;
ksiginfo_copy(si, ksi);
ksi->ksi_signo = signo;
if (si->ksi_flags & KSI_HEAD)
TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link);
else
TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = sq;
}
if ((si->ksi_flags & KSI_TRAP) != 0 ||
(si->ksi_flags & KSI_SIGQ) == 0) {
if (ret != 0)
SIGADDSET(sq->sq_kill, signo);
ret = 0;
goto out_set_bit;
}
if (ret != 0)
return (ret);
out_set_bit:
SIGADDSET(sq->sq_signals, signo);
return (ret);
}
void
sigqueue_flush(sigqueue_t *sq)
{
struct proc *p = sq->sq_proc;
ksiginfo_t *ksi;
KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
if (p != NULL)
PROC_LOCK_ASSERT(p, MA_OWNED);
while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
if (ksiginfo_tryfree(ksi) && p != NULL)
p->p_pendingcnt--;
}
SIGEMPTYSET(sq->sq_signals);
SIGEMPTYSET(sq->sq_kill);
}
static void
sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set)
{
sigset_t tmp;
struct proc *p1, *p2;
ksiginfo_t *ksi, *next;
KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
p1 = src->sq_proc;
p2 = dst->sq_proc;
/* Move siginfo to target list */
TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
if (SIGISMEMBER(*set, ksi->ksi_signo)) {
TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
if (p1 != NULL)
p1->p_pendingcnt--;
TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
ksi->ksi_sigq = dst;
if (p2 != NULL)
p2->p_pendingcnt++;
}
}
/* Move pending bits to target list */
tmp = src->sq_kill;
SIGSETAND(tmp, *set);
SIGSETOR(dst->sq_kill, tmp);
SIGSETNAND(src->sq_kill, tmp);
tmp = src->sq_signals;
SIGSETAND(tmp, *set);
SIGSETOR(dst->sq_signals, tmp);
SIGSETNAND(src->sq_signals, tmp);
}
#if 0
static void
sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signo);
sigqueue_move_set(src, dst, &set);
}
#endif
static void
sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
{
struct proc *p = sq->sq_proc;
ksiginfo_t *ksi, *next;
KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
/* Remove siginfo queue */
TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
if (SIGISMEMBER(*set, ksi->ksi_signo)) {
TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
ksi->ksi_sigq = NULL;
if (ksiginfo_tryfree(ksi) && p != NULL)
p->p_pendingcnt--;
}
}
SIGSETNAND(sq->sq_kill, *set);
SIGSETNAND(sq->sq_signals, *set);
}
void
sigqueue_delete(sigqueue_t *sq, int signo)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signo);
sigqueue_delete_set(sq, &set);
}
/* Remove a set of signals for a process */
static void
sigqueue_delete_set_proc(struct proc *p, const sigset_t *set)
{
sigqueue_t worklist;
struct thread *td0;
PROC_LOCK_ASSERT(p, MA_OWNED);
sigqueue_init(&worklist, NULL);
sigqueue_move_set(&p->p_sigqueue, &worklist, set);
FOREACH_THREAD_IN_PROC(p, td0)
sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
sigqueue_flush(&worklist);
}
void
sigqueue_delete_proc(struct proc *p, int signo)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, signo);
sigqueue_delete_set_proc(p, &set);
}
static void
sigqueue_delete_stopmask_proc(struct proc *p)
{
sigset_t set;
SIGEMPTYSET(set);
SIGADDSET(set, SIGSTOP);
SIGADDSET(set, SIGTSTP);
SIGADDSET(set, SIGTTIN);
SIGADDSET(set, SIGTTOU);
sigqueue_delete_set_proc(p, &set);
}
/*
* Determine the signal that should be delivered to process p, the current
* process; return 0 if none. If there is a pending stop signal with default
* action, the process stops in issignal().
*/
int
cursig(struct thread *td, int stop_allowed)
{
PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
KASSERT(stop_allowed == SIG_STOP_ALLOWED ||
stop_allowed == SIG_STOP_NOT_ALLOWED, ("cursig: stop_allowed"));
mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
return (SIGPENDING(td) ? issignal(td, stop_allowed) : 0);
}
/*
* Arrange for ast() to handle unmasked pending signals on return to user
* mode. This must be called whenever a signal is added to td_sigqueue or
* unmasked in td_sigmask.
*/
void
signotify(struct thread *td)
{
struct proc *p;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (SIGPENDING(td)) {
thread_lock(td);
td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
thread_unlock(td);
}
}
int
sigonstack(size_t sp)
{
struct thread *td = curthread;
return ((td->td_pflags & TDP_ALTSTACK) ?
#if defined(COMPAT_43)
((td->td_sigstk.ss_size == 0) ?
(td->td_sigstk.ss_flags & SS_ONSTACK) :
((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size))
#else
((sp - (size_t)td->td_sigstk.ss_sp) < td->td_sigstk.ss_size)
#endif
: 0);
}
static __inline int
sigprop(int sig)
{
if (sig > 0 && sig < NSIG)
return (sigproptbl[_SIG_IDX(sig)]);
return (0);
}
int
sig_ffs(sigset_t *set)
{
int i;
for (i = 0; i < _SIG_WORDS; i++)
if (set->__bits[i])
return (ffs(set->__bits[i]) + (i * 32));
return (0);
}
/*
* kern_sigaction
* sigaction
* freebsd4_sigaction
* osigaction
*/
int
kern_sigaction(td, sig, act, oact, flags)
struct thread *td;
register int sig;
struct sigaction *act, *oact;
int flags;
{
struct sigacts *ps;
struct proc *p = td->td_proc;
if (!_SIG_VALID(sig))
return (EINVAL);
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
if (oact) {
oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
oact->sa_flags = 0;
if (SIGISMEMBER(ps->ps_sigonstack, sig))
oact->sa_flags |= SA_ONSTACK;
if (!SIGISMEMBER(ps->ps_sigintr, sig))
oact->sa_flags |= SA_RESTART;
if (SIGISMEMBER(ps->ps_sigreset, sig))
oact->sa_flags |= SA_RESETHAND;
if (SIGISMEMBER(ps->ps_signodefer, sig))
oact->sa_flags |= SA_NODEFER;
if (SIGISMEMBER(ps->ps_siginfo, sig)) {
oact->sa_flags |= SA_SIGINFO;
oact->sa_sigaction =
(__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
} else
oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
oact->sa_flags |= SA_NOCLDSTOP;
if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
oact->sa_flags |= SA_NOCLDWAIT;
}
if (act) {
if ((sig == SIGKILL || sig == SIGSTOP) &&
act->sa_handler != SIG_DFL) {
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
return (EINVAL);
}
/*
* Change setting atomically.
*/
ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
if (act->sa_flags & SA_SIGINFO) {
ps->ps_sigact[_SIG_IDX(sig)] =
(__sighandler_t *)act->sa_sigaction;
SIGADDSET(ps->ps_siginfo, sig);
} else {
ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
SIGDELSET(ps->ps_siginfo, sig);
}
if (!(act->sa_flags & SA_RESTART))
SIGADDSET(ps->ps_sigintr, sig);
else
SIGDELSET(ps->ps_sigintr, sig);
if (act->sa_flags & SA_ONSTACK)
SIGADDSET(ps->ps_sigonstack, sig);
else
SIGDELSET(ps->ps_sigonstack, sig);
if (act->sa_flags & SA_RESETHAND)
SIGADDSET(ps->ps_sigreset, sig);
else
SIGDELSET(ps->ps_sigreset, sig);
if (act->sa_flags & SA_NODEFER)
SIGADDSET(ps->ps_signodefer, sig);
else
SIGDELSET(ps->ps_signodefer, sig);
if (sig == SIGCHLD) {
if (act->sa_flags & SA_NOCLDSTOP)
ps->ps_flag |= PS_NOCLDSTOP;
else
ps->ps_flag &= ~PS_NOCLDSTOP;
if (act->sa_flags & SA_NOCLDWAIT) {
/*
* Paranoia: since SA_NOCLDWAIT is implemented
* by reparenting the dying child to PID 1 (and
* trust it to reap the zombie), PID 1 itself
* is forbidden to set SA_NOCLDWAIT.
*/
if (p->p_pid == 1)
ps->ps_flag &= ~PS_NOCLDWAIT;
else
ps->ps_flag |= PS_NOCLDWAIT;
} else
ps->ps_flag &= ~PS_NOCLDWAIT;
if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
ps->ps_flag |= PS_CLDSIGIGN;
else
ps->ps_flag &= ~PS_CLDSIGIGN;
}
/*
* Set bit in ps_sigignore for signals that are set to SIG_IGN,
* and for signals set to SIG_DFL where the default is to
* ignore. However, don't put SIGCONT in ps_sigignore, as we
* have to restart the process.
*/
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
(sigprop(sig) & SA_IGNORE &&
ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
/* never to be seen again */
sigqueue_delete_proc(p, sig);
if (sig != SIGCONT)
/* easier in psignal */
SIGADDSET(ps->ps_sigignore, sig);
SIGDELSET(ps->ps_sigcatch, sig);
} else {
SIGDELSET(ps->ps_sigignore, sig);
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
SIGDELSET(ps->ps_sigcatch, sig);
else
SIGADDSET(ps->ps_sigcatch, sig);
}
#ifdef COMPAT_FREEBSD4
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
(flags & KSA_FREEBSD4) == 0)
SIGDELSET(ps->ps_freebsd4, sig);
else
SIGADDSET(ps->ps_freebsd4, sig);
#endif
#ifdef COMPAT_43
if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
(flags & KSA_OSIGSET) == 0)
SIGDELSET(ps->ps_osigset, sig);
else
SIGADDSET(ps->ps_osigset, sig);
#endif
}
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct sigaction_args {
int sig;
struct sigaction *act;
struct sigaction *oact;
};
#endif
int
-sigaction(td, uap)
+sys_sigaction(td, uap)
struct thread *td;
register struct sigaction_args *uap;
{
struct sigaction act, oact;
register struct sigaction *actp, *oactp;
int error;
actp = (uap->act != NULL) ? &act : NULL;
oactp = (uap->oact != NULL) ? &oact : NULL;
if (actp) {
error = copyin(uap->act, actp, sizeof(act));
if (error)
return (error);
}
error = kern_sigaction(td, uap->sig, actp, oactp, 0);
if (oactp && !error)
error = copyout(oactp, uap->oact, sizeof(oact));
return (error);
}
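Illustration only (not part of this revision): a minimal userland sketch of how the sigaction(2) entry point above is typically exercised, assuming a standard POSIX environment; the handler name on_usr1 is invented for the example.

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t last_pid;

static void
on_usr1(int sig, siginfo_t *si, void *uc)
{

	last_pid = si->si_pid;	/* async-signal-safe: just record the sender */
}

int
main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = on_usr1;
	sa.sa_flags = SA_SIGINFO | SA_RESTART;	/* SA_SIGINFO selects sa_sigaction */
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGUSR1, &sa, NULL) == -1)
		return (1);
	pause();			/* send SIGUSR1 from another terminal */
	printf("SIGUSR1 from pid %ld\n", (long)last_pid);
	return (0);
}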
#ifdef COMPAT_FREEBSD4
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_sigaction_args {
int sig;
struct sigaction *act;
struct sigaction *oact;
};
#endif
int
freebsd4_sigaction(td, uap)
struct thread *td;
register struct freebsd4_sigaction_args *uap;
{
struct sigaction act, oact;
register struct sigaction *actp, *oactp;
int error;
actp = (uap->act != NULL) ? &act : NULL;
oactp = (uap->oact != NULL) ? &oact : NULL;
if (actp) {
error = copyin(uap->act, actp, sizeof(act));
if (error)
return (error);
}
error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
if (oactp && !error)
error = copyout(oactp, uap->oact, sizeof(oact));
return (error);
}
#endif /* COMPAT_FREEBSD4 */
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigaction_args {
int signum;
struct osigaction *nsa;
struct osigaction *osa;
};
#endif
int
osigaction(td, uap)
struct thread *td;
register struct osigaction_args *uap;
{
struct osigaction sa;
struct sigaction nsa, osa;
register struct sigaction *nsap, *osap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
nsap = (uap->nsa != NULL) ? &nsa : NULL;
osap = (uap->osa != NULL) ? &osa : NULL;
if (nsap) {
error = copyin(uap->nsa, &sa, sizeof(sa));
if (error)
return (error);
nsap->sa_handler = sa.sa_handler;
nsap->sa_flags = sa.sa_flags;
OSIG2SIG(sa.sa_mask, nsap->sa_mask);
}
error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
if (osap && !error) {
sa.sa_handler = osap->sa_handler;
sa.sa_flags = osap->sa_flags;
SIG2OSIG(osap->sa_mask, sa.sa_mask);
error = copyout(&sa, uap->osa, sizeof(sa));
}
return (error);
}
#if !defined(__i386__)
/* Avoid replicating the same stub everywhere */
int
osigreturn(td, uap)
struct thread *td;
struct osigreturn_args *uap;
{
return (nosys(td, (struct nosys_args *)uap));
}
#endif
#endif /* COMPAT_43 */
/*
* Initialize signal state for process 0;
* set to ignore signals that are ignored by default.
*/
void
siginit(p)
struct proc *p;
{
register int i;
struct sigacts *ps;
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
for (i = 1; i <= NSIG; i++)
if (sigprop(i) & SA_IGNORE && i != SIGCONT)
SIGADDSET(ps->ps_sigignore, i);
mtx_unlock(&ps->ps_mtx);
PROC_UNLOCK(p);
}
/*
* Reset signals for an exec of the specified process.
*/
void
execsigs(struct proc *p)
{
struct sigacts *ps;
int sig;
struct thread *td;
/*
* Reset caught signals. Held signals remain held
* through td_sigmask (unless they were caught,
* and are now ignored by default).
*/
PROC_LOCK_ASSERT(p, MA_OWNED);
td = FIRST_THREAD_IN_PROC(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
while (SIGNOTEMPTY(ps->ps_sigcatch)) {
sig = sig_ffs(&ps->ps_sigcatch);
SIGDELSET(ps->ps_sigcatch, sig);
if (sigprop(sig) & SA_IGNORE) {
if (sig != SIGCONT)
SIGADDSET(ps->ps_sigignore, sig);
sigqueue_delete_proc(p, sig);
}
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
/*
* Reset stack state to the user stack.
* Clear set of signals caught on the signal stack.
*/
td->td_sigstk.ss_flags = SS_DISABLE;
td->td_sigstk.ss_size = 0;
td->td_sigstk.ss_sp = 0;
td->td_pflags &= ~TDP_ALTSTACK;
/*
* Reset the "no zombies if child dies" flag, as Solaris does.
*/
ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
mtx_unlock(&ps->ps_mtx);
}
/*
* kern_sigprocmask()
*
* Manipulate signal mask.
*/
int
kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
int flags)
{
sigset_t new_block, oset1;
struct proc *p;
int error;
p = td->td_proc;
if (!(flags & SIGPROCMASK_PROC_LOCKED))
PROC_LOCK(p);
if (oset != NULL)
*oset = td->td_sigmask;
error = 0;
if (set != NULL) {
switch (how) {
case SIG_BLOCK:
SIG_CANTMASK(*set);
oset1 = td->td_sigmask;
SIGSETOR(td->td_sigmask, *set);
new_block = td->td_sigmask;
SIGSETNAND(new_block, oset1);
break;
case SIG_UNBLOCK:
SIGSETNAND(td->td_sigmask, *set);
signotify(td);
goto out;
case SIG_SETMASK:
SIG_CANTMASK(*set);
oset1 = td->td_sigmask;
if (flags & SIGPROCMASK_OLD)
SIGSETLO(td->td_sigmask, *set);
else
td->td_sigmask = *set;
new_block = td->td_sigmask;
SIGSETNAND(new_block, oset1);
signotify(td);
break;
default:
error = EINVAL;
goto out;
}
/*
* The new_block set contains signals that were not previously
* blocked, but are blocked now.
*
* If we block any signal that was not previously blocked
* for td, and the process has that signal pending, try to
* schedule delivery of the signal to some thread that does
* not block it, possibly waking that thread up.
*/
if (p->p_numthreads != 1)
reschedule_signals(p, new_block, flags);
}
out:
if (!(flags & SIGPROCMASK_PROC_LOCKED))
PROC_UNLOCK(p);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct sigprocmask_args {
int how;
const sigset_t *set;
sigset_t *oset;
};
#endif
int
-sigprocmask(td, uap)
+sys_sigprocmask(td, uap)
register struct thread *td;
struct sigprocmask_args *uap;
{
sigset_t set, oset;
sigset_t *setp, *osetp;
int error;
setp = (uap->set != NULL) ? &set : NULL;
osetp = (uap->oset != NULL) ? &oset : NULL;
if (setp) {
error = copyin(uap->set, setp, sizeof(set));
if (error)
return (error);
}
error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
if (osetp && !error) {
error = copyout(osetp, uap->oset, sizeof(oset));
}
return (error);
}
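Illustration only (not part of this revision): a short userland sketch of the sigprocmask(2)/sigpending(2) pair implemented above, showing a signal held pending while blocked.

#include <signal.h>
#include <stdio.h>

int
main(void)
{
	sigset_t blk, old, pend;

	sigemptyset(&blk);
	sigaddset(&blk, SIGINT);
	sigprocmask(SIG_BLOCK, &blk, &old);	/* SIG_BLOCK ORs the set into the mask */

	/* ...critical section: a SIGINT arriving now stays pending... */

	sigpending(&pend);
	if (sigismember(&pend, SIGINT))
		printf("SIGINT is pending\n");

	/* Restore the old mask; any pending SIGINT is then delivered. */
	sigprocmask(SIG_SETMASK, &old, NULL);
	return (0);
}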
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigprocmask_args {
int how;
osigset_t mask;
};
#endif
int
osigprocmask(td, uap)
register struct thread *td;
struct osigprocmask_args *uap;
{
sigset_t set, oset;
int error;
OSIG2SIG(uap->mask, set);
error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
SIG2OSIG(oset, td->td_retval[0]);
return (error);
}
#endif /* COMPAT_43 */
int
-sigwait(struct thread *td, struct sigwait_args *uap)
+sys_sigwait(struct thread *td, struct sigwait_args *uap)
{
ksiginfo_t ksi;
sigset_t set;
int error;
error = copyin(uap->set, &set, sizeof(set));
if (error) {
td->td_retval[0] = error;
return (0);
}
error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error) {
if (error == ERESTART)
return (error);
td->td_retval[0] = error;
return (0);
}
error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
td->td_retval[0] = error;
return (0);
}
int
-sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
+sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
{
struct timespec ts;
struct timespec *timeout;
sigset_t set;
ksiginfo_t ksi;
int error;
if (uap->timeout) {
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
timeout = &ts;
} else
timeout = NULL;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, timeout);
if (error)
return (error);
if (uap->info)
error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
int
-sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
+sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
{
ksiginfo_t ksi;
sigset_t set;
int error;
error = copyin(uap->set, &set, sizeof(set));
if (error)
return (error);
error = kern_sigtimedwait(td, set, &ksi, NULL);
if (error)
return (error);
if (uap->info)
error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
if (error == 0)
td->td_retval[0] = ksi.ksi_signo;
return (error);
}
int
kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
struct timespec *timeout)
{
struct sigacts *ps;
sigset_t saved_mask, new_block;
struct proc *p;
int error, sig, timo, timevalid = 0;
struct timespec rts, ets, ts;
struct timeval tv;
p = td->td_proc;
error = 0;
ets.tv_sec = 0;
ets.tv_nsec = 0;
if (timeout != NULL) {
if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
timevalid = 1;
getnanouptime(&rts);
ets = rts;
timespecadd(&ets, timeout);
}
}
ksiginfo_init(ksi);
/* Some signals cannot be waited for. */
SIG_CANTMASK(waitset);
ps = p->p_sigacts;
PROC_LOCK(p);
saved_mask = td->td_sigmask;
SIGSETNAND(td->td_sigmask, waitset);
for (;;) {
mtx_lock(&ps->ps_mtx);
sig = cursig(td, SIG_STOP_ALLOWED);
mtx_unlock(&ps->ps_mtx);
if (sig != 0 && SIGISMEMBER(waitset, sig)) {
if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) {
error = 0;
break;
}
}
if (error != 0)
break;
/*
* POSIX says this must be checked after looking for pending
* signals.
*/
if (timeout != NULL) {
if (!timevalid) {
error = EINVAL;
break;
}
getnanouptime(&rts);
if (timespeccmp(&rts, &ets, >=)) {
error = EAGAIN;
break;
}
ts = ets;
timespecsub(&ts, &rts);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
timo = tvtohz(&tv);
} else {
timo = 0;
}
error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo);
if (timeout != NULL) {
if (error == ERESTART) {
/* The timeout cannot be restarted. */
error = EINTR;
} else if (error == EAGAIN) {
/* We will recalculate the timeout ourselves. */
error = 0;
}
}
}
new_block = saved_mask;
SIGSETNAND(new_block, td->td_sigmask);
td->td_sigmask = saved_mask;
/*
* Fewer signals can be delivered to us, reschedule signal
* notification.
*/
if (p->p_numthreads != 1)
reschedule_signals(p, new_block, 0);
if (error == 0) {
SDT_PROBE(proc, kernel, , signal_clear, sig, ksi, 0, 0, 0);
if (ksi->ksi_code == SI_TIMER)
itimer_accept(p, ksi->ksi_timerid, ksi);
#ifdef KTRACE
if (KTRPOINT(td, KTR_PSIG)) {
sig_t action;
mtx_lock(&ps->ps_mtx);
action = ps->ps_sigact[_SIG_IDX(sig)];
mtx_unlock(&ps->ps_mtx);
ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code);
}
#endif
if (sig == SIGKILL)
sigexit(td, sig);
}
PROC_UNLOCK(p);
return (error);
}
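Illustration only (not part of this revision): a userland sketch of sigtimedwait(2); the EAGAIN-on-timeout behaviour corresponds to the timeout handling in kern_sigtimedwait() above.

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { 2, 0 };	/* wait at most two seconds */
	int sig;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* keep SIGUSR1 pending for the wait */

	sig = sigtimedwait(&set, &info, &ts);
	if (sig == -1 && errno == EAGAIN)
		printf("timed out\n");
	else if (sig > 0)
		printf("got signal %d (si_code %d)\n", sig, info.si_code);
	return (0);
}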
#ifndef _SYS_SYSPROTO_H_
struct sigpending_args {
sigset_t *set;
};
#endif
int
-sigpending(td, uap)
+sys_sigpending(td, uap)
struct thread *td;
struct sigpending_args *uap;
{
struct proc *p = td->td_proc;
sigset_t pending;
PROC_LOCK(p);
pending = p->p_sigqueue.sq_signals;
SIGSETOR(pending, td->td_sigqueue.sq_signals);
PROC_UNLOCK(p);
return (copyout(&pending, uap->set, sizeof(sigset_t)));
}
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
#ifndef _SYS_SYSPROTO_H_
struct osigpending_args {
int dummy;
};
#endif
int
osigpending(td, uap)
struct thread *td;
struct osigpending_args *uap;
{
struct proc *p = td->td_proc;
sigset_t pending;
PROC_LOCK(p);
pending = p->p_sigqueue.sq_signals;
SIGSETOR(pending, td->td_sigqueue.sq_signals);
PROC_UNLOCK(p);
SIG2OSIG(pending, td->td_retval[0]);
return (0);
}
#endif /* COMPAT_43 */
#if defined(COMPAT_43)
/*
* Generalized interface signal handler, 4.3-compatible.
*/
#ifndef _SYS_SYSPROTO_H_
struct osigvec_args {
int signum;
struct sigvec *nsv;
struct sigvec *osv;
};
#endif
/* ARGSUSED */
int
osigvec(td, uap)
struct thread *td;
register struct osigvec_args *uap;
{
struct sigvec vec;
struct sigaction nsa, osa;
register struct sigaction *nsap, *osap;
int error;
if (uap->signum <= 0 || uap->signum >= ONSIG)
return (EINVAL);
nsap = (uap->nsv != NULL) ? &nsa : NULL;
osap = (uap->osv != NULL) ? &osa : NULL;
if (nsap) {
error = copyin(uap->nsv, &vec, sizeof(vec));
if (error)
return (error);
nsap->sa_handler = vec.sv_handler;
OSIG2SIG(vec.sv_mask, nsap->sa_mask);
nsap->sa_flags = vec.sv_flags;
nsap->sa_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */
}
error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
if (osap && !error) {
vec.sv_handler = osap->sa_handler;
SIG2OSIG(osap->sa_mask, vec.sv_mask);
vec.sv_flags = osap->sa_flags;
vec.sv_flags &= ~SA_NOCLDWAIT;
vec.sv_flags ^= SA_RESTART;
error = copyout(&vec, uap->osv, sizeof(vec));
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct osigblock_args {
int mask;
};
#endif
int
osigblock(td, uap)
register struct thread *td;
struct osigblock_args *uap;
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_BLOCK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct osigsetmask_args {
int mask;
};
#endif
int
osigsetmask(td, uap)
struct thread *td;
struct osigsetmask_args *uap;
{
sigset_t set, oset;
OSIG2SIG(uap->mask, set);
kern_sigprocmask(td, SIG_SETMASK, &set, &oset, 0);
SIG2OSIG(oset, td->td_retval[0]);
return (0);
}
#endif /* COMPAT_43 */
/*
* Suspend calling thread until signal, providing mask to be set in the
* meantime.
*/
#ifndef _SYS_SYSPROTO_H_
struct sigsuspend_args {
const sigset_t *sigmask;
};
#endif
/* ARGSUSED */
int
-sigsuspend(td, uap)
+sys_sigsuspend(td, uap)
struct thread *td;
struct sigsuspend_args *uap;
{
sigset_t mask;
int error;
error = copyin(uap->sigmask, &mask, sizeof(mask));
if (error)
return (error);
return (kern_sigsuspend(td, mask));
}
int
kern_sigsuspend(struct thread *td, sigset_t mask)
{
struct proc *p = td->td_proc;
int has_sig, sig;
/*
* When returning from sigsuspend, we want
* the old mask to be restored after the
* signal handler has finished. Thus, we
* save it here and mark the sigacts structure
* to indicate this.
*/
PROC_LOCK(p);
kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask,
SIGPROCMASK_PROC_LOCKED);
td->td_pflags |= TDP_OLDMASK;
/*
* Process signals now. Otherwise, we can get a spurious wakeup
* when a signal enters the process queue but is delivered to
* another thread; sigsuspend should return only on delivery of
* a signal to this thread.
*/
(p->p_sysent->sv_set_syscall_retval)(td, EINTR);
for (has_sig = 0; !has_sig;) {
while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
0) == 0)
/* void */;
thread_suspend_check(0);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td, SIG_STOP_ALLOWED)) != 0)
has_sig += postsig(sig);
mtx_unlock(&p->p_sigacts->ps_mtx);
}
PROC_UNLOCK(p);
return (EJUSTRETURN);
}
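Illustration only (not part of this revision): the classic race-free wait pattern that kern_sigsuspend() above supports, with the old mask restored via td_oldsigmask after the handler runs.

#include <signal.h>

static volatile sig_atomic_t done;

static void
on_usr1(int sig)
{

	done = 1;
}

int
main(void)
{
	sigset_t blk, suspmask;

	signal(SIGUSR1, on_usr1);

	sigemptyset(&blk);
	sigaddset(&blk, SIGUSR1);
	sigprocmask(SIG_BLOCK, &blk, &suspmask);	/* save the previous mask */
	sigdelset(&suspmask, SIGUSR1);			/* unblock it only while suspended */

	while (!done)
		sigsuspend(&suspmask);	/* atomically swaps the mask and sleeps;
					   always returns -1 with errno == EINTR */

	sigprocmask(SIG_UNBLOCK, &blk, NULL);
	return (0);
}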
#ifdef COMPAT_43 /* XXX - COMPAT_FBSD3 */
/*
* Compatibility sigsuspend call for old binaries. Note nonstandard calling
* convention: libc stub passes mask, not pointer, to save a copyin.
*/
#ifndef _SYS_SYSPROTO_H_
struct osigsuspend_args {
osigset_t mask;
};
#endif
/* ARGSUSED */
int
osigsuspend(td, uap)
struct thread *td;
struct osigsuspend_args *uap;
{
sigset_t mask;
OSIG2SIG(uap->mask, mask);
return (kern_sigsuspend(td, mask));
}
#endif /* COMPAT_43 */
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct osigstack_args {
struct sigstack *nss;
struct sigstack *oss;
};
#endif
/* ARGSUSED */
int
osigstack(td, uap)
struct thread *td;
register struct osigstack_args *uap;
{
struct sigstack nss, oss;
int error = 0;
if (uap->nss != NULL) {
error = copyin(uap->nss, &nss, sizeof(nss));
if (error)
return (error);
}
oss.ss_sp = td->td_sigstk.ss_sp;
oss.ss_onstack = sigonstack(cpu_getstack(td));
if (uap->nss != NULL) {
td->td_sigstk.ss_sp = nss.ss_sp;
td->td_sigstk.ss_size = 0;
td->td_sigstk.ss_flags |= nss.ss_onstack & SS_ONSTACK;
td->td_pflags |= TDP_ALTSTACK;
}
if (uap->oss != NULL)
error = copyout(&oss, uap->oss, sizeof(oss));
return (error);
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct sigaltstack_args {
stack_t *ss;
stack_t *oss;
};
#endif
/* ARGSUSED */
int
-sigaltstack(td, uap)
+sys_sigaltstack(td, uap)
struct thread *td;
register struct sigaltstack_args *uap;
{
stack_t ss, oss;
int error;
if (uap->ss != NULL) {
error = copyin(uap->ss, &ss, sizeof(ss));
if (error)
return (error);
}
error = kern_sigaltstack(td, (uap->ss != NULL) ? &ss : NULL,
(uap->oss != NULL) ? &oss : NULL);
if (error)
return (error);
if (uap->oss != NULL)
error = copyout(&oss, uap->oss, sizeof(stack_t));
return (error);
}
int
kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
{
struct proc *p = td->td_proc;
int oonstack;
oonstack = sigonstack(cpu_getstack(td));
if (oss != NULL) {
*oss = td->td_sigstk;
oss->ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
}
if (ss != NULL) {
if (oonstack)
return (EPERM);
if ((ss->ss_flags & ~SS_DISABLE) != 0)
return (EINVAL);
if (!(ss->ss_flags & SS_DISABLE)) {
if (ss->ss_size < p->p_sysent->sv_minsigstksz)
return (ENOMEM);
td->td_sigstk = *ss;
td->td_pflags |= TDP_ALTSTACK;
} else {
td->td_pflags &= ~TDP_ALTSTACK;
}
}
return (0);
}
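Illustration only (not part of this revision): a userland sketch of sigaltstack(2) plus SA_ONSTACK, which exercises the sv_minsigstksz check and TDP_ALTSTACK handling in kern_sigaltstack() above.

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
on_segv(int sig)
{

	/* Runs on the alternate stack even if the normal stack is exhausted. */
	_Exit(1);
}

int
main(void)
{
	stack_t ss;
	struct sigaction sa;

	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;		/* sizes below sv_minsigstksz get ENOMEM */
	ss.ss_flags = 0;
	if (sigaltstack(&ss, NULL) == -1)
		perror("sigaltstack");

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_segv;
	sa.sa_flags = SA_ONSTACK;	/* marks the signal in ps_sigonstack */
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, NULL);
	/* ...a runaway recursion would now be caught on the alternate stack... */
	return (0);
}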
/*
* Common code for kill process group/broadcast kill.
* cp is calling process.
*/
static int
killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
{
struct proc *p;
struct pgrp *pgrp;
int nfound = 0;
if (all) {
/*
* broadcast
*/
sx_slock(&allproc_lock);
FOREACH_PROC_IN_SYSTEM(p) {
PROC_LOCK(p);
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
p == td->td_proc || p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
if (p_cansignal(td, p, sig) == 0) {
nfound++;
if (sig)
pksignal(p, sig, ksi);
}
PROC_UNLOCK(p);
}
sx_sunlock(&allproc_lock);
} else {
sx_slock(&proctree_lock);
if (pgid == 0) {
/*
* zero pgid means send to my process group.
*/
pgrp = td->td_proc->p_pgrp;
PGRP_LOCK(pgrp);
} else {
pgrp = pgfind(pgid);
if (pgrp == NULL) {
sx_sunlock(&proctree_lock);
return (ESRCH);
}
}
sx_sunlock(&proctree_lock);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
p->p_state == PRS_NEW) {
PROC_UNLOCK(p);
continue;
}
if (p_cansignal(td, p, sig) == 0) {
nfound++;
if (sig)
pksignal(p, sig, ksi);
}
PROC_UNLOCK(p);
}
PGRP_UNLOCK(pgrp);
}
return (nfound ? 0 : ESRCH);
}
#ifndef _SYS_SYSPROTO_H_
struct kill_args {
int pid;
int signum;
};
#endif
/* ARGSUSED */
int
-kill(struct thread *td, struct kill_args *uap)
+sys_kill(struct thread *td, struct kill_args *uap)
{
ksiginfo_t ksi;
struct proc *p;
int error;
AUDIT_ARG_SIGNUM(uap->signum);
AUDIT_ARG_PID(uap->pid);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->signum;
ksi.ksi_code = SI_USER;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
if (uap->pid > 0) {
/* kill single process */
if ((p = pfind(uap->pid)) == NULL) {
if ((p = zpfind(uap->pid)) == NULL)
return (ESRCH);
}
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->signum);
if (error == 0 && uap->signum)
pksignal(p, uap->signum, &ksi);
PROC_UNLOCK(p);
return (error);
}
switch (uap->pid) {
case -1: /* broadcast signal */
return (killpg1(td, uap->signum, 0, 1, &ksi));
case 0: /* signal own process group */
return (killpg1(td, uap->signum, 0, 0, &ksi));
default: /* negative explicit process group */
return (killpg1(td, uap->signum, -uap->pid, 0, &ksi));
}
/* NOTREACHED */
}
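Illustration only (not part of this revision): a userland sketch of kill(2) covering the signal-0 permission probe and the pid conventions that are dispatched to killpg1() above.

#include <sys/types.h>
#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t target = getppid();

	/* Signal number 0: permission and existence check only, nothing sent. */
	if (kill(target, 0) == 0)
		printf("pid %ld exists and may be signalled\n", (long)target);
	else if (errno == ESRCH)
		printf("pid %ld: no such process\n", (long)target);

	/*
	 * pid == -1 broadcasts, pid == 0 signals the caller's process
	 * group, and a negative pid names an explicit process group.
	 */
	return (0);
}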
int
-pdkill(td, uap)
+sys_pdkill(td, uap)
struct thread *td;
struct pdkill_args *uap;
{
#ifdef PROCDESC
struct proc *p;
int error;
AUDIT_ARG_SIGNUM(uap->signum);
AUDIT_ARG_FD(uap->fd);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
error = procdesc_find(td, uap->fd, CAP_PDKILL, &p);
if (error)
return (error);
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->signum);
if (error == 0 && uap->signum)
- psignal(p, uap->signum);
+ kern_psignal(p, uap->signum);
PROC_UNLOCK(p);
return (error);
#else
return (ENOSYS);
#endif
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct okillpg_args {
int pgid;
int signum;
};
#endif
/* ARGSUSED */
int
okillpg(struct thread *td, struct okillpg_args *uap)
{
ksiginfo_t ksi;
AUDIT_ARG_SIGNUM(uap->signum);
AUDIT_ARG_PID(uap->pgid);
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->signum;
ksi.ksi_code = SI_USER;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
return (killpg1(td, uap->signum, uap->pgid, 0, &ksi));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct sigqueue_args {
pid_t pid;
int signum;
/* union sigval */ void *value;
};
#endif
int
-sigqueue(struct thread *td, struct sigqueue_args *uap)
+sys_sigqueue(struct thread *td, struct sigqueue_args *uap)
{
ksiginfo_t ksi;
struct proc *p;
int error;
if ((u_int)uap->signum > _SIG_MAXSIG)
return (EINVAL);
/*
* The specification says sigqueue can only send a signal to
* a single process.
*/
if (uap->pid <= 0)
return (EINVAL);
if ((p = pfind(uap->pid)) == NULL) {
if ((p = zpfind(uap->pid)) == NULL)
return (ESRCH);
}
error = p_cansignal(td, p, uap->signum);
if (error == 0 && uap->signum != 0) {
ksiginfo_init(&ksi);
ksi.ksi_flags = KSI_SIGQ;
ksi.ksi_signo = uap->signum;
ksi.ksi_code = SI_QUEUE;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
ksi.ksi_value.sival_ptr = uap->value;
error = pksignal(p, ksi.ksi_signo, &ksi);
}
PROC_UNLOCK(p);
return (error);
}
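Illustration only (not part of this revision): a userland sketch of sigqueue(2) sending a value to the caller itself and reading it back with sigwaitinfo(2).

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;
	siginfo_t info;
	union sigval val;

	sigemptyset(&set);
	sigaddset(&set, SIGUSR2);
	sigprocmask(SIG_BLOCK, &set, NULL);	/* keep the queued signal pending */

	val.sival_int = 42;
	if (sigqueue(getpid(), SIGUSR2, val) == -1)
		perror("sigqueue");

	sigwaitinfo(&set, &info);		/* si_code will be SI_QUEUE */
	printf("signal %d, si_code %d, value %d\n",
	    info.si_signo, info.si_code, info.si_value.sival_int);
	return (0);
}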
/*
* Send a signal to a process group.
*/
void
gsignal(int pgid, int sig, ksiginfo_t *ksi)
{
struct pgrp *pgrp;
if (pgid != 0) {
sx_slock(&proctree_lock);
pgrp = pgfind(pgid);
sx_sunlock(&proctree_lock);
if (pgrp != NULL) {
pgsignal(pgrp, sig, 0, ksi);
PGRP_UNLOCK(pgrp);
}
}
}
/*
* Send a signal to a process group. If checkctty is 1,
* limit to members which have a controlling terminal.
*/
void
pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi)
{
struct proc *p;
if (pgrp) {
PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
(checkctty == 0 || p->p_flag & P_CONTROLT))
pksignal(p, sig, ksi);
PROC_UNLOCK(p);
}
}
}
/*
* Send a signal caused by a trap to the current thread. If it will be
* caught immediately, deliver it with correct code. Otherwise, post it
* normally.
*/
void
trapsignal(struct thread *td, ksiginfo_t *ksi)
{
struct sigacts *ps;
sigset_t mask;
struct proc *p;
int sig;
int code;
p = td->td_proc;
sig = ksi->ksi_signo;
code = ksi->ksi_code;
KASSERT(_SIG_VALID(sig), ("invalid signal"));
PROC_LOCK(p);
ps = p->p_sigacts;
mtx_lock(&ps->ps_mtx);
if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
!SIGISMEMBER(td->td_sigmask, sig)) {
td->td_ru.ru_nsignals++;
#ifdef KTRACE
if (KTRPOINT(curthread, KTR_PSIG))
ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
&td->td_sigmask, code);
#endif
(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
ksi, &td->td_sigmask);
mask = ps->ps_catchmask[_SIG_IDX(sig)];
if (!SIGISMEMBER(ps->ps_signodefer, sig))
SIGADDSET(mask, sig);
kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
if (SIGISMEMBER(ps->ps_sigreset, sig)) {
/*
* See kern_sigaction() for origin of this code.
*/
SIGDELSET(ps->ps_sigcatch, sig);
if (sig != SIGCONT &&
sigprop(sig) & SA_IGNORE)
SIGADDSET(ps->ps_sigignore, sig);
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
mtx_unlock(&ps->ps_mtx);
} else {
/*
* Avoid a possible infinite loop if the thread is
* masking the signal or the process is ignoring it.
*/
if (kern_forcesigexit &&
(SIGISMEMBER(td->td_sigmask, sig) ||
ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
SIGDELSET(td->td_sigmask, sig);
SIGDELSET(ps->ps_sigcatch, sig);
SIGDELSET(ps->ps_sigignore, sig);
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
mtx_unlock(&ps->ps_mtx);
p->p_code = code; /* XXX for core dump/debugger */
p->p_sig = sig; /* XXX to verify code */
tdsendsignal(p, td, sig, ksi);
}
PROC_UNLOCK(p);
}
static struct thread *
sigtd(struct proc *p, int sig, int prop)
{
struct thread *td, *signal_td;
PROC_LOCK_ASSERT(p, MA_OWNED);
/*
* Check if current thread can handle the signal without
* switching context to another thread.
*/
if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
return (curthread);
signal_td = NULL;
FOREACH_THREAD_IN_PROC(p, td) {
if (!SIGISMEMBER(td->td_sigmask, sig)) {
signal_td = td;
break;
}
}
if (signal_td == NULL)
signal_td = FIRST_THREAD_IN_PROC(p);
return (signal_td);
}
/*
* Send the signal to the process. If the signal has an action, the action
* is usually performed by the target process rather than the caller; we add
* the signal to the set of pending signals for the process.
*
* Exceptions:
* o When a stop signal is sent to a sleeping process that takes the
* default action, the process is stopped without awakening it.
* o SIGCONT restarts stopped processes (or puts them back to sleep)
* regardless of the signal action (eg, blocked or ignored).
*
* Other ignored signals are discarded immediately.
*
* NB: This function may be entered from the debugger via the "kill" DDB
* command. There is little that can be done to mitigate the possibly messy
* side effects of this unwise possibility.
*/
void
-psignal(struct proc *p, int sig)
+kern_psignal(struct proc *p, int sig)
{
ksiginfo_t ksi;
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
(void) tdsendsignal(p, NULL, sig, &ksi);
}
int
pksignal(struct proc *p, int sig, ksiginfo_t *ksi)
{
return (tdsendsignal(p, NULL, sig, ksi));
}
/* Utility function for finding a thread to send signal event to. */
int
sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd)
{
struct thread *td;
if (sigev->sigev_notify == SIGEV_THREAD_ID) {
td = tdfind(sigev->sigev_notify_thread_id, p->p_pid);
if (td == NULL)
return (ESRCH);
*ttd = td;
} else {
*ttd = NULL;
PROC_LOCK(p);
}
return (0);
}
void
tdsignal(struct thread *td, int sig)
{
ksiginfo_t ksi;
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
(void) tdsendsignal(td->td_proc, td, sig, &ksi);
}
void
tdksignal(struct thread *td, int sig, ksiginfo_t *ksi)
{
(void) tdsendsignal(td->td_proc, td, sig, ksi);
}
int
tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
{
sig_t action;
sigqueue_t *sigqueue;
int prop;
struct sigacts *ps;
int intrval;
int ret = 0;
int wakeup_swapper;
MPASS(td == NULL || p == td->td_proc);
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!_SIG_VALID(sig))
panic("%s(): invalid signal %d", __func__, sig);
KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__));
/*
* IEEE Std 1003.1-2001: return success when killing a zombie.
*/
if (p->p_state == PRS_ZOMBIE) {
if (ksi && (ksi->ksi_flags & KSI_INS))
ksiginfo_tryfree(ksi);
return (ret);
}
ps = p->p_sigacts;
KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
prop = sigprop(sig);
if (td == NULL) {
td = sigtd(p, sig, prop);
sigqueue = &p->p_sigqueue;
} else {
KASSERT(td->td_proc == p, ("invalid thread"));
sigqueue = &td->td_sigqueue;
}
SDT_PROBE(proc, kernel, , signal_send, td, p, sig, 0, 0 );
/*
* If the signal is being ignored,
* then we forget about it immediately.
* (Note: we don't set SIGCONT in ps_sigignore,
* and if it is set to SIG_IGN,
* action will be SIG_DFL here.)
*/
mtx_lock(&ps->ps_mtx);
if (SIGISMEMBER(ps->ps_sigignore, sig)) {
SDT_PROBE(proc, kernel, , signal_discard, ps, td, sig, 0, 0 );
mtx_unlock(&ps->ps_mtx);
if (ksi && (ksi->ksi_flags & KSI_INS))
ksiginfo_tryfree(ksi);
return (ret);
}
if (SIGISMEMBER(td->td_sigmask, sig))
action = SIG_HOLD;
else if (SIGISMEMBER(ps->ps_sigcatch, sig))
action = SIG_CATCH;
else
action = SIG_DFL;
if (SIGISMEMBER(ps->ps_sigintr, sig))
intrval = EINTR;
else
intrval = ERESTART;
mtx_unlock(&ps->ps_mtx);
if (prop & SA_CONT)
sigqueue_delete_stopmask_proc(p);
else if (prop & SA_STOP) {
/*
* If sending a tty stop signal to a member of an orphaned
* process group, discard the signal here if the action
* is default; don't stop the process below if sleeping,
* and don't clear any pending SIGCONT.
*/
if ((prop & SA_TTYSTOP) &&
(p->p_pgrp->pg_jobc == 0) &&
(action == SIG_DFL)) {
if (ksi && (ksi->ksi_flags & KSI_INS))
ksiginfo_tryfree(ksi);
return (ret);
}
sigqueue_delete_proc(p, SIGCONT);
if (p->p_flag & P_CONTINUED) {
p->p_flag &= ~P_CONTINUED;
PROC_LOCK(p->p_pptr);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(p->p_pptr);
}
}
ret = sigqueue_add(sigqueue, sig, ksi);
if (ret != 0)
return (ret);
signotify(td);
/*
* Defer further processing for signals which are held,
* except that stopped processes must be continued by SIGCONT.
*/
if (action == SIG_HOLD &&
!((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
return (ret);
/*
* SIGKILL: Remove procfs STOPEVENTs.
*/
if (sig == SIGKILL) {
/* from procfs_ioctl.c: PIOCBIC */
p->p_stops = 0;
/* from procfs_ioctl.c: PIOCCONT */
p->p_step = 0;
wakeup(&p->p_step);
}
/*
* Some signals have a process-wide effect and a per-thread
* component. Most processing occurs when the process next
* tries to cross the user boundary; however, there are some
* times when processing needs to be done immediately, such as
* waking up threads so that they can cross the user boundary.
* We try to do the per-process part here.
*/
if (P_SHOULDSTOP(p)) {
if (sig == SIGKILL) {
/*
* If traced process is already stopped,
* then no further action is necessary.
*/
if (p->p_flag & P_TRACED)
goto out;
/*
* SIGKILL sets process running.
* It will die elsewhere.
* All threads must be restarted.
*/
p->p_flag &= ~P_STOPPED_SIG;
goto runfast;
}
if (prop & SA_CONT) {
/*
* If traced process is already stopped,
* then no further action is necessary.
*/
if (p->p_flag & P_TRACED)
goto out;
/*
* If SIGCONT is default (or ignored), we continue the
* process but don't leave the signal in sigqueue as
* it has no further action. If SIGCONT is held, we
* continue the process and leave the signal in
* sigqueue. If the process catches SIGCONT, let it
* handle the signal itself. If it isn't waiting on
* an event, it goes back to run state.
* Otherwise, process goes back to sleep state.
*/
p->p_flag &= ~P_STOPPED_SIG;
PROC_SLOCK(p);
if (p->p_numthreads == p->p_suspcount) {
PROC_SUNLOCK(p);
p->p_flag |= P_CONTINUED;
p->p_xstat = SIGCONT;
PROC_LOCK(p->p_pptr);
childproc_continued(p);
PROC_UNLOCK(p->p_pptr);
PROC_SLOCK(p);
}
if (action == SIG_DFL) {
thread_unsuspend(p);
PROC_SUNLOCK(p);
sigqueue_delete(sigqueue, sig);
goto out;
}
if (action == SIG_CATCH) {
/*
* The process wants to catch it so it needs
* to run at least one thread, but which one?
*/
PROC_SUNLOCK(p);
goto runfast;
}
/*
* The signal is not ignored or caught.
*/
thread_unsuspend(p);
PROC_SUNLOCK(p);
goto out;
}
if (prop & SA_STOP) {
/*
* If traced process is already stopped,
* then no further action is necessary.
*/
if (p->p_flag & P_TRACED)
goto out;
/*
* Already stopped, don't need to stop again
* (if we did, the shell could get confused).
* Just make sure the signal STOP bit is set.
*/
p->p_flag |= P_STOPPED_SIG;
sigqueue_delete(sigqueue, sig);
goto out;
}
/*
* All other kinds of signals:
* If a thread is sleeping interruptibly, simulate a
* wakeup so that when it is continued it will be made
* runnable and can look at the signal. However, don't make
* the PROCESS runnable, leave it stopped.
* It may run a bit until it hits a thread_suspend_check().
*/
wakeup_swapper = 0;
PROC_SLOCK(p);
thread_lock(td);
if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
wakeup_swapper = sleepq_abort(td, intrval);
thread_unlock(td);
PROC_SUNLOCK(p);
if (wakeup_swapper)
kick_proc0();
goto out;
/*
* Mutexes are short lived. Threads waiting on them will
* hit thread_suspend_check() soon.
*/
} else if (p->p_state == PRS_NORMAL) {
if (p->p_flag & P_TRACED || action == SIG_CATCH) {
tdsigwakeup(td, sig, action, intrval);
goto out;
}
MPASS(action == SIG_DFL);
if (prop & SA_STOP) {
if (p->p_flag & P_PPWAIT)
goto out;
p->p_flag |= P_STOPPED_SIG;
p->p_xstat = sig;
PROC_SLOCK(p);
sig_suspend_threads(td, p, 1);
if (p->p_numthreads == p->p_suspcount) {
/*
* Only a thread sending a signal to another process
* can reach here; if a thread is sending a signal to
* its own process, p_numthreads should never equal
* p_suspcount, because the thread does not suspend
* itself here.
*/
thread_stopped(p);
PROC_SUNLOCK(p);
sigqueue_delete_proc(p, p->p_xstat);
} else
PROC_SUNLOCK(p);
goto out;
}
} else {
/* Not in "NORMAL" state. Discard the signal. */
sigqueue_delete(sigqueue, sig);
goto out;
}
/*
* The process is not stopped so we need to apply the signal to all the
* running threads.
*/
runfast:
tdsigwakeup(td, sig, action, intrval);
PROC_SLOCK(p);
thread_unsuspend(p);
PROC_SUNLOCK(p);
out:
/* If we jump here, proc slock should not be owned. */
PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
return (ret);
}
/*
* The force of a signal has been directed against a single
* thread. We need to see what we can do about knocking it
* out of any sleep it may be in etc.
*/
static void
tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
{
struct proc *p = td->td_proc;
register int prop;
int wakeup_swapper;
wakeup_swapper = 0;
PROC_LOCK_ASSERT(p, MA_OWNED);
prop = sigprop(sig);
PROC_SLOCK(p);
thread_lock(td);
/*
* Bring the priority of a thread up if we want it to get
* killed in this lifetime.
*/
if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
sched_prio(td, PUSER);
if (TD_ON_SLEEPQ(td)) {
/*
* If thread is sleeping uninterruptibly
* we can't interrupt the sleep... the signal will
* be noticed when the process returns through
* trap() or syscall().
*/
if ((td->td_flags & TDF_SINTR) == 0)
goto out;
/*
* If SIGCONT is default (or ignored) and process is
* asleep, we are finished; the process should not
* be awakened.
*/
if ((prop & SA_CONT) && action == SIG_DFL) {
thread_unlock(td);
PROC_SUNLOCK(p);
sigqueue_delete(&p->p_sigqueue, sig);
/*
* It may be on either list in this state.
* Remove from both for now.
*/
sigqueue_delete(&td->td_sigqueue, sig);
return;
}
/*
* Give low priority threads a better chance to run.
*/
if (td->td_priority > PUSER)
sched_prio(td, PUSER);
wakeup_swapper = sleepq_abort(td, intrval);
} else {
/*
* Other states do nothing with the signal immediately,
* other than kicking ourselves if we are running.
* It will either never be noticed, or noticed very soon.
*/
#ifdef SMP
if (TD_IS_RUNNING(td) && td != curthread)
forward_signal(td);
#endif
}
out:
PROC_SUNLOCK(p);
thread_unlock(td);
if (wakeup_swapper)
kick_proc0();
}
static void
sig_suspend_threads(struct thread *td, struct proc *p, int sending)
{
struct thread *td2;
int wakeup_swapper;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
wakeup_swapper = 0;
FOREACH_THREAD_IN_PROC(p, td2) {
thread_lock(td2);
td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
(td2->td_flags & TDF_SINTR)) {
if (td2->td_flags & TDF_SBDRY) {
if (TD_IS_SUSPENDED(td2))
wakeup_swapper |=
thread_unsuspend_one(td2);
if (TD_ON_SLEEPQ(td2))
wakeup_swapper |=
sleepq_abort(td2, ERESTART);
} else if (!TD_IS_SUSPENDED(td2)) {
thread_suspend_one(td2);
}
} else if (!TD_IS_SUSPENDED(td2)) {
if (sending || td != td2)
td2->td_flags |= TDF_ASTPENDING;
#ifdef SMP
if (TD_IS_RUNNING(td2) && td2 != td)
forward_signal(td2);
#endif
}
thread_unlock(td2);
}
if (wakeup_swapper)
kick_proc0();
}
int
ptracestop(struct thread *td, int sig)
{
struct proc *p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
&p->p_mtx.lock_object, "Stopping for traced signal");
td->td_dbgflags |= TDB_XSIG;
td->td_xsig = sig;
PROC_SLOCK(p);
while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) {
if (p->p_flag & P_SINGLE_EXIT) {
td->td_dbgflags &= ~TDB_XSIG;
PROC_SUNLOCK(p);
return (sig);
}
/*
* Just make wait() work; the last stopped thread
* will win.
*/
p->p_xstat = sig;
p->p_xthread = td;
p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
sig_suspend_threads(td, p, 0);
if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
td->td_dbgflags &= ~TDB_STOPATFORK;
cv_broadcast(&p->p_dbgwait);
}
stopme:
thread_suspend_switch(td);
if (!(p->p_flag & P_TRACED)) {
break;
}
if (td->td_dbgflags & TDB_SUSPEND) {
if (p->p_flag & P_SINGLE_EXIT)
break;
goto stopme;
}
}
PROC_SUNLOCK(p);
return (td->td_xsig);
}
static void
reschedule_signals(struct proc *p, sigset_t block, int flags)
{
struct sigacts *ps;
struct thread *td;
int sig;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (SIGISEMPTY(p->p_siglist))
return;
ps = p->p_sigacts;
SIGSETAND(block, p->p_siglist);
while ((sig = sig_ffs(&block)) != 0) {
SIGDELSET(block, sig);
td = sigtd(p, sig, 0);
signotify(td);
if (!(flags & SIGPROCMASK_PS_LOCKED))
mtx_lock(&ps->ps_mtx);
if (p->p_flag & P_TRACED || SIGISMEMBER(ps->ps_sigcatch, sig))
tdsigwakeup(td, sig, SIG_CATCH,
(SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR :
ERESTART));
if (!(flags & SIGPROCMASK_PS_LOCKED))
mtx_unlock(&ps->ps_mtx);
}
}
void
tdsigcleanup(struct thread *td)
{
struct proc *p;
sigset_t unblocked;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sigqueue_flush(&td->td_sigqueue);
if (p->p_numthreads == 1)
return;
/*
* Since we cannot handle signals, notify signal post code
* about this by filling the sigmask.
*
* Also, if needed, wake up thread(s) that do not block the
* same signals as the exiting thread, since the thread might
* have been selected for delivery and woken up.
*/
SIGFILLSET(unblocked);
SIGSETNAND(unblocked, td->td_sigmask);
SIGFILLSET(td->td_sigmask);
reschedule_signals(p, unblocked, 0);
}
/*
* If the current process has received a signal (should be caught or cause
* termination, should interrupt current syscall), return the signal number.
* Stop signals with default action are processed immediately, then cleared;
* they aren't returned. This is checked after each entry to the system for
* a syscall or trap (though this can usually be done without calling issignal
* by checking the pending signal masks in cursig.) The normal call
* sequence is
*
* while (sig = cursig(curthread))
* postsig(sig);
*/
static int
issignal(struct thread *td, int stop_allowed)
{
struct proc *p;
struct sigacts *ps;
struct sigqueue *queue;
sigset_t sigpending;
int sig, prop, newsig;
p = td->td_proc;
ps = p->p_sigacts;
mtx_assert(&ps->ps_mtx, MA_OWNED);
PROC_LOCK_ASSERT(p, MA_OWNED);
for (;;) {
int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
sigpending = td->td_sigqueue.sq_signals;
SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
SIGSETNAND(sigpending, td->td_sigmask);
if (p->p_flag & P_PPWAIT)
SIG_STOPSIGMASK(sigpending);
if (SIGISEMPTY(sigpending)) /* no signal to send */
return (0);
sig = sig_ffs(&sigpending);
if (p->p_stops & S_SIG) {
mtx_unlock(&ps->ps_mtx);
stopevent(p, S_SIG, sig);
mtx_lock(&ps->ps_mtx);
}
/*
* We should see pending but ignored signals
* only if P_TRACED was on when they were posted.
*/
if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
sigqueue_delete(&td->td_sigqueue, sig);
sigqueue_delete(&p->p_sigqueue, sig);
continue;
}
if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
/*
* If traced, always stop.
* Remove old signal from queue before the stop.
* XXX shrug off debugger, it causes siginfo to
* be thrown away.
*/
queue = &td->td_sigqueue;
td->td_dbgksi.ksi_signo = 0;
if (sigqueue_get(queue, sig, &td->td_dbgksi) == 0) {
queue = &p->p_sigqueue;
sigqueue_get(queue, sig, &td->td_dbgksi);
}
mtx_unlock(&ps->ps_mtx);
newsig = ptracestop(td, sig);
mtx_lock(&ps->ps_mtx);
if (sig != newsig) {
/*
* If parent wants us to take the signal,
* then it will leave it in p->p_xstat;
* otherwise we just look for signals again.
*/
if (newsig == 0)
continue;
sig = newsig;
/*
* Put the new signal into td_sigqueue. If the
* signal is being masked, look for other signals.
*/
sigqueue_add(queue, sig, NULL);
if (SIGISMEMBER(td->td_sigmask, sig))
continue;
signotify(td);
} else {
if (td->td_dbgksi.ksi_signo != 0) {
td->td_dbgksi.ksi_flags |= KSI_HEAD;
if (sigqueue_add(&td->td_sigqueue, sig,
&td->td_dbgksi) != 0)
td->td_dbgksi.ksi_signo = 0;
}
if (td->td_dbgksi.ksi_signo == 0)
sigqueue_add(&td->td_sigqueue, sig,
NULL);
}
/*
* If the traced bit got turned off, go back up
* to the top to rescan signals. This ensures
* that p_sig* and p_sigact are consistent.
*/
if ((p->p_flag & P_TRACED) == 0)
continue;
}
prop = sigprop(sig);
/*
* Decide whether the signal should be returned.
* Return the signal's number, or fall through
* to clear it from the pending mask.
*/
switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
case (intptr_t)SIG_DFL:
/*
* Don't take default actions on system processes.
*/
if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
/*
* Are you sure you want to ignore SIGSEGV
* in init? XXX
*/
printf("Process (pid %lu) got signal %d\n",
(u_long)p->p_pid, sig);
#endif
break; /* == ignore */
}
/*
* If there is a pending stop signal to process
* with default action, stop here,
* then clear the signal. However,
* if process is member of an orphaned
* process group, ignore tty stop signals.
*/
if (prop & SA_STOP) {
if (p->p_flag & P_TRACED ||
(p->p_pgrp->pg_jobc == 0 &&
prop & SA_TTYSTOP))
break; /* == ignore */
/* Ignore, but do not drop the stop signal. */
if (stop_allowed != SIG_STOP_ALLOWED)
return (sig);
mtx_unlock(&ps->ps_mtx);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
&p->p_mtx.lock_object, "Catching SIGSTOP");
p->p_flag |= P_STOPPED_SIG;
p->p_xstat = sig;
PROC_SLOCK(p);
sig_suspend_threads(td, p, 0);
thread_suspend_switch(td);
PROC_SUNLOCK(p);
mtx_lock(&ps->ps_mtx);
break;
} else if (prop & SA_IGNORE) {
/*
* Except for SIGCONT, shouldn't get here.
* Default action is to ignore; drop it.
*/
break; /* == ignore */
} else
return (sig);
/*NOTREACHED*/
case (intptr_t)SIG_IGN:
/*
* Masking above should prevent us from ever trying
* to take action on an ignored signal other
* than SIGCONT, unless the process is traced.
*/
if ((prop & SA_CONT) == 0 &&
(p->p_flag & P_TRACED) == 0)
printf("issignal\n");
break; /* == ignore */
default:
/*
* This signal has an action, let
* postsig() process it.
*/
return (sig);
}
sigqueue_delete(&td->td_sigqueue, sig); /* take the signal! */
sigqueue_delete(&p->p_sigqueue, sig);
}
/* NOTREACHED */
}
void
thread_stopped(struct proc *p)
{
int n;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_SLOCK_ASSERT(p, MA_OWNED);
n = p->p_suspcount;
if (p == curproc)
n++;
if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
PROC_SUNLOCK(p);
p->p_flag &= ~P_WAITED;
PROC_LOCK(p->p_pptr);
childproc_stopped(p, (p->p_flag & P_TRACED) ?
CLD_TRAPPED : CLD_STOPPED);
PROC_UNLOCK(p->p_pptr);
PROC_SLOCK(p);
}
}
/*
* Take the action for the specified signal
* from the current set of pending signals.
*/
int
postsig(sig)
register int sig;
{
struct thread *td = curthread;
register struct proc *p = td->td_proc;
struct sigacts *ps;
sig_t action;
ksiginfo_t ksi;
sigset_t returnmask, mask;
KASSERT(sig != 0, ("postsig"));
PROC_LOCK_ASSERT(p, MA_OWNED);
ps = p->p_sigacts;
mtx_assert(&ps->ps_mtx, MA_OWNED);
ksiginfo_init(&ksi);
if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
return (0);
ksi.ksi_signo = sig;
if (ksi.ksi_code == SI_TIMER)
itimer_accept(p, ksi.ksi_timerid, &ksi);
action = ps->ps_sigact[_SIG_IDX(sig)];
#ifdef KTRACE
if (KTRPOINT(td, KTR_PSIG))
ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
&td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code);
#endif
if (p->p_stops & S_SIG) {
mtx_unlock(&ps->ps_mtx);
stopevent(p, S_SIG, sig);
mtx_lock(&ps->ps_mtx);
}
if (action == SIG_DFL) {
/*
* Default action, where the default is to kill
* the process. (Other cases were ignored above.)
*/
mtx_unlock(&ps->ps_mtx);
sigexit(td, sig);
/* NOTREACHED */
} else {
/*
* If we get here, the signal must be caught.
*/
KASSERT(action != SIG_IGN && !SIGISMEMBER(td->td_sigmask, sig),
("postsig action"));
/*
* Set the new mask value and also defer further
* occurrences of this signal.
*
* Special case: user has done a sigsuspend. Here the
* current mask is not of interest, but rather the
* mask from before the sigsuspend is what we want
* restored after the signal processing is completed.
*/
if (td->td_pflags & TDP_OLDMASK) {
returnmask = td->td_oldsigmask;
td->td_pflags &= ~TDP_OLDMASK;
} else
returnmask = td->td_sigmask;
mask = ps->ps_catchmask[_SIG_IDX(sig)];
if (!SIGISMEMBER(ps->ps_signodefer, sig))
SIGADDSET(mask, sig);
kern_sigprocmask(td, SIG_BLOCK, &mask, NULL,
SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
if (SIGISMEMBER(ps->ps_sigreset, sig)) {
/*
* See kern_sigaction() for origin of this code.
*/
SIGDELSET(ps->ps_sigcatch, sig);
if (sig != SIGCONT &&
sigprop(sig) & SA_IGNORE)
SIGADDSET(ps->ps_sigignore, sig);
ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
}
td->td_ru.ru_nsignals++;
if (p->p_sig == sig) {
p->p_code = 0;
p->p_sig = 0;
}
(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
}
return (1);
}
/*
* Kill the current process for stated reason.
*/
void
killproc(p, why)
struct proc *p;
char *why;
{
PROC_LOCK_ASSERT(p, MA_OWNED);
CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)",
p, p->p_pid, p->p_comm);
log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
p->p_ucred ? p->p_ucred->cr_uid : -1, why);
p->p_flag |= P_WKILLED;
- psignal(p, SIGKILL);
+ kern_psignal(p, SIGKILL);
}
/*
* Force the current process to exit with the specified signal, dumping core
* if appropriate. We bypass the normal tests for masked and caught signals,
* allowing unrecoverable failures to terminate the process without changing
* signal state. Mark the accounting record with the signal termination.
* If dumping core, save the signal number for the debugger. Calls exit and
* does not return.
*/
void
sigexit(td, sig)
struct thread *td;
int sig;
{
struct proc *p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_acflag |= AXSIG;
/*
* We must be single-threading to generate a core dump. This
* ensures that the registers in the core file are up-to-date.
* Also, the ELF dump handler assumes that the thread list doesn't
* change out from under it.
*
* XXX If another thread attempts to single-thread before us
* (e.g. via fork()), we won't get a dump at all.
*/
if ((sigprop(sig) & SA_CORE) && (thread_single(SINGLE_NO_EXIT) == 0)) {
p->p_sig = sig;
/*
* Log signals which would cause core dumps
* (Log as LOG_INFO to appease those who don't want
* these messages.)
* XXX : Todo, as well as euid, write out ruid too
* Note that coredump() drops proc lock.
*/
if (coredump(td) == 0)
sig |= WCOREFLAG;
if (kern_logsigexit)
log(LOG_INFO,
"pid %d (%s), uid %d: exited on signal %d%s\n",
p->p_pid, p->p_comm,
td->td_ucred ? td->td_ucred->cr_uid : -1,
sig &~ WCOREFLAG,
sig & WCOREFLAG ? " (core dumped)" : "");
} else
PROC_UNLOCK(p);
exit1(td, W_EXITCODE(0, sig));
/* NOTREACHED */
}
/*
* Send queued SIGCHLD to parent when child process's state
* is changed.
*/
static void
sigparent(struct proc *p, int reason, int status)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
if (p->p_ksi != NULL) {
p->p_ksi->ksi_signo = SIGCHLD;
p->p_ksi->ksi_code = reason;
p->p_ksi->ksi_status = status;
p->p_ksi->ksi_pid = p->p_pid;
p->p_ksi->ksi_uid = p->p_ucred->cr_ruid;
if (KSI_ONQ(p->p_ksi))
return;
}
pksignal(p->p_pptr, SIGCHLD, p->p_ksi);
}
static void
childproc_jobstate(struct proc *p, int reason, int status)
{
struct sigacts *ps;
PROC_LOCK_ASSERT(p, MA_OWNED);
PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
/*
* Wake up parent sleeping in kern_wait(), also send
* SIGCHLD to the parent, but SIGCHLD does not guarantee
* that the parent will awake, because the parent may have
* masked the signal.
*/
p->p_pptr->p_flag |= P_STATCHILD;
wakeup(p->p_pptr);
ps = p->p_pptr->p_sigacts;
mtx_lock(&ps->ps_mtx);
if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
mtx_unlock(&ps->ps_mtx);
sigparent(p, reason, status);
} else
mtx_unlock(&ps->ps_mtx);
}
void
childproc_stopped(struct proc *p, int reason)
{
childproc_jobstate(p, reason, p->p_xstat);
}
void
childproc_continued(struct proc *p)
{
childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
}
void
childproc_exited(struct proc *p)
{
int reason;
int status = p->p_xstat; /* convert to int */
reason = CLD_EXITED;
if (WCOREDUMP(status))
reason = CLD_DUMPED;
else if (WIFSIGNALED(status))
reason = CLD_KILLED;
/*
* XXX avoid calling wakeup(p->p_pptr), the work is
* done in exit1().
*/
sigparent(p, reason, status);
}
/*
* We only have 1 character for the core count in the format
* string, so the range will be 0-9
*/
#define MAX_NUM_CORES 10
static int num_cores = 5;
static int
sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
{
int error;
int new_val;
new_val = num_cores;
error = sysctl_handle_int(oidp, &new_val, 0, req);
if (error != 0 || req->newptr == NULL)
return (error);
if (new_val > MAX_NUM_CORES)
new_val = MAX_NUM_CORES;
if (new_val < 0)
new_val = 0;
num_cores = new_val;
return (0);
}
SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW,
0, sizeof(int), sysctl_debug_num_cores_check, "I", "");
#if defined(COMPRESS_USER_CORES)
int compress_user_cores = 1;
SYSCTL_INT(_kern, OID_AUTO, compress_user_cores, CTLFLAG_RW,
&compress_user_cores, 0, "");
int compress_user_cores_gzlevel = -1; /* default level */
SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_gzlevel, CTLFLAG_RW,
&compress_user_cores_gzlevel, -1, "user core gz compression level");
#define GZ_SUFFIX ".gz"
#define GZ_SUFFIX_LEN 3
#endif
static char corefilename[MAXPATHLEN] = {"%N.core"};
SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
sizeof(corefilename), "process corefile name format string");
/*
* expand_name(name, uid, pid, td, compress)
* Expand the name described in corefilename, using name, uid, and pid.
* corefilename is a printf-like string, with three format specifiers:
* %N name of process ("name")
* %P process id (pid)
* %U user id (uid)
* For example, "%N.core" is the default; core dumps can be disabled completely
* by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
* This is controlled by the sysctl variable kern.corefile (see above).
*/
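/*
 * Editor's note: an illustrative expansion of the format above (the
 * values are hypothetical).  With
 *
 *	sysctl kern.corefile="/var/coredumps/%U/%N.%P.core"
 *
 * a crash of pid 1234 running as uid 1001 in a process named "mydaemon"
 * would be written to "/var/coredumps/1001/mydaemon.1234.core".  The
 * additional '%H' and '%I' specifiers handled below add the hostname
 * and a single-digit rotation index (bounded by the debug.ncores
 * sysctl), respectively.
 */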
static char *
expand_name(const char *name, uid_t uid, pid_t pid, struct thread *td,
int compress)
{
struct sbuf sb;
const char *format;
char *temp;
size_t i;
int indexpos;
char *hostname;
hostname = NULL;
format = corefilename;
temp = malloc(MAXPATHLEN, M_TEMP, M_NOWAIT | M_ZERO);
if (temp == NULL)
return (NULL);
indexpos = -1;
(void)sbuf_new(&sb, temp, MAXPATHLEN, SBUF_FIXEDLEN);
for (i = 0; format[i]; i++) {
switch (format[i]) {
case '%': /* Format character */
i++;
switch (format[i]) {
case '%':
sbuf_putc(&sb, '%');
break;
case 'H': /* hostname */
if (hostname == NULL) {
hostname = malloc(MAXHOSTNAMELEN,
M_TEMP, M_NOWAIT);
if (hostname == NULL) {
log(LOG_ERR,
"pid %ld (%s), uid (%lu): "
"unable to alloc memory "
"for corefile hostname\n",
(long)pid, name,
(u_long)uid);
goto nomem;
}
}
getcredhostname(td->td_ucred, hostname,
MAXHOSTNAMELEN);
sbuf_printf(&sb, "%s", hostname);
break;
case 'I': /* autoincrementing index */
sbuf_printf(&sb, "0");
indexpos = sbuf_len(&sb) - 1;
break;
case 'N': /* process name */
sbuf_printf(&sb, "%s", name);
break;
case 'P': /* process id */
sbuf_printf(&sb, "%u", pid);
break;
case 'U': /* user id */
sbuf_printf(&sb, "%u", uid);
break;
default:
log(LOG_ERR,
"Unknown format character %c in "
"corename `%s'\n", format[i], format);
}
break;
default:
sbuf_putc(&sb, format[i]);
}
}
free(hostname, M_TEMP);
#ifdef COMPRESS_USER_CORES
if (compress) {
sbuf_printf(&sb, GZ_SUFFIX);
}
#endif
if (sbuf_error(&sb) != 0) {
log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
"long\n", (long)pid, name, (u_long)uid);
nomem:
sbuf_delete(&sb);
free(temp, M_TEMP);
return (NULL);
}
sbuf_finish(&sb);
sbuf_delete(&sb);
/*
* If the core format has a %I in it, then we need to check
* for existing corefiles before returning a name.
* To do this we iterate over 0..num_cores to find a
* non-existing core file name to use.
*/
if (indexpos != -1) {
struct nameidata nd;
int error, n;
int flags = O_CREAT | O_EXCL | FWRITE | O_NOFOLLOW;
int cmode = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
int vfslocked;
for (n = 0; n < num_cores; n++) {
temp[indexpos] = '0' + n;
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE,
temp, td);
error = vn_open(&nd, &flags, cmode, NULL);
if (error) {
if (error == EEXIST) {
continue;
}
log(LOG_ERR,
"pid %d (%s), uid (%u): Path `%s' failed "
"on initial open test, error = %d\n",
pid, name, uid, temp, error);
free(temp, M_TEMP);
return (NULL);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
VOP_UNLOCK(nd.ni_vp, 0);
error = vn_close(nd.ni_vp, FWRITE, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
if (error) {
log(LOG_ERR,
"pid %d (%s), uid (%u): Path `%s' failed "
"on close after initial open test, "
"error = %d\n",
pid, name, uid, temp, error);
free(temp, M_TEMP);
return (NULL);
}
break;
}
}
return (temp);
}
/*
* Dump a process' core. The main routine does some
* policy checking, and creates the name of the coredump;
* then it passes on a vnode and a size limit to the process-specific
* coredump routine if there is one; if there _is not_ one, it returns
* ENOSYS; otherwise it returns the error from the process-specific routine.
*/
static int
coredump(struct thread *td)
{
struct proc *p = td->td_proc;
register struct vnode *vp;
register struct ucred *cred = td->td_ucred;
struct flock lf;
struct nameidata nd;
struct vattr vattr;
int error, error1, flags, locked;
struct mount *mp;
char *name; /* name of corefile */
off_t limit;
int vfslocked;
int compress;
#ifdef COMPRESS_USER_CORES
compress = compress_user_cores;
#else
compress = 0;
#endif
PROC_LOCK_ASSERT(p, MA_OWNED);
MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
_STOPEVENT(p, S_CORE, 0);
name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid, td,
compress);
if (name == NULL) {
PROC_UNLOCK(p);
#ifdef AUDIT
audit_proc_coredump(td, NULL, EINVAL);
#endif
return (EINVAL);
}
if (((sugid_coredump == 0) && p->p_flag & P_SUGID) || do_coredump == 0) {
PROC_UNLOCK(p);
#ifdef AUDIT
audit_proc_coredump(td, name, EFAULT);
#endif
free(name, M_TEMP);
return (EFAULT);
}
/*
* Note that the bulk of limit checking is done after
* the corefile is created. The exception is if the limit
* for corefiles is 0, in which case we don't bother
* creating the corefile at all. This layout means that
* a corefile is truncated instead of not being created,
* if it is larger than the limit.
*/
limit = (off_t)lim_cur(p, RLIMIT_CORE);
if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
PROC_UNLOCK(p);
#ifdef AUDIT
audit_proc_coredump(td, name, EFBIG);
#endif
free(name, M_TEMP);
return (EFBIG);
}
PROC_UNLOCK(p);
restart:
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td);
flags = O_CREAT | FWRITE | O_NOFOLLOW;
error = vn_open_cred(&nd, &flags, S_IRUSR | S_IWUSR, VN_OPEN_NOAUDIT,
cred, NULL);
if (error) {
#ifdef AUDIT
audit_proc_coredump(td, name, error);
#endif
free(name, M_TEMP);
return (error);
}
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
/* Don't dump to non-regular files or files with links. */
if (vp->v_type != VREG ||
VOP_GETATTR(vp, &vattr, cred) || vattr.va_nlink != 1) {
VOP_UNLOCK(vp, 0);
error = EFAULT;
goto close;
}
VOP_UNLOCK(vp, 0);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_WRLCK;
locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
lf.l_type = F_UNLCK;
if (locked)
VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
goto out;
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
goto out;
VFS_UNLOCK_GIANT(vfslocked);
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_size = 0;
if (set_core_nodump_flag)
vattr.va_flags = UF_NODUMP;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_SETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
PROC_LOCK(p);
p->p_acflag |= ACORE;
PROC_UNLOCK(p);
error = p->p_sysent->sv_coredump ?
p->p_sysent->sv_coredump(td, vp, limit, compress ? IMGACT_CORE_COMPRESS : 0) :
ENOSYS;
if (locked) {
lf.l_type = F_UNLCK;
VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
}
close:
error1 = vn_close(vp, FWRITE, cred, td);
if (error == 0)
error = error1;
out:
#ifdef AUDIT
audit_proc_coredump(td, name, error);
#endif
free(name, M_TEMP);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Nonexistent system call-- signal process (may want to handle it). Flag
* error in case process won't see signal immediately (blocked or ignored).
*/
#ifndef _SYS_SYSPROTO_H_
struct nosys_args {
int dummy;
};
#endif
/* ARGSUSED */
int
nosys(td, args)
struct thread *td;
struct nosys_args *args;
{
struct proc *p = td->td_proc;
PROC_LOCK(p);
- psignal(p, SIGSYS);
+ kern_psignal(p, SIGSYS);
PROC_UNLOCK(p);
return (ENOSYS);
}
/*
* Send a SIGIO or SIGURG signal to a process or process group using stored
* credentials rather than those of the current process.
*/
void
pgsigio(sigiop, sig, checkctty)
struct sigio **sigiop;
int sig, checkctty;
{
ksiginfo_t ksi;
struct sigio *sigio;
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
SIGIO_LOCK();
sigio = *sigiop;
if (sigio == NULL) {
SIGIO_UNLOCK();
return;
}
if (sigio->sio_pgid > 0) {
PROC_LOCK(sigio->sio_proc);
if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
- psignal(sigio->sio_proc, sig);
+ kern_psignal(sigio->sio_proc, sig);
PROC_UNLOCK(sigio->sio_proc);
} else if (sigio->sio_pgid < 0) {
struct proc *p;
PGRP_LOCK(sigio->sio_pgrp);
LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
PROC_LOCK(p);
if (p->p_state == PRS_NORMAL &&
CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
(checkctty == 0 || (p->p_flag & P_CONTROLT)))
- psignal(p, sig);
+ kern_psignal(p, sig);
PROC_UNLOCK(p);
}
PGRP_UNLOCK(sigio->sio_pgrp);
}
SIGIO_UNLOCK();
}
static int
filt_sigattach(struct knote *kn)
{
struct proc *p = curproc;
kn->kn_ptr.p_proc = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
knlist_add(&p->p_klist, kn, 0);
return (0);
}
static void
filt_sigdetach(struct knote *kn)
{
struct proc *p = kn->kn_ptr.p_proc;
knlist_remove(&p->p_klist, kn, 0);
}
/*
* signal knotes are shared with proc knotes, so we apply a mask to
* the hint in order to differentiate them from process hints. This
* could be avoided by using a signal-specific knote list, but probably
* isn't worth the trouble.
*/
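/*
 * Editor's note: a minimal userland sketch of the consumer side of this
 * filter (illustrative only; not part of the kernel source):
 *
 *	struct kevent kev;
 *	int kq = kqueue();
 *
 *	signal(SIGHUP, SIG_IGN);		// kevent fires even for ignored signals
 *	EV_SET(&kev, SIGHUP, EVFILT_SIGNAL, EV_ADD, 0, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);	// register the knote
 *	kevent(kq, NULL, 0, &kev, 1, NULL);	// block; kev.data counts deliveries
 */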
static int
filt_signal(struct knote *kn, long hint)
{
if (hint & NOTE_SIGNAL) {
hint &= ~NOTE_SIGNAL;
if (kn->kn_id == hint)
kn->kn_data++;
}
return (kn->kn_data != 0);
}
struct sigacts *
sigacts_alloc(void)
{
struct sigacts *ps;
ps = malloc(sizeof(struct sigacts), M_SUBPROC, M_WAITOK | M_ZERO);
ps->ps_refcnt = 1;
mtx_init(&ps->ps_mtx, "sigacts", NULL, MTX_DEF);
return (ps);
}
void
sigacts_free(struct sigacts *ps)
{
mtx_lock(&ps->ps_mtx);
ps->ps_refcnt--;
if (ps->ps_refcnt == 0) {
mtx_destroy(&ps->ps_mtx);
free(ps, M_SUBPROC);
} else
mtx_unlock(&ps->ps_mtx);
}
struct sigacts *
sigacts_hold(struct sigacts *ps)
{
mtx_lock(&ps->ps_mtx);
ps->ps_refcnt++;
mtx_unlock(&ps->ps_mtx);
return (ps);
}
void
sigacts_copy(struct sigacts *dest, struct sigacts *src)
{
KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
mtx_lock(&src->ps_mtx);
bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
mtx_unlock(&src->ps_mtx);
}
int
sigacts_shared(struct sigacts *ps)
{
int shared;
mtx_lock(&ps->ps_mtx);
shared = ps->ps_refcnt > 1;
mtx_unlock(&ps->ps_mtx);
return (shared);
}
Index: head/sys/kern/kern_synch.c
===================================================================
--- head/sys/kern/kern_synch.c (revision 225616)
+++ head/sys/kern/kern_synch.c (revision 225617)
@@ -1,586 +1,586 @@
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_ktrace.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <machine/cpu.h>
#ifdef XEN
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#endif
#define KTDSTATE(td) \
(((td)->td_inhibitors & TDI_SLEEPING) != 0 ? "sleep" : \
((td)->td_inhibitors & TDI_SUSPENDED) != 0 ? "suspended" : \
((td)->td_inhibitors & TDI_SWAPPED) != 0 ? "swapped" : \
((td)->td_inhibitors & TDI_LOCK) != 0 ? "blocked" : \
((td)->td_inhibitors & TDI_IWAIT) != 0 ? "iwait" : "yielding")
static void synch_setup(void *dummy);
SYSINIT(synch_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, synch_setup,
NULL);
int hogticks;
static int pause_wchan;
static struct callout loadav_callout;
struct loadavg averunnable =
{ {0, 0, 0}, FSCALE }; /* load average, of runnable procs */
/*
* Constants for averages over 1, 5, and 15 minutes
* when sampling at 5 second intervals.
*/
static fixpt_t cexp[3] = {
0.9200444146293232 * FSCALE, /* exp(-1/12) */
0.9834714538216174 * FSCALE, /* exp(-1/60) */
0.9944598480048967 * FSCALE, /* exp(-1/180) */
};
/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");
static void loadav(void *arg);
void
sleepinit(void)
{
hogticks = (hz / 10) * 2; /* Default only. */
init_sleepqueues();
}
/*
* General sleep call. Suspends the current thread until a wakeup is
* performed on the specified identifier. The thread will then be made
* runnable with the specified priority. Sleeps at most timo/hz seconds
* (0 means no timeout). If pri includes PCATCH flag, signals are checked
* before and after sleeping, else signals are not checked. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal needs to be delivered, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal (return EINTR).
*
* The lock argument is unlocked before the caller is suspended, and
* re-locked before _sleep() returns. If priority includes the PDROP
* flag the lock is not re-locked before returning.
*/
int
_sleep(void *ident, struct lock_object *lock, int priority,
const char *wmesg, int timo)
{
struct thread *td;
struct proc *p;
struct lock_class *class;
int catch, flags, lock_state, pri, rval;
WITNESS_SAVE_DECL(lock_witness);
td = curthread;
p = td->td_proc;
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 0);
#endif
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
"Sleeping on \"%s\"", wmesg);
KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL,
("sleeping without a lock"));
KASSERT(p != NULL, ("msleep1"));
KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
if (priority & PDROP)
KASSERT(lock != NULL && lock != &Giant.lock_object,
("PDROP requires a non-Giant lock"));
if (lock != NULL)
class = LOCK_CLASS(lock);
else
class = NULL;
if (cold) {
/*
* During autoconfiguration, just return;
* don't run any other threads or panic below,
* in case this is the idle thread and already asleep.
* XXX: this used to do "s = splhigh(); splx(safepri);
* splx(s);" to give interrupts a chance, but there is
* no way to give interrupts a chance now.
*/
if (lock != NULL && priority & PDROP)
class->lc_unlock(lock);
return (0);
}
catch = priority & PCATCH;
pri = priority & PRIMASK;
/*
* If we are already on a sleep queue, then remove us from that
* sleep queue first. We have to do this to handle recursive
* sleeps.
*/
if (TD_ON_SLEEPQ(td))
sleepq_remove(td, td->td_wchan);
if (ident == &pause_wchan)
flags = SLEEPQ_PAUSE;
else
flags = SLEEPQ_SLEEP;
if (catch)
flags |= SLEEPQ_INTERRUPTIBLE;
if (priority & PBDRY)
flags |= SLEEPQ_STOP_ON_BDRY;
sleepq_lock(ident);
CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
td->td_tid, p->p_pid, td->td_name, wmesg, ident);
if (lock == &Giant.lock_object)
mtx_assert(&Giant, MA_OWNED);
DROP_GIANT();
if (lock != NULL && lock != &Giant.lock_object &&
!(class->lc_flags & LC_SLEEPABLE)) {
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
} else
/* GCC needs to follow the Yellow Brick Road */
lock_state = -1;
/*
* We put ourselves on the sleep queue and start our timeout
* before calling thread_suspend_check, as we could stop there,
* and a wakeup or a SIGCONT (or both) could occur while we were
* stopped without resuming us. Thus, we must be ready for sleep
* when cursig() is called. If the wakeup happens while we're
* stopped, then td will no longer be on a sleep queue upon
* return from cursig().
*/
sleepq_add(ident, lock, wmesg, flags, 0);
if (timo)
sleepq_set_timeout(ident, timo);
if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
sleepq_release(ident);
WITNESS_SAVE(lock, lock_witness);
lock_state = class->lc_unlock(lock);
sleepq_lock(ident);
}
if (timo && catch)
rval = sleepq_timedwait_sig(ident, pri);
else if (timo)
rval = sleepq_timedwait(ident, pri);
else if (catch)
rval = sleepq_wait_sig(ident, pri);
else {
sleepq_wait(ident, pri);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
class->lc_lock(lock, lock_state);
WITNESS_RESTORE(lock, lock_witness);
}
return (rval);
}
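/*
 * Editor's note: an illustrative caller-side pattern for the primitives
 * above (sc, sc_mtx and sc_ready are hypothetical names).  msleep() is
 * the usual mutex-protected front end to _sleep():
 *
 *	mtx_lock(&sc->sc_mtx);
 *	while (!sc->sc_ready) {
 *		error = msleep(&sc->sc_ready, &sc->sc_mtx, PCATCH,
 *		    "scwait", hz);
 *		if (error != 0)
 *			break;		// EWOULDBLOCK, EINTR or ERESTART
 *	}
 *	mtx_unlock(&sc->sc_mtx);
 */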
int
msleep_spin(void *ident, struct mtx *mtx, const char *wmesg, int timo)
{
struct thread *td;
struct proc *p;
int rval;
WITNESS_SAVE_DECL(mtx);
td = curthread;
p = td->td_proc;
KASSERT(mtx != NULL, ("sleeping without a mutex"));
KASSERT(p != NULL, ("msleep1"));
KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
if (cold) {
/*
* During autoconfiguration, just return;
* don't run any other threads or panic below,
* in case this is the idle thread and already asleep.
* XXX: this used to do "s = splhigh(); splx(safepri);
* splx(s);" to give interrupts a chance, but there is
* no way to give interrupts a chance now.
*/
return (0);
}
sleepq_lock(ident);
CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
td->td_tid, p->p_pid, td->td_name, wmesg, ident);
DROP_GIANT();
mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
WITNESS_SAVE(&mtx->lock_object, mtx);
mtx_unlock_spin(mtx);
/*
* We put ourselves on the sleep queue and start our timeout.
*/
sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
if (timo)
sleepq_set_timeout(ident, timo);
/*
* Can't call ktrace with any spin locks held so it can lock the
* ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
* any spin lock. Thus, we have to drop the sleepq spin lock while
* we handle those requests. This is safe since we have placed our
* thread on the sleep queue already.
*/
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW)) {
sleepq_release(ident);
ktrcsw(1, 0);
sleepq_lock(ident);
}
#endif
#ifdef WITNESS
sleepq_release(ident);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "Sleeping on \"%s\"",
wmesg);
sleepq_lock(ident);
#endif
if (timo)
rval = sleepq_timedwait(ident, 0);
else {
sleepq_wait(ident, 0);
rval = 0;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 0);
#endif
PICKUP_GIANT();
mtx_lock_spin(mtx);
WITNESS_RESTORE(&mtx->lock_object, mtx);
return (rval);
}
/*
* pause() is like tsleep() except that the intention is to not be
* explicitly woken up by another thread. Instead, the current thread
* simply wishes to sleep until the timeout expires. It is
* implemented using a dummy wait channel.
*/
int
pause(const char *wmesg, int timo)
{
KASSERT(timo != 0, ("pause: timeout required"));
return (tsleep(&pause_wchan, 0, wmesg, timo));
}
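/*
 * Editor's note: illustrative use only.  A caller that merely needs to
 * wait for a while, with nothing ever calling wakeup() on its behalf,
 * can do:
 *
 *	pause("delay", hz / 10);	// roughly 100 ms at the default hz
 */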
/*
* Make all threads sleeping on the specified identifier runnable.
*/
void
wakeup(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_broadcast(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper) {
KASSERT(ident != &proc0,
("wakeup and wakeup_swapper and proc0"));
kick_proc0();
}
}
/*
* Make a thread sleeping on the specified identifier runnable.
* May wake more than one thread if a target thread is currently
* swapped out.
*/
void
wakeup_one(void *ident)
{
int wakeup_swapper;
sleepq_lock(ident);
wakeup_swapper = sleepq_signal(ident, SLEEPQ_SLEEP, 0, 0);
sleepq_release(ident);
if (wakeup_swapper)
kick_proc0();
}
static void
kdb_switch(void)
{
thread_unlock(curthread);
kdb_backtrace();
kdb_reenter();
panic("%s: did not reenter debugger", __func__);
}
/*
* The machine independent parts of context switching.
*/
void
mi_switch(int flags, struct thread *newtd)
{
uint64_t runtime, new_switchtime;
struct thread *td;
struct proc *p;
td = curthread; /* XXX */
THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
p = td->td_proc; /* XXX */
KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
#ifdef INVARIANTS
if (!TD_ON_LOCK(td) && !TD_IS_RUNNING(td))
mtx_assert(&Giant, MA_NOTOWNED);
#endif
KASSERT(td->td_critnest == 1 || panicstr,
("mi_switch: switch in a critical section"));
KASSERT((flags & (SW_INVOL | SW_VOL)) != 0,
("mi_switch: switch must be voluntary or involuntary"));
KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
/*
* Don't perform context switches from the debugger.
*/
if (kdb_active)
kdb_switch();
if (flags & SW_VOL) {
td->td_ru.ru_nvcsw++;
td->td_swvoltick = ticks;
} else
td->td_ru.ru_nivcsw++;
#ifdef SCHED_STATS
SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
#endif
/*
* Compute the amount of time during which the current
* thread was running, and add that to its total so far.
*/
new_switchtime = cpu_ticks();
runtime = new_switchtime - PCPU_GET(switchtime);
td->td_runtime += runtime;
td->td_incruntime += runtime;
PCPU_SET(switchtime, new_switchtime);
td->td_generation++; /* bump preempt-detect counter */
PCPU_INC(cnt.v_swtch);
PCPU_SET(switchticks, ticks);
CTR4(KTR_PROC, "mi_switch: old thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td->td_sched, p->p_pid, td->td_name);
#if (KTR_COMPILE & KTR_SCHED) != 0
if (TD_IS_IDLETHREAD(td))
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
"prio:%d", td->td_priority);
else
KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
"prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
"lockname:\"%s\"", td->td_lockname);
#endif
#ifdef XEN
PT_UPDATES_FLUSH();
#endif
sched_switch(td, newtd, flags);
KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
"prio:%d", td->td_priority);
CTR4(KTR_PROC, "mi_switch: new thread %ld (td_sched %p, pid %ld, %s)",
td->td_tid, td->td_sched, p->p_pid, td->td_name);
/*
* If the last thread was exiting, finish cleaning it up.
*/
if ((td = PCPU_GET(deadthread))) {
PCPU_SET(deadthread, NULL);
thread_stash(td);
}
}
/*
* Change thread state to be runnable, placing it on the run queue if
* it is in memory. If it is swapped out, return true so our caller
* will know to awaken the swapper.
*/
int
setrunnable(struct thread *td)
{
THREAD_LOCK_ASSERT(td, MA_OWNED);
KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
switch (td->td_state) {
case TDS_RUNNING:
case TDS_RUNQ:
return (0);
case TDS_INHIBITED:
/*
* If we are only inhibited because we are swapped out
* then arrange to swap in this process. Otherwise just return.
*/
if (td->td_inhibitors != TDI_SWAPPED)
return (0);
/* FALLTHROUGH */
case TDS_CAN_RUN:
break;
default:
printf("state is 0x%x", td->td_state);
panic("setrunnable(2)");
}
if ((td->td_flags & TDF_INMEM) == 0) {
if ((td->td_flags & TDF_SWAPINREQ) == 0) {
td->td_flags |= TDF_SWAPINREQ;
return (1);
}
} else
sched_wakeup(td);
return (0);
}
/*
* Compute a tenex style load average of a quantity on
* 1, 5 and 15 minute intervals.
*/
static void
loadav(void *arg)
{
int i, nrun;
struct loadavg *avg;
nrun = sched_load();
avg = &averunnable;
for (i = 0; i < 3; i++)
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
/*
* Schedule the next update to occur after 5 seconds, but add a
* random variation to avoid synchronisation with processes that
* run at regular intervals.
*/
callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
loadav, NULL);
}
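/*
 * Editor's note: a worked instance of the fixed-point update above,
 * assuming the usual FSHIFT of 11 (so FSCALE == 2048) and the 1-minute
 * constant cexp[0] ~= 0.92 * FSCALE == 1884:
 *
 *	old ldavg[0] = 2048				(a load of 1.00)
 *	nrun         = 3
 *	new ldavg[0] = (1884 * 2048 + 3 * 2048 * (2048 - 1884)) >> 11
 *	             = 2376				(a load of ~1.16)
 *
 * which matches the continuous form 0.92 * 1.00 + (1 - 0.92) * 3.
 */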
/* ARGSUSED */
static void
synch_setup(void *dummy)
{
callout_init(&loadav_callout, CALLOUT_MPSAFE);
/* Kick off timeout driven events by calling first time. */
loadav(NULL);
}
int
should_yield(void)
{
return (ticks - curthread->td_swvoltick >= hogticks);
}
void
maybe_yield(void)
{
if (should_yield())
kern_yield(PRI_USER);
}
void
kern_yield(int prio)
{
struct thread *td;
td = curthread;
DROP_GIANT();
thread_lock(td);
if (prio == PRI_USER)
prio = td->td_user_pri;
if (prio >= 0)
sched_prio(td, prio);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
PICKUP_GIANT();
}
/*
* General purpose yield system call.
*/
int
-yield(struct thread *td, struct yield_args *uap)
+sys_yield(struct thread *td, struct yield_args *uap)
{
thread_lock(td);
if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
sched_prio(td, PRI_MAX_TIMESHARE);
mi_switch(SW_VOL | SWT_RELINQUISH, NULL);
thread_unlock(td);
td->td_retval[0] = 0;
return (0);
}
Index: head/sys/kern/kern_sysctl.c
===================================================================
--- head/sys/kern/kern_sysctl.c (revision 225616)
+++ head/sys/kern/kern_sysctl.c (revision 225617)
@@ -1,1668 +1,1668 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
* project, to make these variables more userfriendly.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/fail.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/sysproto.h>
#include <sys/uio.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <net/vnet.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic");
static MALLOC_DEFINE(M_SYSCTLOID, "sysctloid", "sysctl dynamic oids");
static MALLOC_DEFINE(M_SYSCTLTMP, "sysctltmp", "sysctl temp output buffer");
/*
* The sysctllock protects the MIB tree. It also protects sysctl
* contexts used with dynamic sysctls. The sysctl_register_oid() and
* sysctl_unregister_oid() routines require the sysctllock to already
* be held, so the sysctl_lock() and sysctl_unlock() routines are
* provided for the few places in the kernel which need to use that
* API rather than using the dynamic API. Use of the dynamic API is
* strongly encouraged for most code.
*
* The sysctlmemlock is used to limit the amount of user memory wired for
* sysctl requests. This is implemented by serializing any userland
* sysctl requests larger than a single page via an exclusive lock.
*/
static struct sx sysctllock;
static struct sx sysctlmemlock;
#define SYSCTL_XLOCK() sx_xlock(&sysctllock)
#define SYSCTL_XUNLOCK() sx_xunlock(&sysctllock)
#define SYSCTL_ASSERT_XLOCKED() sx_assert(&sysctllock, SA_XLOCKED)
#define SYSCTL_INIT() sx_init(&sysctllock, "sysctl lock")
#define SYSCTL_SLEEP(ch, wmesg, timo) \
sx_sleep(ch, &sysctllock, 0, wmesg, timo)
static int sysctl_root(SYSCTL_HANDLER_ARGS);
struct sysctl_oid_list sysctl__children; /* root list */
static int sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del,
int recurse);
static struct sysctl_oid *
sysctl_find_oidname(const char *name, struct sysctl_oid_list *list)
{
struct sysctl_oid *oidp;
SYSCTL_ASSERT_XLOCKED();
SLIST_FOREACH(oidp, list, oid_link) {
if (strcmp(oidp->oid_name, name) == 0) {
return (oidp);
}
}
return (NULL);
}
/*
* Initialization of the MIB tree.
*
* Order by number in each list.
*/
void
sysctl_lock(void)
{
SYSCTL_XLOCK();
}
void
sysctl_unlock(void)
{
SYSCTL_XUNLOCK();
}
void
sysctl_register_oid(struct sysctl_oid *oidp)
{
struct sysctl_oid_list *parent = oidp->oid_parent;
struct sysctl_oid *p;
struct sysctl_oid *q;
/*
* First check if another oid with the same name already
* exists in the parent's list.
*/
SYSCTL_ASSERT_XLOCKED();
p = sysctl_find_oidname(oidp->oid_name, parent);
if (p != NULL) {
if ((p->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
p->oid_refcnt++;
return;
} else {
printf("can't re-use a leaf (%s)!\n", p->oid_name);
return;
}
}
/*
* If this oid has a number OID_AUTO, give it a number which
* is greater than any current oid.
* NOTE: DO NOT change the starting value here, change it in
* <sys/sysctl.h>, and make sure it is at least 256 to
* accommodate e.g. net.inet.raw as a static sysctl node.
*/
if (oidp->oid_number == OID_AUTO) {
static int newoid = CTL_AUTO_START;
oidp->oid_number = newoid++;
if (newoid == 0x7fffffff)
panic("out of oids");
}
#if 0
else if (oidp->oid_number >= CTL_AUTO_START) {
/* do not panic; this happens when unregistering sysctl sets */
printf("static sysctl oid too high: %d", oidp->oid_number);
}
#endif
/*
* Insert the oid into the parent's list in order.
*/
q = NULL;
SLIST_FOREACH(p, parent, oid_link) {
if (oidp->oid_number < p->oid_number)
break;
q = p;
}
if (q)
SLIST_INSERT_AFTER(q, oidp, oid_link);
else
SLIST_INSERT_HEAD(parent, oidp, oid_link);
}
void
sysctl_unregister_oid(struct sysctl_oid *oidp)
{
struct sysctl_oid *p;
int error;
SYSCTL_ASSERT_XLOCKED();
error = ENOENT;
if (oidp->oid_number == OID_AUTO) {
error = EINVAL;
} else {
SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
if (p == oidp) {
SLIST_REMOVE(oidp->oid_parent, oidp,
sysctl_oid, oid_link);
error = 0;
break;
}
}
}
/*
* This can happen when a module fails to register and is
* being unloaded afterwards. It should not be a panic()
* for normal use.
*/
if (error)
printf("%s: failed to unregister sysctl\n", __func__);
}
/* Initialize a new context to keep track of dynamically added sysctls. */
int
sysctl_ctx_init(struct sysctl_ctx_list *c)
{
if (c == NULL) {
return (EINVAL);
}
/*
* No locking here, the caller is responsible for not adding
* new nodes to a context until after this function has
* returned.
*/
TAILQ_INIT(c);
return (0);
}
/* Free the context, and destroy all dynamic oids registered in this context */
int
sysctl_ctx_free(struct sysctl_ctx_list *clist)
{
struct sysctl_ctx_entry *e, *e1;
int error;
error = 0;
/*
* First perform a "dry run" to check if it's ok to remove oids.
* XXX FIXME
* XXX This algorithm is a hack. But I don't know any
* XXX better solution for now...
*/
SYSCTL_XLOCK();
TAILQ_FOREACH(e, clist, link) {
error = sysctl_remove_oid_locked(e->entry, 0, 0);
if (error)
break;
}
/*
* Restore deregistered entries, either from the end,
* or from the place where the error occurred.
* e contains the entry that was not unregistered.
*/
if (error)
e1 = TAILQ_PREV(e, sysctl_ctx_list, link);
else
e1 = TAILQ_LAST(clist, sysctl_ctx_list);
while (e1 != NULL) {
sysctl_register_oid(e1->entry);
e1 = TAILQ_PREV(e1, sysctl_ctx_list, link);
}
if (error) {
SYSCTL_XUNLOCK();
return(EBUSY);
}
/* Now really delete the entries */
e = TAILQ_FIRST(clist);
while (e != NULL) {
e1 = TAILQ_NEXT(e, link);
error = sysctl_remove_oid_locked(e->entry, 1, 0);
if (error)
panic("sysctl_remove_oid: corrupt tree, entry: %s",
e->entry->oid_name);
free(e, M_SYSCTLOID);
e = e1;
}
SYSCTL_XUNLOCK();
return (error);
}
/* Add an entry to the context */
struct sysctl_ctx_entry *
sysctl_ctx_entry_add(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
{
struct sysctl_ctx_entry *e;
SYSCTL_ASSERT_XLOCKED();
if (clist == NULL || oidp == NULL)
return(NULL);
e = malloc(sizeof(struct sysctl_ctx_entry), M_SYSCTLOID, M_WAITOK);
e->entry = oidp;
TAILQ_INSERT_HEAD(clist, e, link);
return (e);
}
/* Find an entry in the context */
struct sysctl_ctx_entry *
sysctl_ctx_entry_find(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
{
struct sysctl_ctx_entry *e;
SYSCTL_ASSERT_XLOCKED();
if (clist == NULL || oidp == NULL)
return(NULL);
TAILQ_FOREACH(e, clist, link) {
if(e->entry == oidp)
return(e);
}
return (e);
}
/*
* Delete an entry from the context.
* NOTE: this function doesn't free oidp! You have to remove it
* with sysctl_remove_oid().
*/
int
sysctl_ctx_entry_del(struct sysctl_ctx_list *clist, struct sysctl_oid *oidp)
{
struct sysctl_ctx_entry *e;
if (clist == NULL || oidp == NULL)
return (EINVAL);
SYSCTL_XLOCK();
e = sysctl_ctx_entry_find(clist, oidp);
if (e != NULL) {
TAILQ_REMOVE(clist, e, link);
SYSCTL_XUNLOCK();
free(e, M_SYSCTLOID);
return (0);
} else {
SYSCTL_XUNLOCK();
return (ENOENT);
}
}
/*
* Remove dynamically created sysctl trees.
* oidp - top of the tree to be removed
* del - if 0 - just deregister, otherwise free up entries as well
* recurse - if != 0 traverse the subtree to be deleted
*/
int
sysctl_remove_oid(struct sysctl_oid *oidp, int del, int recurse)
{
int error;
SYSCTL_XLOCK();
error = sysctl_remove_oid_locked(oidp, del, recurse);
SYSCTL_XUNLOCK();
return (error);
}
int
sysctl_remove_name(struct sysctl_oid *parent, const char *name,
int del, int recurse)
{
struct sysctl_oid *p, *tmp;
int error;
error = ENOENT;
SYSCTL_XLOCK();
SLIST_FOREACH_SAFE(p, SYSCTL_CHILDREN(parent), oid_link, tmp) {
if (strcmp(p->oid_name, name) == 0) {
error = sysctl_remove_oid_locked(p, del, recurse);
break;
}
}
SYSCTL_XUNLOCK();
return (error);
}
static int
sysctl_remove_oid_locked(struct sysctl_oid *oidp, int del, int recurse)
{
struct sysctl_oid *p, *tmp;
int error;
SYSCTL_ASSERT_XLOCKED();
if (oidp == NULL)
return(EINVAL);
if ((oidp->oid_kind & CTLFLAG_DYN) == 0) {
printf("can't remove non-dynamic nodes!\n");
return (EINVAL);
}
/*
* WARNING: normal method to do this should be through
* sysctl_ctx_free(). Use recursion only as a last-resort
* method to purge your sysctl tree of leftovers...
* However, if some other code still references these nodes,
* it will panic.
*/
if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
if (oidp->oid_refcnt == 1) {
SLIST_FOREACH_SAFE(p,
SYSCTL_CHILDREN(oidp), oid_link, tmp) {
if (!recurse)
return (ENOTEMPTY);
error = sysctl_remove_oid_locked(p, del,
recurse);
if (error)
return (error);
}
if (del)
free(SYSCTL_CHILDREN(oidp), M_SYSCTLOID);
}
}
if (oidp->oid_refcnt > 1 ) {
oidp->oid_refcnt--;
} else {
if (oidp->oid_refcnt == 0) {
printf("Warning: bad oid_refcnt=%u (%s)!\n",
oidp->oid_refcnt, oidp->oid_name);
return (EINVAL);
}
sysctl_unregister_oid(oidp);
if (del) {
/*
* Wait for all threads running the handler to drain.
* This preserves the previous behavior when the
* sysctl lock was held across a handler invocation,
* and is necessary for module unload correctness.
*/
while (oidp->oid_running > 0) {
oidp->oid_kind |= CTLFLAG_DYING;
SYSCTL_SLEEP(&oidp->oid_running, "oidrm", 0);
}
if (oidp->oid_descr)
free((void *)(uintptr_t)(const void *)oidp->oid_descr, M_SYSCTLOID);
free((void *)(uintptr_t)(const void *)oidp->oid_name,
M_SYSCTLOID);
free(oidp, M_SYSCTLOID);
}
}
return (0);
}
/*
* Create new sysctls at run time.
* clist may point to a valid context initialized with sysctl_ctx_init().
*/
struct sysctl_oid *
sysctl_add_oid(struct sysctl_ctx_list *clist, struct sysctl_oid_list *parent,
int number, const char *name, int kind, void *arg1, intptr_t arg2,
int (*handler)(SYSCTL_HANDLER_ARGS), const char *fmt, const char *descr)
{
struct sysctl_oid *oidp;
ssize_t len;
char *newname;
/* You have to hook up somewhere.. */
if (parent == NULL)
return(NULL);
/* Check if the node already exists, otherwise create it */
SYSCTL_XLOCK();
oidp = sysctl_find_oidname(name, parent);
if (oidp != NULL) {
if ((oidp->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
oidp->oid_refcnt++;
/* Update the context */
if (clist != NULL)
sysctl_ctx_entry_add(clist, oidp);
SYSCTL_XUNLOCK();
return (oidp);
} else {
SYSCTL_XUNLOCK();
printf("can't re-use a leaf (%s)!\n", name);
return (NULL);
}
}
oidp = malloc(sizeof(struct sysctl_oid), M_SYSCTLOID, M_WAITOK|M_ZERO);
oidp->oid_parent = parent;
SLIST_NEXT(oidp, oid_link) = NULL;
oidp->oid_number = number;
oidp->oid_refcnt = 1;
len = strlen(name);
newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
bcopy(name, newname, len + 1);
newname[len] = '\0';
oidp->oid_name = newname;
oidp->oid_handler = handler;
oidp->oid_kind = CTLFLAG_DYN | kind;
if ((kind & CTLTYPE) == CTLTYPE_NODE) {
/* Allocate space for children */
SYSCTL_CHILDREN_SET(oidp, malloc(sizeof(struct sysctl_oid_list),
M_SYSCTLOID, M_WAITOK));
SLIST_INIT(SYSCTL_CHILDREN(oidp));
oidp->oid_arg2 = arg2;
} else {
oidp->oid_arg1 = arg1;
oidp->oid_arg2 = arg2;
}
oidp->oid_fmt = fmt;
if (descr) {
int len = strlen(descr) + 1;
oidp->oid_descr = malloc(len, M_SYSCTLOID, M_WAITOK);
if (oidp->oid_descr)
strcpy((char *)(uintptr_t)(const void *)oidp->oid_descr, descr);
}
/* Update the context, if used */
if (clist != NULL)
sysctl_ctx_entry_add(clist, oidp);
/* Register this oid */
sysctl_register_oid(oidp);
SYSCTL_XUNLOCK();
return (oidp);
}
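/*
 * Editor's note: an illustrative dynamic-sysctl consumer (sc, sc_ctx,
 * sc_debug and the oid name are hypothetical).  The SYSCTL_ADD_* macros
 * are thin wrappers around sysctl_add_oid():
 *
 *	sysctl_ctx_init(&sc->sc_ctx);
 *	SYSCTL_ADD_INT(&sc->sc_ctx, SYSCTL_STATIC_CHILDREN(_debug),
 *	    OID_AUTO, "mydriver_debug", CTLFLAG_RW, &sc->sc_debug, 0,
 *	    "mydriver debug level");
 *	...
 *	sysctl_ctx_free(&sc->sc_ctx);	// on detach: removes the oid again
 */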
/*
* Rename an existing oid.
*/
void
sysctl_rename_oid(struct sysctl_oid *oidp, const char *name)
{
ssize_t len;
char *newname;
void *oldname;
len = strlen(name);
newname = malloc(len + 1, M_SYSCTLOID, M_WAITOK);
bcopy(name, newname, len + 1);
newname[len] = '\0';
SYSCTL_XLOCK();
oldname = (void *)(uintptr_t)(const void *)oidp->oid_name;
oidp->oid_name = newname;
SYSCTL_XUNLOCK();
free(oldname, M_SYSCTLOID);
}
/*
* Reparent an existing oid.
*/
int
sysctl_move_oid(struct sysctl_oid *oid, struct sysctl_oid_list *parent)
{
struct sysctl_oid *oidp;
SYSCTL_XLOCK();
if (oid->oid_parent == parent) {
SYSCTL_XUNLOCK();
return (0);
}
oidp = sysctl_find_oidname(oid->oid_name, parent);
if (oidp != NULL) {
SYSCTL_XUNLOCK();
return (EEXIST);
}
sysctl_unregister_oid(oid);
oid->oid_parent = parent;
oid->oid_number = OID_AUTO;
sysctl_register_oid(oid);
SYSCTL_XUNLOCK();
return (0);
}
/*
* Register the kernel's oids on startup.
*/
SET_DECLARE(sysctl_set, struct sysctl_oid);
static void
sysctl_register_all(void *arg)
{
struct sysctl_oid **oidp;
sx_init(&sysctlmemlock, "sysctl mem");
SYSCTL_INIT();
SYSCTL_XLOCK();
SET_FOREACH(oidp, sysctl_set)
sysctl_register_oid(*oidp);
SYSCTL_XUNLOCK();
}
SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_all, 0);
/*
* "Staff-functions"
*
* These functions implement a presently undocumented interface
* used by the sysctl program to walk the tree, and get the type
* so it can print the value.
* This interface is under work and consideration, and should probably
* be killed with a big axe by the first person who can find the time.
* (Be aware, though, that the proper interface isn't as obvious as it
* may seem; there are various conflicting requirements.)
*
* {0,0} printf the entire MIB-tree.
* {0,1,...} return the name of the "..." OID.
* {0,2,...} return the next OID.
* {0,3} return the OID of the name in "new"
* {0,4,...} return the kind & format info for the "..." OID.
* {0,5,...} return the description of the "..." OID.
*/
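/*
 * Editor's note: an illustrative userland use of the {0,3} name2oid
 * entry point described above (error handling omitted; the libc
 * sysctlnametomib(3) wrapper does essentially this):
 *
 *	int qoid[2] = { 0, 3 };
 *	int oid[CTL_MAXNAME];
 *	size_t len = sizeof(oid);
 *
 *	sysctl(qoid, 2, oid, &len, "kern.ostype", strlen("kern.ostype"));
 *	// oid[0 .. len / sizeof(int) - 1] now holds the numeric OID
 */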
#ifdef SYSCTL_DEBUG
static void
sysctl_sysctl_debug_dump_node(struct sysctl_oid_list *l, int i)
{
int k;
struct sysctl_oid *oidp;
SYSCTL_ASSERT_XLOCKED();
SLIST_FOREACH(oidp, l, oid_link) {
for (k=0; k<i; k++)
printf(" ");
printf("%d %s ", oidp->oid_number, oidp->oid_name);
printf("%c%c",
oidp->oid_kind & CTLFLAG_RD ? 'R':' ',
oidp->oid_kind & CTLFLAG_WR ? 'W':' ');
if (oidp->oid_handler)
printf(" *Handler");
switch (oidp->oid_kind & CTLTYPE) {
case CTLTYPE_NODE:
printf(" Node\n");
if (!oidp->oid_handler) {
sysctl_sysctl_debug_dump_node(
oidp->oid_arg1, i+2);
}
break;
case CTLTYPE_INT: printf(" Int\n"); break;
case CTLTYPE_UINT: printf(" u_int\n"); break;
case CTLTYPE_LONG: printf(" Long\n"); break;
case CTLTYPE_ULONG: printf(" u_long\n"); break;
case CTLTYPE_STRING: printf(" String\n"); break;
case CTLTYPE_U64: printf(" uint64_t\n"); break;
case CTLTYPE_S64: printf(" int64_t\n"); break;
case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
default: printf("\n");
}
}
}
static int
sysctl_sysctl_debug(SYSCTL_HANDLER_ARGS)
{
int error;
error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
if (error)
return (error);
SYSCTL_XLOCK();
sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
SYSCTL_XUNLOCK();
return (ENOENT);
}
SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
0, 0, sysctl_sysctl_debug, "-", "");
#endif
static int
sysctl_sysctl_name(SYSCTL_HANDLER_ARGS)
{
int *name = (int *) arg1;
u_int namelen = arg2;
int error = 0;
struct sysctl_oid *oid;
struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
char buf[10];
SYSCTL_XLOCK();
while (namelen) {
if (!lsp) {
snprintf(buf,sizeof(buf),"%d",*name);
if (req->oldidx)
error = SYSCTL_OUT(req, ".", 1);
if (!error)
error = SYSCTL_OUT(req, buf, strlen(buf));
if (error)
goto out;
namelen--;
name++;
continue;
}
lsp2 = 0;
SLIST_FOREACH(oid, lsp, oid_link) {
if (oid->oid_number != *name)
continue;
if (req->oldidx)
error = SYSCTL_OUT(req, ".", 1);
if (!error)
error = SYSCTL_OUT(req, oid->oid_name,
strlen(oid->oid_name));
if (error)
goto out;
namelen--;
name++;
if ((oid->oid_kind & CTLTYPE) != CTLTYPE_NODE)
break;
if (oid->oid_handler)
break;
lsp2 = SYSCTL_CHILDREN(oid);
break;
}
lsp = lsp2;
}
error = SYSCTL_OUT(req, "", 1);
out:
SYSCTL_XUNLOCK();
return (error);
}
/*
* XXXRW/JA: Shouldn't return name data for nodes that we don't permit in
* capability mode.
*/
static SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD | CTLFLAG_CAPRD,
sysctl_sysctl_name, "");
static int
sysctl_sysctl_next_ls(struct sysctl_oid_list *lsp, int *name, u_int namelen,
int *next, int *len, int level, struct sysctl_oid **oidpp)
{
struct sysctl_oid *oidp;
SYSCTL_ASSERT_XLOCKED();
*len = level;
SLIST_FOREACH(oidp, lsp, oid_link) {
*next = oidp->oid_number;
*oidpp = oidp;
if (oidp->oid_kind & CTLFLAG_SKIP)
continue;
if (!namelen) {
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
return (0);
if (oidp->oid_handler)
/* We really should call the handler here...*/
return (0);
lsp = SYSCTL_CHILDREN(oidp);
if (!sysctl_sysctl_next_ls(lsp, 0, 0, next+1,
len, level+1, oidpp))
return (0);
goto emptynode;
}
if (oidp->oid_number < *name)
continue;
if (oidp->oid_number > *name) {
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
return (0);
if (oidp->oid_handler)
return (0);
lsp = SYSCTL_CHILDREN(oidp);
if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1,
next+1, len, level+1, oidpp))
return (0);
goto next;
}
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
continue;
if (oidp->oid_handler)
continue;
lsp = SYSCTL_CHILDREN(oidp);
if (!sysctl_sysctl_next_ls(lsp, name+1, namelen-1, next+1,
len, level+1, oidpp))
return (0);
next:
namelen = 1;
emptynode:
*len = level;
}
return (1);
}
static int
sysctl_sysctl_next(SYSCTL_HANDLER_ARGS)
{
int *name = (int *) arg1;
u_int namelen = arg2;
int i, j, error;
struct sysctl_oid *oid;
struct sysctl_oid_list *lsp = &sysctl__children;
int newoid[CTL_MAXNAME];
SYSCTL_XLOCK();
i = sysctl_sysctl_next_ls(lsp, name, namelen, newoid, &j, 1, &oid);
SYSCTL_XUNLOCK();
if (i)
return (ENOENT);
error = SYSCTL_OUT(req, newoid, j * sizeof (int));
return (error);
}
/*
* XXXRW/JA: Shouldn't return next data for nodes that we don't permit in
* capability mode.
*/
static SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD | CTLFLAG_CAPRD,
sysctl_sysctl_next, "");
static int
name2oid(char *name, int *oid, int *len, struct sysctl_oid **oidpp)
{
int i;
struct sysctl_oid *oidp;
struct sysctl_oid_list *lsp = &sysctl__children;
char *p;
SYSCTL_ASSERT_XLOCKED();
if (!*name)
return (ENOENT);
p = name + strlen(name) - 1 ;
if (*p == '.')
*p = '\0';
*len = 0;
for (p = name; *p && *p != '.'; p++)
;
i = *p;
if (i == '.')
*p = '\0';
oidp = SLIST_FIRST(lsp);
while (oidp && *len < CTL_MAXNAME) {
if (strcmp(name, oidp->oid_name)) {
oidp = SLIST_NEXT(oidp, oid_link);
continue;
}
*oid++ = oidp->oid_number;
(*len)++;
if (!i) {
if (oidpp)
*oidpp = oidp;
return (0);
}
if ((oidp->oid_kind & CTLTYPE) != CTLTYPE_NODE)
break;
if (oidp->oid_handler)
break;
lsp = SYSCTL_CHILDREN(oidp);
oidp = SLIST_FIRST(lsp);
name = p+1;
for (p = name; *p && *p != '.'; p++)
;
i = *p;
if (i == '.')
*p = '\0';
}
return (ENOENT);
}
static int
sysctl_sysctl_name2oid(SYSCTL_HANDLER_ARGS)
{
char *p;
int error, oid[CTL_MAXNAME], len = 0;
struct sysctl_oid *op = 0;
if (!req->newlen)
return (ENOENT);
if (req->newlen >= MAXPATHLEN) /* XXX arbitrary, undocumented */
return (ENAMETOOLONG);
p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
error = SYSCTL_IN(req, p, req->newlen);
if (error) {
free(p, M_SYSCTL);
return (error);
}
p [req->newlen] = '\0';
SYSCTL_XLOCK();
error = name2oid(p, oid, &len, &op);
SYSCTL_XUNLOCK();
free(p, M_SYSCTL);
if (error)
return (error);
error = SYSCTL_OUT(req, oid, len * sizeof *oid);
return (error);
}
/*
* XXXRW/JA: Shouldn't return name2oid data for nodes that we don't permit in
* capability mode.
*/
SYSCTL_PROC(_sysctl, 3, name2oid,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MPSAFE
| CTLFLAG_CAPRW, 0, 0, sysctl_sysctl_name2oid, "I", "");
static int
sysctl_sysctl_oidfmt(SYSCTL_HANDLER_ARGS)
{
struct sysctl_oid *oid;
int error;
SYSCTL_XLOCK();
error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
if (error)
goto out;
if (oid->oid_fmt == NULL) {
error = ENOENT;
goto out;
}
error = SYSCTL_OUT(req, &oid->oid_kind, sizeof(oid->oid_kind));
if (error)
goto out;
error = SYSCTL_OUT(req, oid->oid_fmt, strlen(oid->oid_fmt) + 1);
out:
SYSCTL_XUNLOCK();
return (error);
}
static SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD|CTLFLAG_MPSAFE|CTLFLAG_CAPRD,
sysctl_sysctl_oidfmt, "");
static int
sysctl_sysctl_oiddescr(SYSCTL_HANDLER_ARGS)
{
struct sysctl_oid *oid;
int error;
SYSCTL_XLOCK();
error = sysctl_find_oid(arg1, arg2, &oid, NULL, req);
if (error)
goto out;
if (oid->oid_descr == NULL) {
error = ENOENT;
goto out;
}
error = SYSCTL_OUT(req, oid->oid_descr, strlen(oid->oid_descr) + 1);
out:
SYSCTL_XUNLOCK();
return (error);
}
static SYSCTL_NODE(_sysctl, 5, oiddescr, CTLFLAG_RD|CTLFLAG_CAPRD,
sysctl_sysctl_oiddescr, "");
/*
* Default "handler" functions.
*/
/*
* Handle an int, signed or unsigned.
* Two cases:
* a variable: point arg1 at it.
* a constant: pass it in arg2.
*/
int
sysctl_handle_int(SYSCTL_HANDLER_ARGS)
{
int tmpout, error = 0;
/*
* Attempt to get a coherent snapshot by making a copy of the data.
*/
if (arg1)
tmpout = *(int *)arg1;
else
tmpout = arg2;
error = SYSCTL_OUT(req, &tmpout, sizeof(int));
if (error || !req->newptr)
return (error);
if (!arg1)
error = EPERM;
else
error = SYSCTL_IN(req, arg1, sizeof(int));
return (error);
}
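/*
 * Illustrative sketch, not part of this revision: a typical private
 * sysctl built on sysctl_handle_int() that validates the new value
 * before accepting it.  The names example_limit and
 * example_limit_sysctl are hypothetical; <sys/sysctl.h> is assumed to
 * already be included, as it is in the files above.
 */
static int example_limit = 10;

static int
example_limit_sysctl(SYSCTL_HANDLER_ARGS)
{
        int error, val;

        val = example_limit;
        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (val < 1 || val > 100)
                return (EINVAL);
        example_limit = val;
        return (0);
}
SYSCTL_PROC(_debug, OID_AUTO, example_limit,
    CTLTYPE_INT | CTLFLAG_RW, NULL, 0, example_limit_sysctl, "I",
    "Example limit (1-100)");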
/*
* Based on sysctl_handle_int(), convert milliseconds into ticks.
* Note: this is used by TCP.
*/
int
sysctl_msec_to_ticks(SYSCTL_HANDLER_ARGS)
{
int error, s, tt;
tt = *(int *)arg1;
s = (int)((int64_t)tt * 1000 / hz);
error = sysctl_handle_int(oidp, &s, 0, req);
if (error || !req->newptr)
return (error);
tt = (int)((int64_t)s * hz / 1000);
if (tt < 1)
return (EINVAL);
*(int *)arg1 = tt;
return (0);
}
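/*
 * Illustrative sketch, not part of this revision: exposing a variable
 * kept in ticks as a millisecond-valued sysctl through
 * sysctl_msec_to_ticks().  example_timeout_ticks is a hypothetical
 * name; arg1 must point at the tick-valued variable, which should be
 * initialized elsewhere (e.g. to 5 * hz), since hz is not a constant.
 */
static int example_timeout_ticks;

SYSCTL_PROC(_debug, OID_AUTO, example_timeout_msec,
    CTLTYPE_INT | CTLFLAG_RW, &example_timeout_ticks, 0,
    sysctl_msec_to_ticks, "I", "Example timeout (milliseconds)");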
/*
* Handle a long, signed or unsigned. arg1 points to it.
*/
int
sysctl_handle_long(SYSCTL_HANDLER_ARGS)
{
int error = 0;
long tmplong;
#ifdef SCTL_MASK32
int tmpint;
#endif
/*
* Attempt to get a coherent snapshot by making a copy of the data.
*/
if (!arg1)
return (EINVAL);
tmplong = *(long *)arg1;
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
tmpint = tmplong;
error = SYSCTL_OUT(req, &tmpint, sizeof(int));
} else
#endif
error = SYSCTL_OUT(req, &tmplong, sizeof(long));
if (error || !req->newptr)
return (error);
#ifdef SCTL_MASK32
if (req->flags & SCTL_MASK32) {
error = SYSCTL_IN(req, &tmpint, sizeof(int));
*(long *)arg1 = (long)tmpint;
} else
#endif
error = SYSCTL_IN(req, arg1, sizeof(long));
return (error);
}
/*
* Handle a 64 bit int, signed or unsigned. arg1 points to it.
*/
int
sysctl_handle_64(SYSCTL_HANDLER_ARGS)
{
int error = 0;
uint64_t tmpout;
/*
* Attempt to get a coherent snapshot by making a copy of the data.
*/
if (!arg1)
return (EINVAL);
tmpout = *(uint64_t *)arg1;
error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
if (error || !req->newptr)
return (error);
error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
return (error);
}
/*
* Handle our generic '\0' terminated 'C' string.
* Two cases:
* a variable string: point arg1 at it, arg2 is max length.
* a constant string: point arg1 at it, arg2 is zero.
*/
int
sysctl_handle_string(SYSCTL_HANDLER_ARGS)
{
int error = 0;
char *tmparg;
size_t outlen;
/*
* Attempt to get a coherent snapshot by copying to a
* temporary kernel buffer.
*/
retry:
outlen = strlen((char *)arg1)+1;
tmparg = malloc(outlen, M_SYSCTLTMP, M_WAITOK);
if (strlcpy(tmparg, (char *)arg1, outlen) >= outlen) {
	/* The source string grew while we were copying it; retry. */
	free(tmparg, M_SYSCTLTMP);
	goto retry;
}
error = SYSCTL_OUT(req, tmparg, outlen);
free(tmparg, M_SYSCTLTMP);
if (error || !req->newptr)
return (error);
if ((req->newlen - req->newidx) >= arg2) {
error = EINVAL;
} else {
arg2 = (req->newlen - req->newidx);
error = SYSCTL_IN(req, arg1, arg2);
((char *)arg1)[arg2] = '\0';
}
return (error);
}
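/*
 * Illustrative sketch, not part of this revision: most string sysctls
 * do not need a custom handler; SYSCTL_STRING() wires up
 * sysctl_handle_string() with arg1/arg2 used as described above.
 * example_ident is a hypothetical name.
 */
static char example_ident[32] = "default";

SYSCTL_STRING(_debug, OID_AUTO, example_ident, CTLFLAG_RW,
    example_ident, sizeof(example_ident), "Example identifier string");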
/*
* Handle any kind of opaque data.
* arg1 points to it, arg2 is the size.
*/
int
sysctl_handle_opaque(SYSCTL_HANDLER_ARGS)
{
int error, tries;
u_int generation;
struct sysctl_req req2;
/*
* Attempt to get a coherent snapshot, by using the thread
* pre-emption counter updated from within mi_switch() to
* determine if we were pre-empted during a bcopy() or
* copyout(). Make 3 attempts at doing this before giving up.
* If we encounter an error, stop immediately.
*/
tries = 0;
req2 = *req;
retry:
generation = curthread->td_generation;
error = SYSCTL_OUT(req, arg1, arg2);
if (error)
return (error);
tries++;
if (generation != curthread->td_generation && tries < 3) {
*req = req2;
goto retry;
}
error = SYSCTL_IN(req, arg1, arg2);
return (error);
}
/*
* Transfer functions to/from kernel space.
* XXX: rather untested at this point
*/
static int
sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l)
{
size_t i = 0;
if (req->oldptr) {
i = l;
if (req->oldlen <= req->oldidx)
i = 0;
else
if (i > req->oldlen - req->oldidx)
i = req->oldlen - req->oldidx;
if (i > 0)
bcopy(p, (char *)req->oldptr + req->oldidx, i);
}
req->oldidx += l;
if (req->oldptr && i != l)
return (ENOMEM);
return (0);
}
static int
sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l)
{
if (!req->newptr)
return (0);
if (req->newlen - req->newidx < l)
return (EINVAL);
bcopy((char *)req->newptr + req->newidx, p, l);
req->newidx += l;
return (0);
}
int
kernel_sysctl(struct thread *td, int *name, u_int namelen, void *old,
size_t *oldlenp, void *new, size_t newlen, size_t *retval, int flags)
{
int error = 0;
struct sysctl_req req;
bzero(&req, sizeof req);
req.td = td;
req.flags = flags;
if (oldlenp) {
req.oldlen = *oldlenp;
}
req.validlen = req.oldlen;
if (old) {
req.oldptr = old;
}
if (new != NULL) {
req.newlen = newlen;
req.newptr = new;
}
req.oldfunc = sysctl_old_kernel;
req.newfunc = sysctl_new_kernel;
req.lock = REQ_UNWIRED;
SYSCTL_XLOCK();
error = sysctl_root(0, name, namelen, &req);
SYSCTL_XUNLOCK();
if (req.lock == REQ_WIRED && req.validlen > 0)
vsunlock(req.oldptr, req.validlen);
if (error && error != ENOMEM)
return (error);
if (retval) {
if (req.oldptr && req.oldidx > req.validlen)
*retval = req.validlen;
else
*retval = req.oldidx;
}
return (error);
}
int
kernel_sysctlbyname(struct thread *td, char *name, void *old, size_t *oldlenp,
void *new, size_t newlen, size_t *retval, int flags)
{
int oid[CTL_MAXNAME];
size_t oidlen, plen;
int error;
oid[0] = 0; /* sysctl internal magic */
oid[1] = 3; /* name2oid */
oidlen = sizeof(oid);
error = kernel_sysctl(td, oid, 2, oid, &oidlen,
(void *)name, strlen(name), &plen, flags);
if (error)
return (error);
error = kernel_sysctl(td, oid, plen / sizeof(int), old, oldlenp,
new, newlen, retval, flags);
return (error);
}
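/*
 * Illustrative sketch, not part of this revision: reading an integer
 * sysctl from kernel code by name.  "hw.ncpu" is only an example of a
 * well-known read-only OID; example_read_ncpu is a hypothetical name.
 */
static int
example_read_ncpu(struct thread *td)
{
        int error, ncpu;
        size_t len;

        len = sizeof(ncpu);
        error = kernel_sysctlbyname(td, "hw.ncpu", &ncpu, &len,
            NULL, 0, NULL, 0);
        if (error == 0)
                printf("hw.ncpu = %d\n", ncpu);
        return (error);
}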
/*
* Transfer function to/from user space.
*/
static int
sysctl_old_user(struct sysctl_req *req, const void *p, size_t l)
{
int error = 0;
size_t i, len, origidx;
origidx = req->oldidx;
req->oldidx += l;
if (req->oldptr == NULL)
return (0);
/*
* If we have not wired the user supplied buffer and we are currently
* holding locks, drop a witness warning, as it's possible that
* write operations to the user page can sleep.
*/
if (req->lock != REQ_WIRED)
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"sysctl_old_user()");
i = l;
len = req->validlen;
if (len <= origidx)
i = 0;
else {
if (i > len - origidx)
i = len - origidx;
error = copyout(p, (char *)req->oldptr + origidx, i);
}
if (error)
return (error);
if (i < l)
return (ENOMEM);
return (0);
}
static int
sysctl_new_user(struct sysctl_req *req, void *p, size_t l)
{
int error;
if (!req->newptr)
return (0);
if (req->newlen - req->newidx < l)
return (EINVAL);
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"sysctl_new_user()");
error = copyin((char *)req->newptr + req->newidx, p, l);
req->newidx += l;
return (error);
}
/*
* Wire the user space destination buffer. If set to a value greater than
* zero, the len parameter limits the maximum amount of wired memory.
*/
int
sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
{
int ret;
size_t wiredlen;
wiredlen = (len > 0 && len < req->oldlen) ? len : req->oldlen;
ret = 0;
if (req->lock != REQ_WIRED && req->oldptr &&
req->oldfunc == sysctl_old_user) {
if (wiredlen != 0) {
ret = vslock(req->oldptr, wiredlen);
if (ret != 0) {
if (ret != ENOMEM)
return (ret);
wiredlen = 0;
}
}
req->lock = REQ_WIRED;
req->validlen = wiredlen;
}
return (0);
}
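/*
 * Illustrative sketch, not part of this revision: a handler that must
 * call SYSCTL_OUT() while holding a non-sleepable lock wires the old
 * (user) buffer first, so the copyout cannot fault and sleep.
 * example_mtx and example_stats are hypothetical; example_mtx is
 * assumed to have been set up with mtx_init() elsewhere.
 */
static struct mtx example_mtx;
static struct { int packets; int errors; } example_stats;

static int
example_stats_sysctl(SYSCTL_HANDLER_ARGS)
{
        int error;

        error = sysctl_wire_old_buffer(req, sizeof(example_stats));
        if (error != 0)
                return (error);
        mtx_lock(&example_mtx);
        error = SYSCTL_OUT(req, &example_stats, sizeof(example_stats));
        mtx_unlock(&example_mtx);
        return (error);
}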
int
sysctl_find_oid(int *name, u_int namelen, struct sysctl_oid **noid,
int *nindx, struct sysctl_req *req)
{
struct sysctl_oid_list *lsp;
struct sysctl_oid *oid;
int indx;
SYSCTL_ASSERT_XLOCKED();
lsp = &sysctl__children;
indx = 0;
while (indx < CTL_MAXNAME) {
SLIST_FOREACH(oid, lsp, oid_link) {
if (oid->oid_number == name[indx])
break;
}
if (oid == NULL)
return (ENOENT);
indx++;
if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
if (oid->oid_handler != NULL || indx == namelen) {
*noid = oid;
if (nindx != NULL)
*nindx = indx;
KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
("%s found DYING node %p", __func__, oid));
return (0);
}
lsp = SYSCTL_CHILDREN(oid);
} else if (indx == namelen) {
*noid = oid;
if (nindx != NULL)
*nindx = indx;
KASSERT((oid->oid_kind & CTLFLAG_DYING) == 0,
("%s found DYING node %p", __func__, oid));
return (0);
} else {
return (ENOTDIR);
}
}
return (ENOENT);
}
/*
* Traverse our tree, and find the right node, execute whatever it points
* to, and return the resulting error code.
*/
static int
sysctl_root(SYSCTL_HANDLER_ARGS)
{
struct sysctl_oid *oid;
int error, indx, lvl;
SYSCTL_ASSERT_XLOCKED();
error = sysctl_find_oid(arg1, arg2, &oid, &indx, req);
if (error)
return (error);
if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
/*
* You can't call a sysctl when it's a node that has no
* handler.  Inform the user that it's a node.
* The indx may or may not be the same as namelen.
*/
if (oid->oid_handler == NULL)
return (EISDIR);
}
/* Is this sysctl writable? */
if (req->newptr && !(oid->oid_kind & CTLFLAG_WR))
return (EPERM);
KASSERT(req->td != NULL, ("sysctl_root(): req->td == NULL"));
#ifdef CAPABILITY_MODE
/*
* If the process is in capability mode, then don't permit reading or
* writing unless specifically granted for the node.
*/
if (IN_CAPABILITY_MODE(req->td)) {
if (req->oldptr && !(oid->oid_kind & CTLFLAG_CAPRD))
return (EPERM);
if (req->newptr && !(oid->oid_kind & CTLFLAG_CAPWR))
return (EPERM);
}
#endif
/* Is this sysctl sensitive to securelevels? */
if (req->newptr && (oid->oid_kind & CTLFLAG_SECURE)) {
lvl = (oid->oid_kind & CTLMASK_SECURE) >> CTLSHIFT_SECURE;
error = securelevel_gt(req->td->td_ucred, lvl);
if (error)
return (error);
}
/* Is this sysctl writable by only privileged users? */
if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
int priv;
if (oid->oid_kind & CTLFLAG_PRISON)
priv = PRIV_SYSCTL_WRITEJAIL;
#ifdef VIMAGE
else if ((oid->oid_kind & CTLFLAG_VNET) &&
prison_owns_vnet(req->td->td_ucred))
priv = PRIV_SYSCTL_WRITEJAIL;
#endif
else
priv = PRIV_SYSCTL_WRITE;
error = priv_check(req->td, priv);
if (error)
return (error);
}
if (!oid->oid_handler)
return (EINVAL);
if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
arg1 = (int *)arg1 + indx;
arg2 -= indx;
} else {
arg1 = oid->oid_arg1;
arg2 = oid->oid_arg2;
}
#ifdef MAC
error = mac_system_check_sysctl(req->td->td_ucred, oid, arg1, arg2,
req);
if (error != 0)
return (error);
#endif
oid->oid_running++;
SYSCTL_XUNLOCK();
if (!(oid->oid_kind & CTLFLAG_MPSAFE))
mtx_lock(&Giant);
error = oid->oid_handler(oid, arg1, arg2, req);
if (!(oid->oid_kind & CTLFLAG_MPSAFE))
mtx_unlock(&Giant);
KFAIL_POINT_ERROR(_debug_fail_point, sysctl_running, error);
SYSCTL_XLOCK();
oid->oid_running--;
if (oid->oid_running == 0 && (oid->oid_kind & CTLFLAG_DYING) != 0)
wakeup(&oid->oid_running);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct sysctl_args {
int *name;
u_int namelen;
void *old;
size_t *oldlenp;
void *new;
size_t newlen;
};
#endif
int
-__sysctl(struct thread *td, struct sysctl_args *uap)
+sys___sysctl(struct thread *td, struct sysctl_args *uap)
{
int error, i, name[CTL_MAXNAME];
size_t j;
if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
return (EINVAL);
error = copyin(uap->name, &name, uap->namelen * sizeof(int));
if (error)
return (error);
error = userland_sysctl(td, name, uap->namelen,
uap->old, uap->oldlenp, 0,
uap->new, uap->newlen, &j, 0);
if (error && error != ENOMEM)
return (error);
if (uap->oldlenp) {
i = copyout(&j, uap->oldlenp, sizeof(j));
if (i)
return (i);
}
return (error);
}
/*
* This is used from various compatibility syscalls too. That's why name
* must be in kernel space.
*/
int
userland_sysctl(struct thread *td, int *name, u_int namelen, void *old,
size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval,
int flags)
{
int error = 0, memlocked;
struct sysctl_req req;
bzero(&req, sizeof req);
req.td = td;
req.flags = flags;
if (oldlenp) {
if (inkernel) {
req.oldlen = *oldlenp;
} else {
error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
if (error)
return (error);
}
}
req.validlen = req.oldlen;
if (old) {
if (!useracc(old, req.oldlen, VM_PROT_WRITE))
return (EFAULT);
req.oldptr = old;
}
if (new != NULL) {
if (!useracc(new, newlen, VM_PROT_READ))
return (EFAULT);
req.newlen = newlen;
req.newptr = new;
}
req.oldfunc = sysctl_old_user;
req.newfunc = sysctl_new_user;
req.lock = REQ_UNWIRED;
#ifdef KTRACE
if (KTRPOINT(curthread, KTR_SYSCTL))
ktrsysctl(name, namelen);
#endif
if (req.oldlen > PAGE_SIZE) {
memlocked = 1;
sx_xlock(&sysctlmemlock);
} else
memlocked = 0;
CURVNET_SET(TD_TO_VNET(td));
for (;;) {
req.oldidx = 0;
req.newidx = 0;
SYSCTL_XLOCK();
error = sysctl_root(0, name, namelen, &req);
SYSCTL_XUNLOCK();
if (error != EAGAIN)
break;
kern_yield(PRI_USER);
}
CURVNET_RESTORE();
if (req.lock == REQ_WIRED && req.validlen > 0)
vsunlock(req.oldptr, req.validlen);
if (memlocked)
sx_xunlock(&sysctlmemlock);
if (error && error != ENOMEM)
return (error);
if (retval) {
if (req.oldptr && req.oldidx > req.validlen)
*retval = req.validlen;
else
*retval = req.oldidx;
}
return (error);
}
/*
* Drain into a sysctl struct. The user buffer should be wired if a page
* fault would cause an issue.
*/
static int
sbuf_sysctl_drain(void *arg, const char *data, int len)
{
struct sysctl_req *req = arg;
int error;
error = SYSCTL_OUT(req, data, len);
KASSERT(error >= 0, ("Got unexpected negative value %d", error));
return (error == 0 ? len : -error);
}
struct sbuf *
sbuf_new_for_sysctl(struct sbuf *s, char *buf, int length,
struct sysctl_req *req)
{
s = sbuf_new(s, buf, length, SBUF_FIXEDLEN);
sbuf_set_drain(s, sbuf_sysctl_drain, req);
return (s);
}
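/*
 * Illustrative sketch, not part of this revision: a handler that
 * streams formatted text to the request through the sbuf drain added
 * above.  Output is flushed to SYSCTL_OUT() in 128-byte chunks as the
 * fixed-length sbuf fills.  example_report_sysctl is a hypothetical
 * name.
 */
static int
example_report_sysctl(SYSCTL_HANDLER_ARGS)
{
        struct sbuf sb;
        int error, i;

        sbuf_new_for_sysctl(&sb, NULL, 128, req);
        for (i = 0; i < 4; i++)
                sbuf_printf(&sb, "item %d\n", i);
        error = sbuf_finish(&sb);
        sbuf_delete(&sb);
        return (error);
}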
Index: head/sys/kern/kern_thr.c
===================================================================
--- head/sys/kern/kern_thr.c (revision 225616)
+++ head/sys/kern/kern_thr.c (revision 225617)
@@ -1,555 +1,555 @@
/*-
* Copyright (c) 2003, Jeffrey Roberson <jeff@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_posix.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ucontext.h>
#include <sys/thr.h>
#include <sys/rtprio.h>
#include <sys/umtx.h>
#include <sys/limits.h>
#include <machine/frame.h>
#include <security/audit/audit.h>
SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
static int max_threads_per_proc = 1500;
SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
&max_threads_per_proc, 0, "Limit on threads per proc");
static int max_threads_hits;
SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
&max_threads_hits, 0, "");
#ifdef COMPAT_FREEBSD32
static inline int
suword_lwpid(void *addr, lwpid_t lwpid)
{
int error;
if (SV_CURPROC_FLAG(SV_LP64))
error = suword(addr, lwpid);
else
error = suword32(addr, lwpid);
return (error);
}
#else
#define suword_lwpid suword
#endif
static int create_thread(struct thread *td, mcontext_t *ctx,
void (*start_func)(void *), void *arg,
char *stack_base, size_t stack_size,
char *tls_base,
long *child_tid, long *parent_tid,
int flags, struct rtprio *rtp);
/*
* System call interface.
*/
int
-thr_create(struct thread *td, struct thr_create_args *uap)
+sys_thr_create(struct thread *td, struct thr_create_args *uap)
/* ucontext_t *ctx, long *id, int flags */
{
ucontext_t ctx;
int error;
if ((error = copyin(uap->ctx, &ctx, sizeof(ctx))))
return (error);
error = create_thread(td, &ctx.uc_mcontext, NULL, NULL,
NULL, 0, NULL, uap->id, NULL, uap->flags, NULL);
return (error);
}
int
-thr_new(struct thread *td, struct thr_new_args *uap)
+sys_thr_new(struct thread *td, struct thr_new_args *uap)
/* struct thr_param * */
{
struct thr_param param;
int error;
if (uap->param_size < 0 || uap->param_size > sizeof(param))
return (EINVAL);
bzero(&param, sizeof(param));
if ((error = copyin(uap->param, &param, uap->param_size)))
return (error);
return (kern_thr_new(td, &param));
}
int
kern_thr_new(struct thread *td, struct thr_param *param)
{
struct rtprio rtp, *rtpp;
int error;
rtpp = NULL;
if (param->rtp != 0) {
error = copyin(param->rtp, &rtp, sizeof(struct rtprio));
if (error)
return (error);
rtpp = &rtp;
}
error = create_thread(td, NULL, param->start_func, param->arg,
param->stack_base, param->stack_size, param->tls_base,
param->child_tid, param->parent_tid, param->flags,
rtpp);
return (error);
}
static int
create_thread(struct thread *td, mcontext_t *ctx,
void (*start_func)(void *), void *arg,
char *stack_base, size_t stack_size,
char *tls_base,
long *child_tid, long *parent_tid,
int flags, struct rtprio *rtp)
{
stack_t stack;
struct thread *newtd;
struct proc *p;
int error;
p = td->td_proc;
/* There is a race condition here, but it is cheap. */
if (p->p_numthreads >= max_threads_per_proc) {
++max_threads_hits;
return (EPROCLIM);
}
if (rtp != NULL) {
switch (rtp->type) {
case RTP_PRIO_REALTIME:
case RTP_PRIO_FIFO:
/* Only root can set scheduler policy */
if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
return (EPERM);
if (rtp->prio > RTP_PRIO_MAX)
return (EINVAL);
break;
case RTP_PRIO_NORMAL:
rtp->prio = 0;
break;
default:
return (EINVAL);
}
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(p, RACCT_NTHR, 1);
PROC_UNLOCK(td->td_proc);
if (error != 0)
return (EPROCLIM);
#endif
/* Initialize our td */
newtd = thread_alloc(0);
if (newtd == NULL) {
error = ENOMEM;
goto fail;
}
/*
* Try the copyout as soon as we allocate the td so we don't
* have to tear things down in a failure case below.
* We copy the tid out to two places, one for the child and one
* for the parent, because pthread can create a detached thread:
* if the parent wants to access the child tid safely, it must
* provide its own storage, since the child thread may exit
* quickly and have its memory freed before the parent reads it.
*/
if ((child_tid != NULL &&
suword_lwpid(child_tid, newtd->td_tid)) ||
(parent_tid != NULL &&
suword_lwpid(parent_tid, newtd->td_tid))) {
thread_free(newtd);
error = EFAULT;
goto fail;
}
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
bcopy(&td->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
newtd->td_proc = td->td_proc;
newtd->td_ucred = crhold(td->td_ucred);
cpu_set_upcall(newtd, td);
if (ctx != NULL) { /* old way to set user context */
error = set_mcontext(newtd, ctx);
if (error != 0) {
thread_free(newtd);
crfree(td->td_ucred);
goto fail;
}
} else {
/* Set up our machine context. */
stack.ss_sp = stack_base;
stack.ss_size = stack_size;
/* Set upcall address to user thread entry function. */
cpu_set_upcall_kse(newtd, start_func, arg, &stack);
/* Set up the user TLS address and TLS pointer register. */
error = cpu_set_user_tls(newtd, tls_base);
if (error != 0) {
thread_free(newtd);
crfree(td->td_ucred);
goto fail;
}
}
PROC_LOCK(td->td_proc);
td->td_proc->p_flag |= P_HADTHREADS;
newtd->td_sigmask = td->td_sigmask;
thread_link(newtd, p);
bcopy(p->p_comm, newtd->td_name, sizeof(newtd->td_name));
thread_lock(td);
/* let the scheduler know about these things. */
sched_fork_thread(td, newtd);
thread_unlock(td);
if (P_SHOULDSTOP(p))
newtd->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
PROC_UNLOCK(p);
tidhash_add(newtd);
thread_lock(newtd);
if (rtp != NULL) {
if (!(td->td_pri_class == PRI_TIMESHARE &&
rtp->type == RTP_PRIO_NORMAL)) {
rtp_to_pri(rtp, newtd);
sched_prio(newtd, newtd->td_user_pri);
} /* ignore timesharing class */
}
TD_SET_CAN_RUN(newtd);
sched_add(newtd, SRQ_BORING);
thread_unlock(newtd);
return (0);
fail:
#ifdef RACCT
PROC_LOCK(p);
racct_sub(p, RACCT_NTHR, 1);
PROC_UNLOCK(p);
#endif
return (error);
}
int
-thr_self(struct thread *td, struct thr_self_args *uap)
+sys_thr_self(struct thread *td, struct thr_self_args *uap)
/* long *id */
{
int error;
error = suword_lwpid(uap->id, (unsigned)td->td_tid);
if (error == -1)
return (EFAULT);
return (0);
}
int
-thr_exit(struct thread *td, struct thr_exit_args *uap)
+sys_thr_exit(struct thread *td, struct thr_exit_args *uap)
/* long *state */
{
struct proc *p;
p = td->td_proc;
/* Signal userland that it can free the stack. */
if ((void *)uap->state != NULL) {
suword_lwpid(uap->state, 1);
kern_umtx_wake(td, uap->state, INT_MAX, 0);
}
rw_wlock(&tidhash_lock);
PROC_LOCK(p);
racct_sub(p, RACCT_NTHR, 1);
/*
* Shutting down last thread in the proc. This will actually
* call exit() in the trampoline when it returns.
*/
if (p->p_numthreads != 1) {
LIST_REMOVE(td, td_hash);
rw_wunlock(&tidhash_lock);
tdsigcleanup(td);
PROC_SLOCK(p);
thread_stopped(p);
thread_exit();
/* NOTREACHED */
}
PROC_UNLOCK(p);
rw_wunlock(&tidhash_lock);
return (0);
}
int
-thr_kill(struct thread *td, struct thr_kill_args *uap)
+sys_thr_kill(struct thread *td, struct thr_kill_args *uap)
/* long id, int sig */
{
ksiginfo_t ksi;
struct thread *ttd;
struct proc *p;
int error;
p = td->td_proc;
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->sig;
ksi.ksi_code = SI_LWP;
ksi.ksi_pid = p->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
if (uap->id == -1) {
if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
error = EINVAL;
} else {
error = ESRCH;
PROC_LOCK(p);
FOREACH_THREAD_IN_PROC(p, ttd) {
if (ttd != td) {
error = 0;
if (uap->sig == 0)
break;
tdksignal(ttd, uap->sig, &ksi);
}
}
PROC_UNLOCK(p);
}
} else {
error = 0;
ttd = tdfind((lwpid_t)uap->id, p->p_pid);
if (ttd == NULL)
return (ESRCH);
if (uap->sig == 0)
;
else if (!_SIG_VALID(uap->sig))
error = EINVAL;
else
tdksignal(ttd, uap->sig, &ksi);
PROC_UNLOCK(ttd->td_proc);
}
return (error);
}
int
-thr_kill2(struct thread *td, struct thr_kill2_args *uap)
+sys_thr_kill2(struct thread *td, struct thr_kill2_args *uap)
/* pid_t pid, long id, int sig */
{
ksiginfo_t ksi;
struct thread *ttd;
struct proc *p;
int error;
AUDIT_ARG_SIGNUM(uap->sig);
ksiginfo_init(&ksi);
ksi.ksi_signo = uap->sig;
ksi.ksi_code = SI_LWP;
ksi.ksi_pid = td->td_proc->p_pid;
ksi.ksi_uid = td->td_ucred->cr_ruid;
if (uap->id == -1) {
if ((p = pfind(uap->pid)) == NULL)
return (ESRCH);
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->sig);
if (error) {
PROC_UNLOCK(p);
return (error);
}
if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
error = EINVAL;
} else {
error = ESRCH;
FOREACH_THREAD_IN_PROC(p, ttd) {
if (ttd != td) {
error = 0;
if (uap->sig == 0)
break;
tdksignal(ttd, uap->sig, &ksi);
}
}
}
PROC_UNLOCK(p);
} else {
ttd = tdfind((lwpid_t)uap->id, uap->pid);
if (ttd == NULL)
return (ESRCH);
p = ttd->td_proc;
AUDIT_ARG_PROCESS(p);
error = p_cansignal(td, p, uap->sig);
if (uap->sig == 0)
;
else if (!_SIG_VALID(uap->sig))
error = EINVAL;
else
tdksignal(ttd, uap->sig, &ksi);
PROC_UNLOCK(p);
}
return (error);
}
int
-thr_suspend(struct thread *td, struct thr_suspend_args *uap)
+sys_thr_suspend(struct thread *td, struct thr_suspend_args *uap)
/* const struct timespec *timeout */
{
struct timespec ts, *tsp;
int error;
tsp = NULL;
if (uap->timeout != NULL) {
error = copyin((const void *)uap->timeout, (void *)&ts,
sizeof(struct timespec));
if (error != 0)
return (error);
tsp = &ts;
}
return (kern_thr_suspend(td, tsp));
}
int
kern_thr_suspend(struct thread *td, struct timespec *tsp)
{
struct proc *p = td->td_proc;
struct timeval tv;
int error = 0;
int timo = 0;
if (td->td_pflags & TDP_WAKEUP) {
td->td_pflags &= ~TDP_WAKEUP;
return (0);
}
if (tsp != NULL) {
if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
return (EINVAL);
if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
error = EWOULDBLOCK;
else {
TIMESPEC_TO_TIMEVAL(&tv, tsp);
timo = tvtohz(&tv);
}
}
PROC_LOCK(p);
if (error == 0 && (td->td_flags & TDF_THRWAKEUP) == 0)
error = msleep((void *)td, &p->p_mtx,
PCATCH, "lthr", timo);
if (td->td_flags & TDF_THRWAKEUP) {
thread_lock(td);
td->td_flags &= ~TDF_THRWAKEUP;
thread_unlock(td);
PROC_UNLOCK(p);
return (0);
}
PROC_UNLOCK(p);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
else if (error == ERESTART) {
if (timo != 0)
error = EINTR;
}
return (error);
}
int
-thr_wake(struct thread *td, struct thr_wake_args *uap)
+sys_thr_wake(struct thread *td, struct thr_wake_args *uap)
/* long id */
{
struct proc *p;
struct thread *ttd;
if (uap->id == td->td_tid) {
td->td_pflags |= TDP_WAKEUP;
return (0);
}
p = td->td_proc;
ttd = tdfind((lwpid_t)uap->id, p->p_pid);
if (ttd == NULL)
return (ESRCH);
thread_lock(ttd);
ttd->td_flags |= TDF_THRWAKEUP;
thread_unlock(ttd);
wakeup((void *)ttd);
PROC_UNLOCK(p);
return (0);
}
int
-thr_set_name(struct thread *td, struct thr_set_name_args *uap)
+sys_thr_set_name(struct thread *td, struct thr_set_name_args *uap)
{
struct proc *p;
char name[MAXCOMLEN + 1];
struct thread *ttd;
int error;
error = 0;
name[0] = '\0';
if (uap->name != NULL) {
error = copyinstr(uap->name, name, sizeof(name),
NULL);
if (error)
return (error);
}
p = td->td_proc;
ttd = tdfind((lwpid_t)uap->id, p->p_pid);
if (ttd == NULL)
return (ESRCH);
strcpy(ttd->td_name, name);
PROC_UNLOCK(p);
return (error);
}
Index: head/sys/kern/kern_time.c
===================================================================
--- head/sys/kern/kern_time.c (revision 225616)
+++ head/sys/kern/kern_time.c (revision 225617)
@@ -1,1496 +1,1496 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_time.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/clock.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/time.h>
#include <sys/timers.h>
#include <sys/timetc.h>
#include <sys/vnode.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#define MAX_CLOCKS (CLOCK_MONOTONIC+1)
static struct kclock posix_clocks[MAX_CLOCKS];
static uma_zone_t itimer_zone = NULL;
/*
* Time of day and interval timer support.
*
* These routines provide the kernel entry points to get and set
* the time-of-day and per-process interval timers. Subroutines
* here provide support for adding and subtracting timeval structures
* and decrementing interval timers, optionally reloading the interval
* timers when they expire.
*/
static int settime(struct thread *, struct timeval *);
static void timevalfix(struct timeval *);
static void itimer_start(void);
static int itimer_init(void *, int, int);
static void itimer_fini(void *, int);
static void itimer_enter(struct itimer *);
static void itimer_leave(struct itimer *);
static struct itimer *itimer_find(struct proc *, int);
static void itimers_alloc(struct proc *);
static void itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
static void itimers_event_hook_exit(void *arg, struct proc *p);
static int realtimer_create(struct itimer *);
static int realtimer_gettime(struct itimer *, struct itimerspec *);
static int realtimer_settime(struct itimer *, int,
struct itimerspec *, struct itimerspec *);
static int realtimer_delete(struct itimer *);
static void realtimer_clocktime(clockid_t, struct timespec *);
static void realtimer_expire(void *);
static int kern_timer_create(struct thread *, clockid_t,
struct sigevent *, int *, int);
static int kern_timer_delete(struct thread *, int);
int register_posix_clock(int, struct kclock *);
void itimer_fire(struct itimer *it);
int itimespecfix(struct timespec *ts);
#define CLOCK_CALL(clock, call, arglist) \
((*posix_clocks[clock].call) arglist)
SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
static int
settime(struct thread *td, struct timeval *tv)
{
struct timeval delta, tv1, tv2;
static struct timeval maxtime, laststep;
struct timespec ts;
int s;
s = splclock();
microtime(&tv1);
delta = *tv;
timevalsub(&delta, &tv1);
/*
* If the system is secure, we do not allow the time to be
* set to a value earlier than 1 second less than the highest
* time we have yet seen. The worst a miscreant can do in
* this circumstance is "freeze" time. He couldn't go
* back to the past.
*
* We similarly do not allow the clock to be stepped more
* than one second, nor more than once per second. This allows
* a miscreant to make the clock march double-time, but no worse.
*/
if (securelevel_gt(td->td_ucred, 1) != 0) {
if (delta.tv_sec < 0 || delta.tv_usec < 0) {
/*
* Update maxtime to latest time we've seen.
*/
if (tv1.tv_sec > maxtime.tv_sec)
maxtime = tv1;
tv2 = *tv;
timevalsub(&tv2, &maxtime);
if (tv2.tv_sec < -1) {
tv->tv_sec = maxtime.tv_sec - 1;
printf("Time adjustment clamped to -1 second\n");
}
} else {
if (tv1.tv_sec == laststep.tv_sec) {
splx(s);
return (EPERM);
}
if (delta.tv_sec > 1) {
tv->tv_sec = tv1.tv_sec + 1;
printf("Time adjustment clamped to +1 second\n");
}
laststep = *tv;
}
}
ts.tv_sec = tv->tv_sec;
ts.tv_nsec = tv->tv_usec * 1000;
mtx_lock(&Giant);
tc_setclock(&ts);
resettodr();
mtx_unlock(&Giant);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct clock_gettime_args {
clockid_t clock_id;
struct timespec *tp;
};
#endif
/* ARGSUSED */
int
-clock_gettime(struct thread *td, struct clock_gettime_args *uap)
+sys_clock_gettime(struct thread *td, struct clock_gettime_args *uap)
{
struct timespec ats;
int error;
error = kern_clock_gettime(td, uap->clock_id, &ats);
if (error == 0)
error = copyout(&ats, uap->tp, sizeof(ats));
return (error);
}
int
kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
{
struct timeval sys, user;
struct proc *p;
uint64_t runtime, curtime, switchtime;
p = td->td_proc;
switch (clock_id) {
case CLOCK_REALTIME: /* Default to precise. */
case CLOCK_REALTIME_PRECISE:
nanotime(ats);
break;
case CLOCK_REALTIME_FAST:
getnanotime(ats);
break;
case CLOCK_VIRTUAL:
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &user, &sys);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
TIMEVAL_TO_TIMESPEC(&user, ats);
break;
case CLOCK_PROF:
PROC_LOCK(p);
PROC_SLOCK(p);
calcru(p, &user, &sys);
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
timevaladd(&user, &sys);
TIMEVAL_TO_TIMESPEC(&user, ats);
break;
case CLOCK_MONOTONIC: /* Default to precise. */
case CLOCK_MONOTONIC_PRECISE:
case CLOCK_UPTIME:
case CLOCK_UPTIME_PRECISE:
nanouptime(ats);
break;
case CLOCK_UPTIME_FAST:
case CLOCK_MONOTONIC_FAST:
getnanouptime(ats);
break;
case CLOCK_SECOND:
ats->tv_sec = time_second;
ats->tv_nsec = 0;
break;
case CLOCK_THREAD_CPUTIME_ID:
critical_enter();
switchtime = PCPU_GET(switchtime);
curtime = cpu_ticks();
runtime = td->td_runtime;
critical_exit();
runtime = cputick2usec(runtime + curtime - switchtime);
ats->tv_sec = runtime / 1000000;
ats->tv_nsec = runtime % 1000000 * 1000;
break;
default:
return (EINVAL);
}
return (0);
}
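/*
 * Illustrative sketch, not part of this revision: the userland view of
 * the clock ids dispatched above.  CLOCK_MONOTONIC_FAST maps to
 * getnanouptime() and trades precision for a cheaper call than
 * CLOCK_MONOTONIC.  Build as an ordinary userland program.
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
        struct timespec precise, fast;

        if (clock_gettime(CLOCK_MONOTONIC, &precise) != 0 ||
            clock_gettime(CLOCK_MONOTONIC_FAST, &fast) != 0)
                return (1);
        printf("precise %jd.%09ld fast %jd.%09ld\n",
            (intmax_t)precise.tv_sec, precise.tv_nsec,
            (intmax_t)fast.tv_sec, fast.tv_nsec);
        return (0);
}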
#ifndef _SYS_SYSPROTO_H_
struct clock_settime_args {
clockid_t clock_id;
const struct timespec *tp;
};
#endif
/* ARGSUSED */
int
-clock_settime(struct thread *td, struct clock_settime_args *uap)
+sys_clock_settime(struct thread *td, struct clock_settime_args *uap)
{
struct timespec ats;
int error;
if ((error = copyin(uap->tp, &ats, sizeof(ats))) != 0)
return (error);
return (kern_clock_settime(td, uap->clock_id, &ats));
}
int
kern_clock_settime(struct thread *td, clockid_t clock_id, struct timespec *ats)
{
struct timeval atv;
int error;
if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
return (error);
if (clock_id != CLOCK_REALTIME)
return (EINVAL);
if (ats->tv_nsec < 0 || ats->tv_nsec >= 1000000000)
return (EINVAL);
/* XXX Don't convert nsec->usec and back */
TIMESPEC_TO_TIMEVAL(&atv, ats);
error = settime(td, &atv);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct clock_getres_args {
clockid_t clock_id;
struct timespec *tp;
};
#endif
int
-clock_getres(struct thread *td, struct clock_getres_args *uap)
+sys_clock_getres(struct thread *td, struct clock_getres_args *uap)
{
struct timespec ts;
int error;
if (uap->tp == NULL)
return (0);
error = kern_clock_getres(td, uap->clock_id, &ts);
if (error == 0)
error = copyout(&ts, uap->tp, sizeof(ts));
return (error);
}
int
kern_clock_getres(struct thread *td, clockid_t clock_id, struct timespec *ts)
{
ts->tv_sec = 0;
switch (clock_id) {
case CLOCK_REALTIME:
case CLOCK_REALTIME_FAST:
case CLOCK_REALTIME_PRECISE:
case CLOCK_MONOTONIC:
case CLOCK_MONOTONIC_FAST:
case CLOCK_MONOTONIC_PRECISE:
case CLOCK_UPTIME:
case CLOCK_UPTIME_FAST:
case CLOCK_UPTIME_PRECISE:
/*
* Round up the result of the division cheaply by adding 1.
* Rounding up is especially important if rounding down
* would give 0. Perfect rounding is unimportant.
*/
ts->tv_nsec = 1000000000 / tc_getfrequency() + 1;
break;
case CLOCK_VIRTUAL:
case CLOCK_PROF:
/* Accurately round up here because we can do so cheaply. */
ts->tv_nsec = (1000000000 + hz - 1) / hz;
break;
case CLOCK_SECOND:
ts->tv_sec = 1;
ts->tv_nsec = 0;
break;
case CLOCK_THREAD_CPUTIME_ID:
/* sync with cputick2usec */
ts->tv_nsec = 1000000 / cpu_tickrate();
if (ts->tv_nsec == 0)
ts->tv_nsec = 1000;
break;
default:
return (EINVAL);
}
return (0);
}
static int nanowait;
int
kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
return (EINVAL);
if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
return (0);
getnanouptime(&ts);
timespecadd(&ts, rqt);
TIMESPEC_TO_TIMEVAL(&tv, rqt);
for (;;) {
error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp",
tvtohz(&tv));
getnanouptime(&ts2);
if (error != EWOULDBLOCK) {
if (error == ERESTART)
error = EINTR;
if (rmt != NULL) {
timespecsub(&ts, &ts2);
if (ts.tv_sec < 0)
timespecclear(&ts);
*rmt = ts;
}
return (error);
}
if (timespeccmp(&ts2, &ts, >=))
return (0);
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
}
#ifndef _SYS_SYSPROTO_H_
struct nanosleep_args {
struct timespec *rqtp;
struct timespec *rmtp;
};
#endif
/* ARGSUSED */
int
-nanosleep(struct thread *td, struct nanosleep_args *uap)
+sys_nanosleep(struct thread *td, struct nanosleep_args *uap)
{
struct timespec rmt, rqt;
int error;
error = copyin(uap->rqtp, &rqt, sizeof(rqt));
if (error)
return (error);
if (uap->rmtp &&
!useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
return (EFAULT);
error = kern_nanosleep(td, &rqt, &rmt);
if (error && uap->rmtp) {
int error2;
error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
if (error2)
error = error2;
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct gettimeofday_args {
struct timeval *tp;
struct timezone *tzp;
};
#endif
/* ARGSUSED */
int
-gettimeofday(struct thread *td, struct gettimeofday_args *uap)
+sys_gettimeofday(struct thread *td, struct gettimeofday_args *uap)
{
struct timeval atv;
struct timezone rtz;
int error = 0;
if (uap->tp) {
microtime(&atv);
error = copyout(&atv, uap->tp, sizeof (atv));
}
if (error == 0 && uap->tzp != NULL) {
rtz.tz_minuteswest = tz_minuteswest;
rtz.tz_dsttime = tz_dsttime;
error = copyout(&rtz, uap->tzp, sizeof (rtz));
}
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct settimeofday_args {
struct timeval *tv;
struct timezone *tzp;
};
#endif
/* ARGSUSED */
int
-settimeofday(struct thread *td, struct settimeofday_args *uap)
+sys_settimeofday(struct thread *td, struct settimeofday_args *uap)
{
struct timeval atv, *tvp;
struct timezone atz, *tzp;
int error;
if (uap->tv) {
error = copyin(uap->tv, &atv, sizeof(atv));
if (error)
return (error);
tvp = &atv;
} else
tvp = NULL;
if (uap->tzp) {
error = copyin(uap->tzp, &atz, sizeof(atz));
if (error)
return (error);
tzp = &atz;
} else
tzp = NULL;
return (kern_settimeofday(td, tvp, tzp));
}
int
kern_settimeofday(struct thread *td, struct timeval *tv, struct timezone *tzp)
{
int error;
error = priv_check(td, PRIV_SETTIMEOFDAY);
if (error)
return (error);
/* Verify all parameters before changing time. */
if (tv) {
if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return (EINVAL);
error = settime(td, tv);
}
if (tzp && error == 0) {
tz_minuteswest = tzp->tz_minuteswest;
tz_dsttime = tzp->tz_dsttime;
}
return (error);
}
/*
* Get value of an interval timer. The process virtual and profiling virtual
* time timers are kept in the p_stats area, since they can be swapped out.
* These are kept internally in the way they are specified externally: in
* time until they expire.
*
* The real time interval timer is kept in the process table slot for the
* process, and its value (it_value) is kept as an absolute time rather than
* as a delta, so that it is easy to keep periodic real-time signals from
* drifting.
*
* Virtual time timers are processed in the hardclock() routine of
* kern_clock.c. The real time timer is processed by a timeout routine,
* called from the softclock() routine. Since a callout may be delayed in
* real time due to interrupt processing in the system, it is possible for
* the real time timeout routine (realitexpire, given below), to be delayed
* in real time past when it is supposed to occur. It does not suffice,
* therefore, to reload the real timer .it_value from the real time timers
* .it_interval. Rather, we compute the next time in absolute time the timer
* should go off.
*/
#ifndef _SYS_SYSPROTO_H_
struct getitimer_args {
u_int which;
struct itimerval *itv;
};
#endif
int
-getitimer(struct thread *td, struct getitimer_args *uap)
+sys_getitimer(struct thread *td, struct getitimer_args *uap)
{
struct itimerval aitv;
int error;
error = kern_getitimer(td, uap->which, &aitv);
if (error != 0)
return (error);
return (copyout(&aitv, uap->itv, sizeof (struct itimerval)));
}
int
kern_getitimer(struct thread *td, u_int which, struct itimerval *aitv)
{
struct proc *p = td->td_proc;
struct timeval ctv;
if (which > ITIMER_PROF)
return (EINVAL);
if (which == ITIMER_REAL) {
/*
* Convert from absolute to relative time in .it_value
* part of real time timer. If time for real time timer
* has passed return 0, else return difference between
* current time and time for the timer to go off.
*/
PROC_LOCK(p);
*aitv = p->p_realtimer;
PROC_UNLOCK(p);
if (timevalisset(&aitv->it_value)) {
getmicrouptime(&ctv);
if (timevalcmp(&aitv->it_value, &ctv, <))
timevalclear(&aitv->it_value);
else
timevalsub(&aitv->it_value, &ctv);
}
} else {
PROC_SLOCK(p);
*aitv = p->p_stats->p_timer[which];
PROC_SUNLOCK(p);
}
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct setitimer_args {
u_int which;
struct itimerval *itv, *oitv;
};
#endif
int
-setitimer(struct thread *td, struct setitimer_args *uap)
+sys_setitimer(struct thread *td, struct setitimer_args *uap)
{
struct itimerval aitv, oitv;
int error;
if (uap->itv == NULL) {
uap->itv = uap->oitv;
- return (getitimer(td, (struct getitimer_args *)uap));
+ return (sys_getitimer(td, (struct getitimer_args *)uap));
}
if ((error = copyin(uap->itv, &aitv, sizeof(struct itimerval))))
return (error);
error = kern_setitimer(td, uap->which, &aitv, &oitv);
if (error != 0 || uap->oitv == NULL)
return (error);
return (copyout(&oitv, uap->oitv, sizeof(struct itimerval)));
}
int
kern_setitimer(struct thread *td, u_int which, struct itimerval *aitv,
struct itimerval *oitv)
{
struct proc *p = td->td_proc;
struct timeval ctv;
if (aitv == NULL)
return (kern_getitimer(td, which, oitv));
if (which > ITIMER_PROF)
return (EINVAL);
if (itimerfix(&aitv->it_value))
return (EINVAL);
if (!timevalisset(&aitv->it_value))
timevalclear(&aitv->it_interval);
else if (itimerfix(&aitv->it_interval))
return (EINVAL);
if (which == ITIMER_REAL) {
PROC_LOCK(p);
if (timevalisset(&p->p_realtimer.it_value))
callout_stop(&p->p_itcallout);
getmicrouptime(&ctv);
if (timevalisset(&aitv->it_value)) {
callout_reset(&p->p_itcallout, tvtohz(&aitv->it_value),
realitexpire, p);
timevaladd(&aitv->it_value, &ctv);
}
*oitv = p->p_realtimer;
p->p_realtimer = *aitv;
PROC_UNLOCK(p);
if (timevalisset(&oitv->it_value)) {
if (timevalcmp(&oitv->it_value, &ctv, <))
timevalclear(&oitv->it_value);
else
timevalsub(&oitv->it_value, &ctv);
}
} else {
PROC_SLOCK(p);
*oitv = p->p_stats->p_timer[which];
p->p_stats->p_timer[which] = *aitv;
PROC_SUNLOCK(p);
}
return (0);
}
/*
* Real interval timer expired:
* send process whose timer expired an alarm signal.
* If time is not set up to reload, then just return.
* Else compute next time timer should go off which is > current time.
* This is where delay in processing this timeout causes multiple
* SIGALRM calls to be compressed into one.
* tvtohz() always adds 1 to allow for the time until the next clock
* interrupt being strictly less than 1 clock tick, but we don't want
* that here since we want to appear to be in sync with the clock
* interrupt even when we're delayed.
*/
void
realitexpire(void *arg)
{
struct proc *p;
struct timeval ctv, ntv;
p = (struct proc *)arg;
PROC_LOCK(p);
- psignal(p, SIGALRM);
+ kern_psignal(p, SIGALRM);
if (!timevalisset(&p->p_realtimer.it_interval)) {
timevalclear(&p->p_realtimer.it_value);
if (p->p_flag & P_WEXIT)
wakeup(&p->p_itcallout);
PROC_UNLOCK(p);
return;
}
for (;;) {
timevaladd(&p->p_realtimer.it_value,
&p->p_realtimer.it_interval);
getmicrouptime(&ctv);
if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
ntv = p->p_realtimer.it_value;
timevalsub(&ntv, &ctv);
callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1,
realitexpire, p);
PROC_UNLOCK(p);
return;
}
}
/*NOTREACHED*/
}
/*
* Check that a proposed value to load into the .it_value or
* .it_interval part of an interval timer is acceptable, and
* fix it to have at least minimal value (i.e. if it is less
* than the resolution of the clock, round it up.)
*/
int
itimerfix(struct timeval *tv)
{
if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return (EINVAL);
if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
tv->tv_usec = tick;
return (0);
}
/*
* Decrement an interval timer by a specified number
* of microseconds, which must be less than a second,
* i.e. < 1000000. If the timer expires, then reload
* it. In this case, carry over (usec - old value) to
* reduce the value reloaded into the timer so that
* the timer does not drift. This routine assumes
* that it is called in a context where the timers
* on which it is operating cannot change in value.
*/
int
itimerdecr(struct itimerval *itp, int usec)
{
if (itp->it_value.tv_usec < usec) {
if (itp->it_value.tv_sec == 0) {
/* expired, and already in next interval */
usec -= itp->it_value.tv_usec;
goto expire;
}
itp->it_value.tv_usec += 1000000;
itp->it_value.tv_sec--;
}
itp->it_value.tv_usec -= usec;
usec = 0;
if (timevalisset(&itp->it_value))
return (1);
/* expired, exactly at end of interval */
expire:
if (timevalisset(&itp->it_interval)) {
itp->it_value = itp->it_interval;
itp->it_value.tv_usec -= usec;
if (itp->it_value.tv_usec < 0) {
itp->it_value.tv_usec += 1000000;
itp->it_value.tv_sec--;
}
} else
itp->it_value.tv_usec = 0; /* sec is already 0 */
return (0);
}
/*
* Add and subtract routines for timevals.
* N.B.: subtract routine doesn't deal with
* results which are before the beginning;
* it just gets very confused in this case.
* Caveat emptor.
*/
void
timevaladd(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec += t2->tv_sec;
t1->tv_usec += t2->tv_usec;
timevalfix(t1);
}
void
timevalsub(struct timeval *t1, const struct timeval *t2)
{
t1->tv_sec -= t2->tv_sec;
t1->tv_usec -= t2->tv_usec;
timevalfix(t1);
}
static void
timevalfix(struct timeval *t1)
{
if (t1->tv_usec < 0) {
t1->tv_sec--;
t1->tv_usec += 1000000;
}
if (t1->tv_usec >= 1000000) {
t1->tv_sec++;
t1->tv_usec -= 1000000;
}
}
/*
* ratecheck(): simple time-based rate-limit checking.
*/
int
ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
struct timeval tv, delta;
int rv = 0;
getmicrouptime(&tv); /* NB: 10ms precision */
delta = tv;
timevalsub(&delta, lasttime);
/*
* The check for 0,0 is so that the message will be seen at least once,
* even if interval is huge.
*/
if (timevalcmp(&delta, mininterval, >=) ||
(lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
*lasttime = tv;
rv = 1;
}
return (rv);
}
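/*
 * Illustrative sketch, not part of this revision: logging a condition
 * at most once per interval with ratecheck().  The names and the
 * 10-second interval are arbitrary.
 */
static struct timeval example_lasttime;
static const struct timeval example_interval = { 10, 0 };

static void
example_complain(void)
{
        if (ratecheck(&example_lasttime, &example_interval))
                printf("example: condition detected (rate limited)\n");
}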
/*
* ppsratecheck(): packets (or events) per second limitation.
*
* Return 0 if the limit is to be enforced (e.g. the caller
* should drop a packet because of the rate limitation).
*
* maxpps of 0 always causes zero to be returned. maxpps of -1
* always causes 1 to be returned; this effectively defeats rate
* limiting.
*
* Note that we maintain the struct timeval for compatibility
* with other BSD systems.  We reuse the storage and just monitor
* clock ticks for minimal overhead.
*/
int
ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
{
int now;
/*
* Reset the last time and counter if this is the first call
* or more than a second has passed since the last update of
* lasttime.
*/
now = ticks;
if (lasttime->tv_sec == 0 || (u_int)(now - lasttime->tv_sec) >= hz) {
lasttime->tv_sec = now;
*curpps = 1;
return (maxpps != 0);
} else {
(*curpps)++; /* NB: ignore potential overflow */
return (maxpps < 0 || *curpps < maxpps);
}
}
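/*
 * Illustrative sketch, not part of this revision: enforcing an
 * events-per-second budget with ppsratecheck().  The limit of 100 is
 * arbitrary; a return of 0 means the caller should drop the event.
 */
static struct timeval example_pps_last;
static int example_pps_count;

static int
example_accept_event(void)
{
        return (ppsratecheck(&example_pps_last, &example_pps_count, 100));
}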
static void
itimer_start(void)
{
struct kclock rt_clock = {
.timer_create = realtimer_create,
.timer_delete = realtimer_delete,
.timer_settime = realtimer_settime,
.timer_gettime = realtimer_gettime,
.event_hook = NULL
};
itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
register_posix_clock(CLOCK_REALTIME, &rt_clock);
register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
(void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
(void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
}
int
register_posix_clock(int clockid, struct kclock *clk)
{
if ((unsigned)clockid >= MAX_CLOCKS) {
printf("%s: invalid clockid\n", __func__);
return (0);
}
posix_clocks[clockid] = *clk;
return (1);
}
static int
itimer_init(void *mem, int size, int flags)
{
struct itimer *it;
it = (struct itimer *)mem;
mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
return (0);
}
static void
itimer_fini(void *mem, int size)
{
struct itimer *it;
it = (struct itimer *)mem;
mtx_destroy(&it->it_mtx);
}
static void
itimer_enter(struct itimer *it)
{
mtx_assert(&it->it_mtx, MA_OWNED);
it->it_usecount++;
}
static void
itimer_leave(struct itimer *it)
{
mtx_assert(&it->it_mtx, MA_OWNED);
KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0)
wakeup(it);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_create_args {
clockid_t clock_id;
struct sigevent * evp;
int * timerid;
};
#endif
int
-ktimer_create(struct thread *td, struct ktimer_create_args *uap)
+sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap)
{
struct sigevent *evp1, ev;
int id;
int error;
if (uap->evp != NULL) {
error = copyin(uap->evp, &ev, sizeof(ev));
if (error != 0)
return (error);
evp1 = &ev;
} else
evp1 = NULL;
error = kern_timer_create(td, uap->clock_id, evp1, &id, -1);
if (error == 0) {
error = copyout(&id, uap->timerid, sizeof(int));
if (error != 0)
kern_timer_delete(td, id);
}
return (error);
}
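/*
 * Illustrative sketch, not part of this revision: timer_create(2) is
 * the POSIX userland interface that ultimately reaches
 * sys_ktimer_create() above.  Link with -lrt.
 */
#include <signal.h>
#include <string.h>
#include <time.h>

static int
example_make_timer(timer_t *tidp)
{
        struct sigevent sev;

        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_SIGNAL;
        sev.sigev_signo = SIGALRM;
        return (timer_create(CLOCK_REALTIME, &sev, tidp));
}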
static int
kern_timer_create(struct thread *td, clockid_t clock_id,
struct sigevent *evp, int *timerid, int preset_id)
{
struct proc *p = td->td_proc;
struct itimer *it;
int id;
int error;
if (clock_id < 0 || clock_id >= MAX_CLOCKS)
return (EINVAL);
if (posix_clocks[clock_id].timer_create == NULL)
return (EINVAL);
if (evp != NULL) {
if (evp->sigev_notify != SIGEV_NONE &&
evp->sigev_notify != SIGEV_SIGNAL &&
evp->sigev_notify != SIGEV_THREAD_ID)
return (EINVAL);
if ((evp->sigev_notify == SIGEV_SIGNAL ||
evp->sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(evp->sigev_signo))
return (EINVAL);
}
if (p->p_itimers == NULL)
itimers_alloc(p);
it = uma_zalloc(itimer_zone, M_WAITOK);
it->it_flags = 0;
it->it_usecount = 0;
it->it_active = 0;
timespecclear(&it->it_time.it_value);
timespecclear(&it->it_time.it_interval);
it->it_overrun = 0;
it->it_overrun_last = 0;
it->it_clockid = clock_id;
it->it_timerid = -1;
it->it_proc = p;
ksiginfo_init(&it->it_ksi);
it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
error = CLOCK_CALL(clock_id, timer_create, (it));
if (error != 0)
goto out;
PROC_LOCK(p);
if (preset_id != -1) {
KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
id = preset_id;
if (p->p_itimers->its_timers[id] != NULL) {
PROC_UNLOCK(p);
error = 0;
goto out;
}
} else {
/*
* Find a free timer slot, skipping those reserved
* for setitimer().
*/
for (id = 3; id < TIMER_MAX; id++)
if (p->p_itimers->its_timers[id] == NULL)
break;
if (id == TIMER_MAX) {
PROC_UNLOCK(p);
error = EAGAIN;
goto out;
}
}
it->it_timerid = id;
p->p_itimers->its_timers[id] = it;
if (evp != NULL)
it->it_sigev = *evp;
else {
it->it_sigev.sigev_notify = SIGEV_SIGNAL;
switch (clock_id) {
default:
case CLOCK_REALTIME:
it->it_sigev.sigev_signo = SIGALRM;
break;
case CLOCK_VIRTUAL:
it->it_sigev.sigev_signo = SIGVTALRM;
break;
case CLOCK_PROF:
it->it_sigev.sigev_signo = SIGPROF;
break;
}
it->it_sigev.sigev_value.sival_int = id;
}
if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
it->it_ksi.ksi_code = SI_TIMER;
it->it_ksi.ksi_value = it->it_sigev.sigev_value;
it->it_ksi.ksi_timerid = id;
}
PROC_UNLOCK(p);
*timerid = id;
return (0);
out:
ITIMER_LOCK(it);
CLOCK_CALL(it->it_clockid, timer_delete, (it));
ITIMER_UNLOCK(it);
uma_zfree(itimer_zone, it);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_delete_args {
int timerid;
};
#endif
int
-ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
+sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
{
return (kern_timer_delete(td, uap->timerid));
}
static struct itimer *
itimer_find(struct proc *p, int timerid)
{
struct itimer *it;
PROC_LOCK_ASSERT(p, MA_OWNED);
if ((p->p_itimers == NULL) ||
(timerid < 0) || (timerid >= TIMER_MAX) ||
(it = p->p_itimers->its_timers[timerid]) == NULL) {
return (NULL);
}
ITIMER_LOCK(it);
if ((it->it_flags & ITF_DELETING) != 0) {
ITIMER_UNLOCK(it);
it = NULL;
}
return (it);
}
static int
kern_timer_delete(struct thread *td, int timerid)
{
struct proc *p = td->td_proc;
struct itimer *it;
PROC_LOCK(p);
it = itimer_find(p, timerid);
if (it == NULL) {
PROC_UNLOCK(p);
return (EINVAL);
}
PROC_UNLOCK(p);
it->it_flags |= ITF_DELETING;
while (it->it_usecount > 0) {
it->it_flags |= ITF_WANTED;
msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
}
it->it_flags &= ~ITF_WANTED;
CLOCK_CALL(it->it_clockid, timer_delete, (it));
ITIMER_UNLOCK(it);
PROC_LOCK(p);
if (KSI_ONQ(&it->it_ksi))
sigqueue_take(&it->it_ksi);
p->p_itimers->its_timers[timerid] = NULL;
PROC_UNLOCK(p);
uma_zfree(itimer_zone, it);
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_settime_args {
int timerid;
int flags;
const struct itimerspec * value;
struct itimerspec * ovalue;
};
#endif
int
-ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
+sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
{
struct proc *p = td->td_proc;
struct itimer *it;
struct itimerspec val, oval, *ovalp;
int error;
error = copyin(uap->value, &val, sizeof(val));
if (error != 0)
return (error);
if (uap->ovalue != NULL)
ovalp = &oval;
else
ovalp = NULL;
PROC_LOCK(p);
if (uap->timerid < 3 ||
(it = itimer_find(p, uap->timerid)) == NULL) {
PROC_UNLOCK(p);
error = EINVAL;
} else {
PROC_UNLOCK(p);
itimer_enter(it);
error = CLOCK_CALL(it->it_clockid, timer_settime,
(it, uap->flags, &val, ovalp));
itimer_leave(it);
ITIMER_UNLOCK(it);
}
if (error == 0 && uap->ovalue != NULL)
error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ktimer_gettime_args {
int timerid;
struct itimerspec * value;
};
#endif
int
-ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
+sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
{
struct proc *p = td->td_proc;
struct itimer *it;
struct itimerspec val;
int error;
PROC_LOCK(p);
if (uap->timerid < 3 ||
(it = itimer_find(p, uap->timerid)) == NULL) {
PROC_UNLOCK(p);
error = EINVAL;
} else {
PROC_UNLOCK(p);
itimer_enter(it);
error = CLOCK_CALL(it->it_clockid, timer_gettime,
(it, &val));
itimer_leave(it);
ITIMER_UNLOCK(it);
}
if (error == 0)
error = copyout(&val, uap->value, sizeof(val));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct timer_getoverrun_args {
int timerid;
};
#endif
int
-ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
+sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
{
struct proc *p = td->td_proc;
struct itimer *it;
int error;
PROC_LOCK(p);
if (uap->timerid < 3 ||
(it = itimer_find(p, uap->timerid)) == NULL) {
PROC_UNLOCK(p);
error = EINVAL;
} else {
td->td_retval[0] = it->it_overrun_last;
ITIMER_UNLOCK(it);
PROC_UNLOCK(p);
error = 0;
}
return (error);
}
static int
realtimer_create(struct itimer *it)
{
callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
return (0);
}
static int
realtimer_delete(struct itimer *it)
{
mtx_assert(&it->it_mtx, MA_OWNED);
/*
* Clear the timer's value and interval to tell realtimer_expire()
* not to rearm the timer.
*/
timespecclear(&it->it_time.it_value);
timespecclear(&it->it_time.it_interval);
ITIMER_UNLOCK(it);
callout_drain(&it->it_callout);
ITIMER_LOCK(it);
return (0);
}
static int
realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
{
struct timespec cts;
mtx_assert(&it->it_mtx, MA_OWNED);
realtimer_clocktime(it->it_clockid, &cts);
*ovalue = it->it_time;
if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
timespecsub(&ovalue->it_value, &cts);
if (ovalue->it_value.tv_sec < 0 ||
(ovalue->it_value.tv_sec == 0 &&
ovalue->it_value.tv_nsec == 0)) {
ovalue->it_value.tv_sec = 0;
ovalue->it_value.tv_nsec = 1;
}
}
return (0);
}
static int
realtimer_settime(struct itimer *it, int flags,
struct itimerspec *value, struct itimerspec *ovalue)
{
struct timespec cts, ts;
struct timeval tv;
struct itimerspec val;
mtx_assert(&it->it_mtx, MA_OWNED);
val = *value;
if (itimespecfix(&val.it_value))
return (EINVAL);
if (timespecisset(&val.it_value)) {
if (itimespecfix(&val.it_interval))
return (EINVAL);
} else {
timespecclear(&val.it_interval);
}
if (ovalue != NULL)
realtimer_gettime(it, ovalue);
it->it_time = val;
if (timespecisset(&val.it_value)) {
realtimer_clocktime(it->it_clockid, &cts);
ts = val.it_value;
if ((flags & TIMER_ABSTIME) == 0) {
/* Convert to absolute time. */
timespecadd(&it->it_time.it_value, &cts);
} else {
timespecsub(&ts, &cts);
/*
* We don't care if ts is negative; tvtohz will
* fix it.
*/
}
TIMESPEC_TO_TIMEVAL(&tv, &ts);
callout_reset(&it->it_callout, tvtohz(&tv),
realtimer_expire, it);
} else {
callout_stop(&it->it_callout);
}
return (0);
}
static void
realtimer_clocktime(clockid_t id, struct timespec *ts)
{
if (id == CLOCK_REALTIME)
getnanotime(ts);
else /* CLOCK_MONOTONIC */
getnanouptime(ts);
}
int
itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
{
struct itimer *it;
PROC_LOCK_ASSERT(p, MA_OWNED);
it = itimer_find(p, timerid);
if (it != NULL) {
ksi->ksi_overrun = it->it_overrun;
it->it_overrun_last = it->it_overrun;
it->it_overrun = 0;
ITIMER_UNLOCK(it);
return (0);
}
return (EINVAL);
}
int
itimespecfix(struct timespec *ts)
{
if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
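/*
 * Round a nonzero time shorter than one clock tick up to a single
 * tick; the callout system cannot resolve anything finer.
 */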
if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
ts->tv_nsec = tick * 1000;
return (0);
}
/* Timeout callback for realtime timer */
static void
realtimer_expire(void *arg)
{
struct timespec cts, ts;
struct timeval tv;
struct itimer *it;
it = (struct itimer *)arg;
realtimer_clocktime(it->it_clockid, &cts);
/* Only fire if time is reached. */
if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
if (timespecisset(&it->it_time.it_interval)) {
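/*
 * Advance the expiration time to the next interval boundary; any
 * additional intervals that have already elapsed are counted as
 * overruns.
 */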
timespecadd(&it->it_time.it_value,
&it->it_time.it_interval);
while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
if (it->it_overrun < INT_MAX)
it->it_overrun++;
else
it->it_ksi.ksi_errno = ERANGE;
timespecadd(&it->it_time.it_value,
&it->it_time.it_interval);
}
} else {
/* single-shot timer? */
timespecclear(&it->it_time.it_value);
}
if (timespecisset(&it->it_time.it_value)) {
ts = it->it_time.it_value;
timespecsub(&ts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
callout_reset(&it->it_callout, tvtohz(&tv),
realtimer_expire, it);
}
itimer_enter(it);
ITIMER_UNLOCK(it);
itimer_fire(it);
ITIMER_LOCK(it);
itimer_leave(it);
} else if (timespecisset(&it->it_time.it_value)) {
ts = it->it_time.it_value;
timespecsub(&ts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
it);
}
}
void
itimer_fire(struct itimer *it)
{
struct proc *p = it->it_proc;
struct thread *td;
if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
if (sigev_findtd(p, &it->it_sigev, &td) != 0) {
ITIMER_LOCK(it);
timespecclear(&it->it_time.it_value);
timespecclear(&it->it_time.it_interval);
callout_stop(&it->it_callout);
ITIMER_UNLOCK(it);
return;
}
if (!KSI_ONQ(&it->it_ksi)) {
it->it_ksi.ksi_errno = 0;
ksiginfo_set_sigev(&it->it_ksi, &it->it_sigev);
tdsendsignal(p, td, it->it_ksi.ksi_signo, &it->it_ksi);
} else {
if (it->it_overrun < INT_MAX)
it->it_overrun++;
else
it->it_ksi.ksi_errno = ERANGE;
}
PROC_UNLOCK(p);
}
}
static void
itimers_alloc(struct proc *p)
{
struct itimers *its;
int i;
its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
LIST_INIT(&its->its_virtual);
LIST_INIT(&its->its_prof);
TAILQ_INIT(&its->its_worklist);
for (i = 0; i < TIMER_MAX; i++)
its->its_timers[i] = NULL;
PROC_LOCK(p);
if (p->p_itimers == NULL) {
p->p_itimers = its;
PROC_UNLOCK(p);
}
else {
PROC_UNLOCK(p);
free(its, M_SUBPROC);
}
}
static void
itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
{
itimers_event_hook_exit(arg, p);
}
/* Clean up timers when some process events are being triggered. */
static void
itimers_event_hook_exit(void *arg, struct proc *p)
{
struct itimers *its;
struct itimer *it;
int event = (int)(intptr_t)arg;
int i;
if (p->p_itimers != NULL) {
its = p->p_itimers;
for (i = 0; i < MAX_CLOCKS; ++i) {
if (posix_clocks[i].event_hook != NULL)
CLOCK_CALL(i, event_hook, (p, i, event));
}
/*
* According to SUSv3, XSI interval timers should be inherited
* by the new image.
*/
if (event == ITIMER_EV_EXEC)
i = 3;
else if (event == ITIMER_EV_EXIT)
i = 0;
else
panic("unhandled event");
for (; i < TIMER_MAX; ++i) {
if ((it = its->its_timers[i]) != NULL)
kern_timer_delete(curthread, i);
}
if (its->its_timers[0] == NULL &&
its->its_timers[1] == NULL &&
its->its_timers[2] == NULL) {
free(its, M_SUBPROC);
p->p_itimers = NULL;
}
}
}
Index: head/sys/kern/kern_umtx.c
===================================================================
--- head/sys/kern/kern_umtx.c (revision 225616)
+++ head/sys/kern/kern_umtx.c (revision 225617)
@@ -1,3612 +1,3612 @@
/*-
* Copyright (c) 2004, David Xu <davidxu@freebsd.org>
* Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice unmodified, this list of conditions, and the following
* disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <machine/cpu.h>
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif
#define _UMUTEX_TRY 1
#define _UMUTEX_WAIT 2
/* Priority inheritance mutex info. */
struct umtx_pi {
/* Owner thread */
struct thread *pi_owner;
/* Reference count */
int pi_refcount;
/* List entry to link umtx holding by thread */
TAILQ_ENTRY(umtx_pi) pi_link;
/* List entry in hash */
TAILQ_ENTRY(umtx_pi) pi_hashlink;
/* List for waiters */
TAILQ_HEAD(,umtx_q) pi_blocked;
/* Identify a userland lock object */
struct umtx_key pi_key;
};
/* A userland synchronous object user. */
struct umtx_q {
/* Linked list for the hash. */
TAILQ_ENTRY(umtx_q) uq_link;
/* Umtx key. */
struct umtx_key uq_key;
/* Umtx flags. */
int uq_flags;
#define UQF_UMTXQ 0x0001
/* The waiting thread. */
struct thread *uq_thread;
/*
* The PI mutex this thread is blocked on.  Reads may hold either
* the chain lock or umtx_lock; writes must hold both the chain
* lock and umtx_lock.
*/
struct umtx_pi *uq_pi_blocked;
/* On blocked list */
TAILQ_ENTRY(umtx_q) uq_lockq;
/* Contested PI mutexes owned by this thread */
TAILQ_HEAD(,umtx_pi) uq_pi_contested;
/* Inherited priority from PP mutex */
u_char uq_inherited_pri;
/* Spare queue ready to be reused */
struct umtxq_queue *uq_spare_queue;
/* The queue we are on */
struct umtxq_queue *uq_cur_queue;
};
TAILQ_HEAD(umtxq_head, umtx_q);
/* Per-key wait-queue */
struct umtxq_queue {
struct umtxq_head head;
struct umtx_key key;
LIST_ENTRY(umtxq_queue) link;
int length;
};
LIST_HEAD(umtxq_list, umtxq_queue);
/* Userland lock object's wait-queue chain */
struct umtxq_chain {
/* Lock for this chain. */
struct mtx uc_lock;
/* List of sleep queues. */
struct umtxq_list uc_queue[2];
#define UMTX_SHARED_QUEUE 0
#define UMTX_EXCLUSIVE_QUEUE 1
LIST_HEAD(, umtxq_queue) uc_spare_queue;
/* Busy flag */
char uc_busy;
/* Chain lock waiters */
int uc_waiters;
/* All PI in the list */
TAILQ_HEAD(,umtx_pi) uc_pi_list;
};
#define UMTXQ_LOCKED_ASSERT(uc) mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define UMTXQ_BUSY_ASSERT(uc) KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
/*
* Don't propagate time-sharing priority; there is a security reason:
* a user can simply create a PI mutex, let thread A lock it and let
* another thread B block on it.  Because B is sleeping, its priority
* is boosted, which in turn boosts A's priority through propagation,
* and A's priority would never be lowered even if it is using 100%
* CPU, which is unfair to other processes.
*/
#define UPRI(td) (((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
(td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
PRI_MAX_TIMESHARE : (td)->td_user_pri)
#define GOLDEN_RATIO_PRIME 2654404609U
#define UMTX_CHAINS 512
#define UMTX_SHIFTS (__WORD_BIT - 9)
#define GET_SHARE(flags) \
(((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
#define BUSY_SPINS 200
static uma_zone_t umtx_pi_zone;
static struct umtxq_chain umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int umtx_pi_allocated;
SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
&umtx_pi_allocated, 0, "Allocated umtx_pi");
static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
#define umtxq_signal(key, nwake) umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq) umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq) umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
static struct mtx umtx_lock;
static void
umtxq_sysinit(void *arg __unused)
{
int i, j;
umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
for (i = 0; i < 2; ++i) {
for (j = 0; j < UMTX_CHAINS; ++j) {
mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
MTX_DEF | MTX_DUPOK);
LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
umtxq_chains[i][j].uc_busy = 0;
umtxq_chains[i][j].uc_waiters = 0;
}
}
mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
EVENTHANDLER_PRI_ANY);
}
struct umtx_q *
umtxq_alloc(void)
{
struct umtx_q *uq;
uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
TAILQ_INIT(&uq->uq_spare_queue->head);
TAILQ_INIT(&uq->uq_pi_contested);
uq->uq_inherited_pri = PRI_MAX;
return (uq);
}
void
umtxq_free(struct umtx_q *uq)
{
MPASS(uq->uq_spare_queue != NULL);
free(uq->uq_spare_queue, M_UMTX);
free(uq, M_UMTX);
}
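/*
 * Hash a umtx key to a chain index using multiplicative (Fibonacci)
 * hashing: scale the key's address by GOLDEN_RATIO_PRIME and let the
 * high-order bits select one of the UMTX_CHAINS buckets.
 */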
static inline void
umtxq_hash(struct umtx_key *key)
{
unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
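/*
 * Keys of type TYPE_SEM and below are kept on a separate set of
 * chains from the remaining lock types, so the two groups never
 * contend for the same sleep-queue chain even when they hash to the
 * same bucket.
 */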
static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
if (key->type <= TYPE_SEM)
return (&umtxq_chains[1][key->hash]);
return (&umtxq_chains[0][key->hash]);
}
/*
* Lock a chain.
*/
static inline void
umtxq_lock(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_lock(&uc->uc_lock);
}
/*
* Unlock a chain.
*/
static inline void
umtxq_unlock(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_unlock(&uc->uc_lock);
}
/*
* Set the chain to the busy state when the following operation
* may block (a kernel mutex cannot be used).
*/
static inline void
umtxq_busy(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_assert(&uc->uc_lock, MA_OWNED);
if (uc->uc_busy) {
#ifdef SMP
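/*
 * On MP systems, first spin a bounded number of times with the
 * chain lock dropped, in the hope that the current holder unbusies
 * the chain quickly, before falling back to sleeping on it.
 */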
if (smp_cpus > 1) {
int count = BUSY_SPINS;
if (count > 0) {
umtxq_unlock(key);
while (uc->uc_busy && --count > 0)
cpu_spinwait();
umtxq_lock(key);
}
}
#endif
while (uc->uc_busy) {
uc->uc_waiters++;
msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
uc->uc_waiters--;
}
}
uc->uc_busy = 1;
}
/*
* Unbusy a chain.
*/
static inline void
umtxq_unbusy(struct umtx_key *key)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
mtx_assert(&uc->uc_lock, MA_OWNED);
KASSERT(uc->uc_busy != 0, ("not busy"));
uc->uc_busy = 0;
if (uc->uc_waiters)
wakeup_one(uc);
}
static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
struct umtxq_queue *uh;
struct umtxq_chain *uc;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
LIST_FOREACH(uh, &uc->uc_queue[q], link) {
if (umtx_key_match(&uh->key, key))
return (uh);
}
return (NULL);
}
static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
struct umtxq_queue *uh;
struct umtxq_chain *uc;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
uh = umtxq_queue_lookup(&uq->uq_key, q);
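/*
 * If a per-key queue already exists, park our spare queue on the
 * chain's spare list; otherwise our spare becomes the per-key queue.
 */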
if (uh != NULL) {
LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
} else {
uh = uq->uq_spare_queue;
uh->key = uq->uq_key;
LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
}
uq->uq_spare_queue = NULL;
TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
uh->length++;
uq->uq_flags |= UQF_UMTXQ;
uq->uq_cur_queue = uh;
return;
}
static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
if (uq->uq_flags & UQF_UMTXQ) {
uh = uq->uq_cur_queue;
TAILQ_REMOVE(&uh->head, uq, uq_link);
uh->length--;
uq->uq_flags &= ~UQF_UMTXQ;
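/*
 * Recycle a queue structure: the departing thread takes back either
 * the now-empty per-key queue or one parked on the chain's spare
 * list, so every umtx_q always leaves with a spare queue.
 */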
if (TAILQ_EMPTY(&uh->head)) {
KASSERT(uh->length == 0,
("inconsistent umtxq_queue length"));
LIST_REMOVE(uh, link);
} else {
uh = LIST_FIRST(&uc->uc_spare_queue);
KASSERT(uh != NULL, ("uc_spare_queue is empty"));
LIST_REMOVE(uh, link);
}
uq->uq_spare_queue = uh;
uq->uq_cur_queue = NULL;
}
}
/*
* Count the waiters on a key's shared queue.
*/
static int
umtxq_count(struct umtx_key *key)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
if (uh != NULL)
return (uh->length);
return (0);
}
/*
* Count the PI waiters on a key's shared queue and return the
* first waiter through *first.
*/
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
*first = NULL;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
if (uh != NULL) {
*first = TAILQ_FIRST(&uh->head);
return (uh->length);
}
return (0);
}
/*
* Wake up threads waiting on a userland object.
*/
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
struct umtxq_chain *uc;
struct umtxq_queue *uh;
struct umtx_q *uq;
int ret;
ret = 0;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
uh = umtxq_queue_lookup(key, q);
if (uh != NULL) {
while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
umtxq_remove_queue(uq, q);
wakeup(uq);
if (++ret >= n_wake)
return (ret);
}
}
return (ret);
}
/*
* Wake up specified thread.
*/
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
umtxq_remove(uq);
wakeup(uq);
}
/*
* Put the thread into a sleep state; before sleeping, check whether
* the thread was already removed from the umtx queue.
*/
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
struct umtxq_chain *uc;
int error;
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
if (!(uq->uq_flags & UQF_UMTXQ))
return (0);
error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
return (error);
}
/*
* Convert userspace address into unique logical address.
*/
int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
struct thread *td = curthread;
vm_map_t map;
vm_map_entry_t entry;
vm_pindex_t pindex;
vm_prot_t prot;
boolean_t wired;
key->type = type;
if (share == THREAD_SHARE) {
key->shared = 0;
key->info.private.vs = td->td_proc->p_vmspace;
key->info.private.addr = (uintptr_t)addr;
} else {
MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
map = &td->td_proc->p_vmspace->vm_map;
if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
&entry, &key->info.shared.object, &pindex, &prot,
&wired) != KERN_SUCCESS) {
return EFAULT;
}
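/*
 * Process-shared objects, and auto-shared objects backed by a
 * mapping with shared inheritance, are keyed by VM object and
 * offset; anything else falls back to a private key of vmspace
 * and address.
 */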
if ((share == PROCESS_SHARE) ||
(share == AUTO_SHARE &&
VM_INHERIT_SHARE == entry->inheritance)) {
key->shared = 1;
key->info.shared.offset = entry->offset + entry->start -
(vm_offset_t)addr;
vm_object_reference(key->info.shared.object);
} else {
key->shared = 0;
key->info.private.vs = td->td_proc->p_vmspace;
key->info.private.addr = (uintptr_t)addr;
}
vm_map_lookup_done(map, entry);
}
umtxq_hash(key);
return (0);
}
/*
* Release key.
*/
void
umtx_key_release(struct umtx_key *key)
{
if (key->shared)
vm_object_deallocate(key->info.shared.object);
}
/*
* Lock a umtx object.
*/
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
struct umtx_q *uq;
u_long owner;
u_long old;
int error = 0;
uq = td->td_umtxq;
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMTX_UNOWNED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMTX_CONTESTED) {
owner = casuword(&umtx->u_owner,
UMTX_CONTESTED, id | UMTX_CONTESTED);
if (owner == UMTX_CONTESTED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If this failed the lock has changed, restart. */
continue;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
return (error);
if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
AUTO_SHARE, &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (EFAULT);
}
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
if (old == owner)
error = umtxq_sleep(uq, "umtx", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
}
return (0);
}
/*
* Lock a umtx object.
*/
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
if (timeout == NULL) {
error = _do_lock_umtx(td, umtx, id, 0);
/* Mutex locking is restarted if it is interrupted. */
if (error == EINTR)
error = ERESTART;
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
/* Timed-locking is not restarted. */
if (error == ERESTART)
error = EINTR;
}
return (error);
}
/*
* Unlock a umtx object.
*/
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
struct umtx_key key;
u_long owner;
u_long old;
int error;
int count;
/*
* Make sure we own this mtx.
*/
owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMTX_CONTESTED) != id)
return (EPERM);
/* This should be done in userland */
if ((owner & UMTX_CONTESTED) == 0) {
old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword(&umtx->u_owner, owner,
count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
umtxq_lock(&key);
umtxq_signal(&key,1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
#ifdef COMPAT_FREEBSD32
/*
* Lock a umtx object.
*/
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
struct umtx_q *uq;
uint32_t owner;
uint32_t old;
int error = 0;
uq = td->td_umtxq;
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword32(m, UMUTEX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMUTEX_UNOWNED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMUTEX_CONTESTED) {
owner = casuword32(m,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If this failed the lock has changed, restart. */
continue;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
return (error);
if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
AUTO_SHARE, &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (EFAULT);
}
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
if (old == owner)
error = umtxq_sleep(uq, "umtx", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
}
return (0);
}
/*
* Lock a umtx object.
*/
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
if (timeout == NULL) {
error = _do_lock_umtx32(td, m, id, 0);
/* Mutex locking is restarted if it is interrupted. */
if (error == EINTR)
error = ERESTART;
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
/* Timed-locking is not restarted. */
if (error == ERESTART)
error = EINTR;
}
return (error);
}
/*
* Unlock a umtx object.
*/
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
struct umtx_key key;
uint32_t owner;
uint32_t old;
int error;
int count;
/*
* Make sure we own this mtx.
*/
owner = fuword32(m);
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
/* This should be done in userland */
if ((owner & UMUTEX_CONTESTED) == 0) {
old = casuword32(m, owner, UMUTEX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword32(m, owner,
count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
umtxq_lock(&key);
umtxq_signal(&key,1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
#endif
/*
* Fetch and compare a value; sleep on the address if the value has not changed.
*/
static int
do_wait(struct thread *td, void *addr, u_long id,
struct timespec *timeout, int compat32, int is_private)
{
struct umtx_q *uq;
struct timespec ts, ts2, ts3;
struct timeval tv;
u_long tmp;
int error = 0;
uq = td->td_umtxq;
if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
if (compat32 == 0)
tmp = fuword(addr);
else
tmp = (unsigned int)fuword32(addr);
if (tmp != id) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
} else if (timeout == NULL) {
umtxq_lock(&uq->uq_key);
error = umtxq_sleep(uq, "uwait", 0);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
umtxq_lock(&uq->uq_key);
for (;;) {
error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
if (!(uq->uq_flags & UQF_UMTXQ)) {
error = 0;
break;
}
if (error != ETIMEDOUT)
break;
umtxq_unlock(&uq->uq_key);
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
umtxq_lock(&uq->uq_key);
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
umtxq_lock(&uq->uq_key);
}
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
}
umtx_key_release(&uq->uq_key);
if (error == ERESTART)
error = EINTR;
return (error);
}
/*
* Wake up threads sleeping on the specified address.
*/
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
struct umtx_key key;
int ret;
if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
return (ret);
umtxq_lock(&key);
ret = umtxq_signal(&key, n_wake);
umtxq_unlock(&key);
umtx_key_release(&key);
return (0);
}
/*
* Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
*/
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
int mode)
{
struct umtx_q *uq;
uint32_t owner, old, id;
int error = 0;
id = td->td_tid;
uq = td->td_umtxq;
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
if (mode == _UMUTEX_WAIT) {
if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
return (0);
} else {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMUTEX_UNOWNED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMUTEX_CONTESTED) {
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED)
return (0);
/* The address was invalid. */
if (owner == -1)
return (EFAULT);
/* If this failed the lock has changed, restart. */
continue;
}
}
if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
(owner & ~UMUTEX_CONTESTED) == id)
return (EDEADLK);
if (mode == _UMUTEX_TRY)
return (EBUSY);
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
return (error);
if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
GET_SHARE(flags), &uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_remove(uq);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (EFAULT);
}
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
if (old == owner)
error = umtxq_sleep(uq, "umtxn", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
}
return (0);
}
/*
* Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
*/
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
struct umtx_key key;
uint32_t owner, old, id;
int error;
int count;
id = td->td_tid;
/*
* Make sure we own this mtx.
*/
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
if ((owner & UMUTEX_CONTESTED) == 0) {
old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword32(&m->m_owner, owner,
count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
umtxq_lock(&key);
umtxq_signal(&key,1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
/*
* Check whether the mutex is available and wake up a waiter;
* this applies only to simple (non-PI, non-PP) mutexes.
*/
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
struct umtx_key key;
uint32_t owner;
uint32_t flags;
int error;
int count;
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != 0)
return (0);
flags = fuword32(&m->m_flags);
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count(&key);
umtxq_unlock(&key);
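/*
 * If at most one waiter remains, try to reset the lock word from
 * UMUTEX_CONTESTED back to UMUTEX_UNOWNED; then wake a single
 * waiter when any exist and the word still shows no owner.
 */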
if (count <= 1)
owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
umtxq_lock(&key);
if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
umtxq_signal(&key, 1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (0);
}
static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
struct umtx_pi *pi;
pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
TAILQ_INIT(&pi->pi_blocked);
atomic_add_int(&umtx_pi_allocated, 1);
return (pi);
}
static inline void
umtx_pi_free(struct umtx_pi *pi)
{
uma_zfree(umtx_pi_zone, pi);
atomic_add_int(&umtx_pi_allocated, -1);
}
/*
* Adjust the thread's position on the PI mutex's blocked list after
* its priority has been changed.
*/
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
struct umtx_q *uq, *uq1, *uq2;
struct thread *td1;
mtx_assert(&umtx_lock, MA_OWNED);
if (pi == NULL)
return (0);
uq = td->td_umtxq;
/*
* Check if the thread needs to be moved on the blocked chain.
* It needs to be moved if either its priority is lower than
* the previous thread or higher than the next thread.
*/
uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
uq2 = TAILQ_NEXT(uq, uq_lockq);
if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
(uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
/*
* Remove thread from blocked chain and determine where
* it should be moved to.
*/
TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
td1 = uq1->uq_thread;
MPASS(td1->td_proc->p_magic == P_MAGIC);
if (UPRI(td1) > UPRI(td))
break;
}
if (uq1 == NULL)
TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
else
TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
}
return (1);
}
/*
* Propagate priority when a thread is blocked on a POSIX
* PI mutex.
*/
static void
umtx_propagate_priority(struct thread *td)
{
struct umtx_q *uq;
struct umtx_pi *pi;
int pri;
mtx_assert(&umtx_lock, MA_OWNED);
pri = UPRI(td);
uq = td->td_umtxq;
pi = uq->uq_pi_blocked;
if (pi == NULL)
return;
for (;;) {
td = pi->pi_owner;
if (td == NULL || td == curthread)
return;
MPASS(td->td_proc != NULL);
MPASS(td->td_proc->p_magic == P_MAGIC);
thread_lock(td);
if (td->td_lend_user_pri > pri)
sched_lend_user_prio(td, pri);
else {
thread_unlock(td);
break;
}
thread_unlock(td);
/*
* Pick up the lock that td is blocked on.
*/
uq = td->td_umtxq;
pi = uq->uq_pi_blocked;
if (pi == NULL)
break;
/* Resort td on the list if needed. */
umtx_pi_adjust_thread(pi, td);
}
}
/*
* Unpropagate priority for a PI mutex when a thread blocked on
* it is interrupted by a signal or resumed by another thread.
*/
static void
umtx_repropagate_priority(struct umtx_pi *pi)
{
struct umtx_q *uq, *uq_owner;
struct umtx_pi *pi2;
int pri;
mtx_assert(&umtx_lock, MA_OWNED);
while (pi != NULL && pi->pi_owner != NULL) {
pri = PRI_MAX;
uq_owner = pi->pi_owner->td_umtxq;
TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
uq = TAILQ_FIRST(&pi2->pi_blocked);
if (uq != NULL) {
if (pri > UPRI(uq->uq_thread))
pri = UPRI(uq->uq_thread);
}
}
if (pri > uq_owner->uq_inherited_pri)
pri = uq_owner->uq_inherited_pri;
thread_lock(pi->pi_owner);
sched_lend_user_prio(pi->pi_owner, pri);
thread_unlock(pi->pi_owner);
if ((pi = uq_owner->uq_pi_blocked) != NULL)
umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
}
}
/*
* Insert a PI mutex into owned list.
*/
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
struct umtx_q *uq_owner;
uq_owner = owner->td_umtxq;
mtx_assert(&umtx_lock, MA_OWNED);
if (pi->pi_owner != NULL)
panic("pi_ower != NULL");
pi->pi_owner = owner;
TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}
/*
* Claim ownership of a PI mutex.
*/
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
struct umtx_q *uq, *uq_owner;
uq_owner = owner->td_umtxq;
mtx_lock_spin(&umtx_lock);
if (pi->pi_owner == owner) {
mtx_unlock_spin(&umtx_lock);
return (0);
}
if (pi->pi_owner != NULL) {
/*
* userland may have already messed up the mutex, sigh.
*/
mtx_unlock_spin(&umtx_lock);
return (EPERM);
}
umtx_pi_setowner(pi, owner);
uq = TAILQ_FIRST(&pi->pi_blocked);
if (uq != NULL) {
int pri;
pri = UPRI(uq->uq_thread);
thread_lock(owner);
if (pri < UPRI(owner))
sched_lend_user_prio(owner, pri);
thread_unlock(owner);
}
mtx_unlock_spin(&umtx_lock);
return (0);
}
/*
* Adjust a thread's position on the PI mutex it is blocked on;
* this may trigger a new round of priority propagation.
*/
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
struct umtx_q *uq;
struct umtx_pi *pi;
uq = td->td_umtxq;
mtx_lock_spin(&umtx_lock);
/*
* Pick up the lock that td is blocked on.
*/
pi = uq->uq_pi_blocked;
if (pi != NULL) {
umtx_pi_adjust_thread(pi, td);
umtx_repropagate_priority(pi);
}
mtx_unlock_spin(&umtx_lock);
}
/*
* Sleep on a PI mutex.
*/
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
uint32_t owner, const char *wmesg, int timo)
{
struct umtxq_chain *uc;
struct thread *td, *td1;
struct umtx_q *uq1;
int pri;
int error = 0;
td = uq->uq_thread;
KASSERT(td == curthread, ("inconsistent uq_thread"));
uc = umtxq_getchain(&uq->uq_key);
UMTXQ_LOCKED_ASSERT(uc);
UMTXQ_BUSY_ASSERT(uc);
umtxq_insert(uq);
mtx_lock_spin(&umtx_lock);
if (pi->pi_owner == NULL) {
mtx_unlock_spin(&umtx_lock);
/* XXX Only look up thread in current process. */
td1 = tdfind(owner, curproc->p_pid);
mtx_lock_spin(&umtx_lock);
if (td1 != NULL) {
if (pi->pi_owner == NULL)
umtx_pi_setowner(pi, td1);
PROC_UNLOCK(td1->td_proc);
}
}
TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
pri = UPRI(uq1->uq_thread);
if (pri > UPRI(td))
break;
}
if (uq1 != NULL)
TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
else
TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
uq->uq_pi_blocked = pi;
thread_lock(td);
td->td_flags |= TDF_UPIBLOCKED;
thread_unlock(td);
umtx_propagate_priority(td);
mtx_unlock_spin(&umtx_lock);
umtxq_unbusy(&uq->uq_key);
if (uq->uq_flags & UQF_UMTXQ) {
error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
if (error == EWOULDBLOCK)
error = ETIMEDOUT;
if (uq->uq_flags & UQF_UMTXQ) {
umtxq_remove(uq);
}
}
mtx_lock_spin(&umtx_lock);
uq->uq_pi_blocked = NULL;
thread_lock(td);
td->td_flags &= ~TDF_UPIBLOCKED;
thread_unlock(td);
TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
umtx_repropagate_priority(pi);
mtx_unlock_spin(&umtx_lock);
umtxq_unlock(&uq->uq_key);
return (error);
}
/*
* Add reference count for a PI mutex.
*/
static void
umtx_pi_ref(struct umtx_pi *pi)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&pi->pi_key);
UMTXQ_LOCKED_ASSERT(uc);
pi->pi_refcount++;
}
/*
* Decrease the reference count of a PI mutex; when the count
* drops to zero, its memory is freed.
*/
static void
umtx_pi_unref(struct umtx_pi *pi)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&pi->pi_key);
UMTXQ_LOCKED_ASSERT(uc);
KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
if (--pi->pi_refcount == 0) {
mtx_lock_spin(&umtx_lock);
if (pi->pi_owner != NULL) {
TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
pi, pi_link);
pi->pi_owner = NULL;
}
KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
("blocked queue not empty"));
mtx_unlock_spin(&umtx_lock);
TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
umtx_pi_free(pi);
}
}
/*
* Find a PI mutex in hash table.
*/
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
struct umtxq_chain *uc;
struct umtx_pi *pi;
uc = umtxq_getchain(key);
UMTXQ_LOCKED_ASSERT(uc);
TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
if (umtx_key_match(&pi->pi_key, key)) {
return (pi);
}
}
return (NULL);
}
/*
* Insert a PI mutex into hash table.
*/
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
struct umtxq_chain *uc;
uc = umtxq_getchain(&pi->pi_key);
UMTXQ_LOCKED_ASSERT(uc);
TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}
/*
* Lock a PI mutex.
*/
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
int try)
{
struct umtx_q *uq;
struct umtx_pi *pi, *new_pi;
uint32_t id, owner, old;
int error;
id = td->td_tid;
uq = td->td_umtxq;
if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
&uq->uq_key)) != 0)
return (error);
umtxq_lock(&uq->uq_key);
pi = umtx_pi_lookup(&uq->uq_key);
if (pi == NULL) {
new_pi = umtx_pi_alloc(M_NOWAIT);
if (new_pi == NULL) {
umtxq_unlock(&uq->uq_key);
new_pi = umtx_pi_alloc(M_WAITOK);
umtxq_lock(&uq->uq_key);
pi = umtx_pi_lookup(&uq->uq_key);
if (pi != NULL) {
umtx_pi_free(new_pi);
new_pi = NULL;
}
}
if (new_pi != NULL) {
new_pi->pi_key = uq->uq_key;
umtx_pi_insert(new_pi);
pi = new_pi;
}
}
umtx_pi_ref(pi);
umtxq_unlock(&uq->uq_key);
/*
* Care must be exercised when dealing with the umtx structure. It
* can fault on any access.
*/
for (;;) {
/*
* Try the uncontested case. This should be done in userland.
*/
owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
/* The acquire succeeded. */
if (owner == UMUTEX_UNOWNED) {
error = 0;
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
/* If no one owns it but it is contested try to acquire it. */
if (owner == UMUTEX_CONTESTED) {
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
error = umtx_pi_claim(pi, td);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
/* If this failed the lock has changed, restart. */
continue;
}
if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
(owner & ~UMUTEX_CONTESTED) == id) {
error = EDEADLK;
break;
}
if (try != 0) {
error = EBUSY;
break;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
break;
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* Set the contested bit so that a release in user space
* knows to use the system call for unlock. If this fails
* either someone else has acquired the lock or it has been
* released.
*/
old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
/* The address was invalid. */
if (old == -1) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
error = EFAULT;
break;
}
umtxq_lock(&uq->uq_key);
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
if (old == owner)
error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
"umtxpi", timo);
else {
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
}
umtxq_lock(&uq->uq_key);
umtx_pi_unref(pi);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Unlock a PI mutex.
*/
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
struct umtx_key key;
struct umtx_q *uq_first, *uq_first2, *uq_me;
struct umtx_pi *pi, *pi2;
uint32_t owner, old, id;
int error;
int count;
int pri;
id = td->td_tid;
/*
* Make sure we own this mtx.
*/
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
/* This should be done in userland */
if ((owner & UMUTEX_CONTESTED) == 0) {
old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
if (old == -1)
return (EFAULT);
if (old == owner)
return (0);
owner = old;
}
/* We should only ever be in here for contested locks */
if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
count = umtxq_count_pi(&key, &uq_first);
if (uq_first != NULL) {
mtx_lock_spin(&umtx_lock);
pi = uq_first->uq_pi_blocked;
KASSERT(pi != NULL, ("pi == NULL?"));
if (pi->pi_owner != curthread) {
mtx_unlock_spin(&umtx_lock);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
/* userland messed up the mutex */
return (EPERM);
}
uq_me = curthread->td_umtxq;
pi->pi_owner = NULL;
TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
/* get highest priority thread which is still sleeping. */
uq_first = TAILQ_FIRST(&pi->pi_blocked);
while (uq_first != NULL &&
(uq_first->uq_flags & UQF_UMTXQ) == 0) {
uq_first = TAILQ_NEXT(uq_first, uq_lockq);
}
pri = PRI_MAX;
TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
if (uq_first2 != NULL) {
if (pri > UPRI(uq_first2->uq_thread))
pri = UPRI(uq_first2->uq_thread);
}
}
thread_lock(curthread);
sched_lend_user_prio(curthread, pri);
thread_unlock(curthread);
mtx_unlock_spin(&umtx_lock);
if (uq_first)
umtxq_signal_thread(uq_first);
}
umtxq_unlock(&key);
/*
* When unlocking the umtx, it must be marked as unowned if
* there is no more than one thread waiting for it.
* Otherwise, it must be marked as contested.
*/
old = casuword32(&m->m_owner, owner,
count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
umtxq_lock(&key);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
if (old == -1)
return (EFAULT);
if (old != owner)
return (EINVAL);
return (0);
}
/*
* Lock a PP mutex.
*/
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
int try)
{
struct umtx_q *uq, *uq2;
struct umtx_pi *pi;
uint32_t ceiling;
uint32_t owner, id;
int error, pri, old_inherited_pri, su;
id = td->td_tid;
uq = td->td_umtxq;
if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
&uq->uq_key)) != 0)
return (error);
su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
for (;;) {
old_inherited_pri = uq->uq_inherited_pri;
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
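/*
 * Translate the userland priority ceiling into the kernel's
 * realtime priority range and validate it before comparing it
 * with the thread's priority.
 */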
ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
if (ceiling > RTP_PRIO_MAX) {
error = EINVAL;
goto out;
}
mtx_lock_spin(&umtx_lock);
if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
mtx_unlock_spin(&umtx_lock);
error = EINVAL;
goto out;
}
if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
thread_lock(td);
if (uq->uq_inherited_pri < UPRI(td))
sched_lend_user_prio(td, uq->uq_inherited_pri);
thread_unlock(td);
}
mtx_unlock_spin(&umtx_lock);
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED) {
error = 0;
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
(owner & ~UMUTEX_CONTESTED) == id) {
error = EDEADLK;
break;
}
if (try != 0) {
error = EBUSY;
break;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
break;
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "umtxpp", timo);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
mtx_lock_spin(&umtx_lock);
uq->uq_inherited_pri = old_inherited_pri;
pri = PRI_MAX;
TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
uq2 = TAILQ_FIRST(&pi->pi_blocked);
if (uq2 != NULL) {
if (pri > UPRI(uq2->uq_thread))
pri = UPRI(uq2->uq_thread);
}
}
if (pri > uq->uq_inherited_pri)
pri = uq->uq_inherited_pri;
thread_lock(td);
sched_lend_user_prio(td, pri);
thread_unlock(td);
mtx_unlock_spin(&umtx_lock);
}
if (error != 0) {
mtx_lock_spin(&umtx_lock);
uq->uq_inherited_pri = old_inherited_pri;
pri = PRI_MAX;
TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
uq2 = TAILQ_FIRST(&pi->pi_blocked);
if (uq2 != NULL) {
if (pri > UPRI(uq2->uq_thread))
pri = UPRI(uq2->uq_thread);
}
}
if (pri > uq->uq_inherited_pri)
pri = uq->uq_inherited_pri;
thread_lock(td);
sched_lend_user_prio(td, pri);
thread_unlock(td);
mtx_unlock_spin(&umtx_lock);
}
out:
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Unlock a PP mutex.
*/
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
struct umtx_key key;
struct umtx_q *uq, *uq2;
struct umtx_pi *pi;
uint32_t owner, id;
uint32_t rceiling;
int error, pri, new_inherited_pri, su;
id = td->td_tid;
uq = td->td_umtxq;
su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
/*
* Make sure we own this mtx.
*/
owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
if (owner == -1)
return (EFAULT);
if ((owner & ~UMUTEX_CONTESTED) != id)
return (EPERM);
error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
if (error != 0)
return (error);
if (rceiling == -1)
new_inherited_pri = PRI_MAX;
else {
rceiling = RTP_PRIO_MAX - rceiling;
if (rceiling > RTP_PRIO_MAX)
return (EINVAL);
new_inherited_pri = PRI_MIN_REALTIME + rceiling;
}
if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
&key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
umtxq_unlock(&key);
/*
* For a priority-protected mutex, always set the unlocked state
* to UMUTEX_CONTESTED so that userland always enters the kernel
* to lock the mutex.  This is necessary because the thread
* priority has to be adjusted for such mutexes.
*/
error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
UMUTEX_CONTESTED);
umtxq_lock(&key);
if (error == 0)
umtxq_signal(&key, 1);
umtxq_unbusy(&key);
umtxq_unlock(&key);
if (error == -1)
error = EFAULT;
else {
mtx_lock_spin(&umtx_lock);
if (su != 0)
uq->uq_inherited_pri = new_inherited_pri;
pri = PRI_MAX;
TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
uq2 = TAILQ_FIRST(&pi->pi_blocked);
if (uq2 != NULL) {
if (pri > UPRI(uq2->uq_thread))
pri = UPRI(uq2->uq_thread);
}
}
if (pri > uq->uq_inherited_pri)
pri = uq->uq_inherited_pri;
thread_lock(td);
sched_lend_user_prio(td, pri);
thread_unlock(td);
mtx_unlock_spin(&umtx_lock);
}
umtx_key_release(&key);
return (error);
}
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
uint32_t *old_ceiling)
{
struct umtx_q *uq;
uint32_t save_ceiling;
uint32_t owner, id;
uint32_t flags;
int error;
flags = fuword32(&m->m_flags);
if ((flags & UMUTEX_PRIO_PROTECT) == 0)
return (EINVAL);
if (ceiling > RTP_PRIO_MAX)
return (EINVAL);
id = td->td_tid;
uq = td->td_umtxq;
if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
&uq->uq_key)) != 0)
return (error);
for (;;) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
save_ceiling = fuword32(&m->m_ceilings[0]);
owner = casuword32(&m->m_owner,
UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
if (owner == UMUTEX_CONTESTED) {
suword32(&m->m_ceilings[0], ceiling);
suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
UMUTEX_CONTESTED);
error = 0;
break;
}
/* The address was invalid. */
if (owner == -1) {
error = EFAULT;
break;
}
if ((owner & ~UMUTEX_CONTESTED) == id) {
suword32(&m->m_ceilings[0], ceiling);
error = 0;
break;
}
/*
* If we caught a signal, we have retried and now
* exit immediately.
*/
if (error != 0)
break;
/*
* We set the contested bit, sleep. Otherwise the lock changed
* and we need to retry or we lost a race to the thread
* unlocking the umtx.
*/
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "umtxpp", 0);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
}
umtxq_lock(&uq->uq_key);
if (error == 0)
umtxq_signal(&uq->uq_key, INT_MAX);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
if (error == 0 && old_ceiling != NULL)
suword32(old_ceiling, save_ceiling);
return (error);
}
static int
_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
int mode)
{
switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
case 0:
return (_do_lock_normal(td, m, flags, timo, mode));
case UMUTEX_PRIO_INHERIT:
return (_do_lock_pi(td, m, flags, timo, mode));
case UMUTEX_PRIO_PROTECT:
return (_do_lock_pp(td, m, flags, timo, mode));
}
return (EINVAL);
}
/*
* Lock a userland POSIX mutex.
*/
static int
do_lock_umutex(struct thread *td, struct umutex *m,
struct timespec *timeout, int mode)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
uint32_t flags;
int error;
flags = fuword32(&m->m_flags);
if (flags == -1)
return (EFAULT);
if (timeout == NULL) {
error = _do_lock_umutex(td, m, flags, 0, mode);
/* Mutex locking is restarted if it is interrupted. */
if (error == EINTR && mode != _UMUTEX_WAIT)
error = ERESTART;
} else {
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
/* Timed-locking is not restarted. */
if (error == ERESTART)
error = EINTR;
}
return (error);
}
/*
* Unlock a userland POSIX mutex.
*/
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
uint32_t flags;
flags = fuword32(&m->m_flags);
if (flags == -1)
return (EFAULT);
switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
case 0:
return (do_unlock_normal(td, m, flags));
case UMUTEX_PRIO_INHERIT:
return (do_unlock_pi(td, m, flags));
case UMUTEX_PRIO_PROTECT:
return (do_unlock_pp(td, m, flags));
}
return (EINVAL);
}
static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
struct timespec *timeout, u_long wflags)
{
struct umtx_q *uq;
struct timeval tv;
struct timespec cts, ets, tts;
uint32_t flags;
uint32_t clockid;
int error;
uq = td->td_umtxq;
flags = fuword32(&cv->c_flags);
error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
if ((wflags & CVWAIT_CLOCKID) != 0) {
clockid = fuword32(&cv->c_clockid);
if (clockid < CLOCK_REALTIME ||
clockid >= CLOCK_THREAD_CPUTIME_ID) {
/* hmm, only HW clock id will work. */
return (EINVAL);
}
} else {
clockid = CLOCK_REALTIME;
}
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
/*
* Set c_has_waiters to 1 before releasing the user mutex, but
* avoid dirtying the cache line when it is already set.
*/
if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
error = do_unlock_umutex(td, m);
umtxq_lock(&uq->uq_key);
if (error == 0) {
if (timeout == NULL) {
error = umtxq_sleep(uq, "ucond", 0);
} else {
if ((wflags & CVWAIT_ABSTIME) == 0) {
kern_clock_gettime(td, clockid, &ets);
timespecadd(&ets, timeout);
tts = *timeout;
} else { /* absolute time */
ets = *timeout;
tts = *timeout;
kern_clock_gettime(td, clockid, &cts);
timespecsub(&tts, &cts);
}
TIMESPEC_TO_TIMEVAL(&tv, &tts);
for (;;) {
error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
if (error != ETIMEDOUT)
break;
kern_clock_gettime(td, clockid, &cts);
if (timespeccmp(&cts, &ets, >=)) {
error = ETIMEDOUT;
break;
}
tts = ets;
timespecsub(&tts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &tts);
}
}
}
if ((uq->uq_flags & UQF_UMTXQ) == 0)
error = 0;
else {
/*
* We got here because of a timeout, an interrupting signal, or
* a spurious wakeup; clear the c_has_waiters flag when
* necessary.
*/
umtxq_busy(&uq->uq_key);
if ((uq->uq_flags & UQF_UMTXQ) != 0) {
int oldlen = uq->uq_cur_queue->length;
umtxq_remove(uq);
if (oldlen == 1) {
umtxq_unlock(&uq->uq_key);
suword32(
__DEVOLATILE(uint32_t *,
&cv->c_has_waiters), 0);
umtxq_lock(&uq->uq_key);
}
}
umtxq_unbusy(&uq->uq_key);
if (error == ERESTART)
error = EINTR;
}
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Signal a userland condition variable.
*/
static int
do_cv_signal(struct thread *td, struct ucond *cv)
{
struct umtx_key key;
int error, cnt, nwake;
uint32_t flags;
flags = fuword32(&cv->c_flags);
if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
cnt = umtxq_count(&key);
nwake = umtxq_signal(&key, 1);
if (cnt <= nwake) {
umtxq_unlock(&key);
error = suword32(
__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
umtxq_lock(&key);
}
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (error);
}
static int
do_cv_broadcast(struct thread *td, struct ucond *cv)
{
struct umtx_key key;
int error;
uint32_t flags;
flags = fuword32(&cv->c_flags);
if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
umtxq_signal(&key, INT_MAX);
umtxq_unlock(&key);
error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
umtxq_lock(&key);
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (error);
}
static int
do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
{
struct umtx_q *uq;
uint32_t flags, wrflags;
int32_t state, oldstate;
int32_t blocked_readers;
int error;
uq = td->td_umtxq;
flags = fuword32(&rwlock->rw_flags);
error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
wrflags = URWLOCK_WRITE_OWNER;
if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
wrflags |= URWLOCK_WRITE_WAITERS;
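/*
 * Unless reader preference is requested by the caller or set in the
 * lock's flags, the URWLOCK_WRITE_WAITERS bit also blocks new
 * readers, so a steady stream of readers cannot starve a pending
 * writer.
 */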
for (;;) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
/* try to lock it */
while (!(state & wrflags)) {
if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
umtx_key_release(&uq->uq_key);
return (EAGAIN);
}
oldstate = casuword32(&rwlock->rw_state, state, state + 1);
if (oldstate == state) {
umtx_key_release(&uq->uq_key);
return (0);
}
state = oldstate;
}
if (error)
break;
/* grab monitor lock */
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* re-read the state, in case it changed between the try-lock above
* and the check below
*/
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
/* set read contention bit */
while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
if (oldstate == state)
goto sleep;
state = oldstate;
}
/* the state changed while we were setting the flag; restart */
if (!(state & wrflags)) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
continue;
}
sleep:
/* the contention bit is set; increase the read waiter count before sleeping */
blocked_readers = fuword32(&rwlock->rw_blocked_readers);
suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
while (state & wrflags) {
umtxq_lock(&uq->uq_key);
umtxq_insert(uq);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "urdlck", timo);
umtxq_busy(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
if (error)
break;
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
}
/* decrease the read waiter count; the last waiter also clears the read contention bit */
blocked_readers = fuword32(&rwlock->rw_blocked_readers);
suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
if (blocked_readers == 1) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state & ~URWLOCK_READ_WAITERS);
if (oldstate == state)
break;
state = oldstate;
}
}
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
umtx_key_release(&uq->uq_key);
return (error);
}
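/*
* Timed variant of do_rw_rdlock(): converts the relative timeout into
* an absolute uptime deadline and retries until the lock is acquired,
* the deadline passes, or the sleep is interrupted.
*/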
static int
do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
if (error == ERESTART)
error = EINTR;
return (error);
}
static int
do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
{
struct umtx_q *uq;
uint32_t flags;
int32_t state, oldstate;
int32_t blocked_writers;
int32_t blocked_readers;
int error;
uq = td->td_umtxq;
flags = fuword32(&rwlock->rw_flags);
error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
blocked_readers = 0;
for (;;) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
if (oldstate == state) {
umtx_key_release(&uq->uq_key);
return (0);
}
state = oldstate;
}
if (error) {
if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
blocked_readers != 0) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
break;
}
/* grab monitor lock */
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
/*
* re-read the state, in case it changed between the try-lock above
* and the check below
*/
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
(state & URWLOCK_WRITE_WAITERS) == 0) {
oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
if (oldstate == state)
goto sleep;
state = oldstate;
}
if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
continue;
}
sleep:
blocked_writers = fuword32(&rwlock->rw_blocked_writers);
suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
umtxq_lock(&uq->uq_key);
umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
umtxq_unbusy(&uq->uq_key);
error = umtxq_sleep(uq, "uwrlck", timo);
umtxq_busy(&uq->uq_key);
umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
umtxq_unlock(&uq->uq_key);
if (error)
break;
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
}
blocked_writers = fuword32(&rwlock->rw_blocked_writers);
suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
if (blocked_writers == 1) {
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state & ~URWLOCK_WRITE_WAITERS);
if (oldstate == state)
break;
state = oldstate;
}
blocked_readers = fuword32(&rwlock->rw_blocked_readers);
} else
blocked_readers = 0;
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
umtx_key_release(&uq->uq_key);
return (error);
}
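/*
* Timed variant of do_rw_wrlock(), using the same deadline/retry
* scheme as do_rw_rdlock2().
*/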
static int
do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
{
struct timespec ts, ts2, ts3;
struct timeval tv;
int error;
getnanouptime(&ts);
timespecadd(&ts, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = do_rw_wrlock(td, obj, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&ts2);
if (timespeccmp(&ts2, &ts, >=)) {
error = ETIMEDOUT;
break;
}
ts3 = ts;
timespecsub(&ts3, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts3);
}
if (error == ERESTART)
error = EINTR;
return (error);
}
static int
do_rw_unlock(struct thread *td, struct urwlock *rwlock)
{
struct umtx_q *uq;
uint32_t flags;
int32_t state, oldstate;
int error, q, count;
uq = td->td_umtxq;
flags = fuword32(&rwlock->rw_flags);
error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
if (state & URWLOCK_WRITE_OWNER) {
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state & ~URWLOCK_WRITE_OWNER);
if (oldstate != state) {
state = oldstate;
if (!(oldstate & URWLOCK_WRITE_OWNER)) {
error = EPERM;
goto out;
}
} else
break;
}
} else if (URWLOCK_READER_COUNT(state) != 0) {
for (;;) {
oldstate = casuword32(&rwlock->rw_state, state,
state - 1);
if (oldstate != state) {
state = oldstate;
if (URWLOCK_READER_COUNT(oldstate) == 0) {
error = EPERM;
goto out;
}
}
else
break;
}
} else {
error = EPERM;
goto out;
}
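/*
* Pick which waiters to wake: unless URWLOCK_PREFER_READER is set,
* a single blocked writer is preferred over waking all readers.
*/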
count = 0;
if (!(flags & URWLOCK_PREFER_READER)) {
if (state & URWLOCK_WRITE_WAITERS) {
count = 1;
q = UMTX_EXCLUSIVE_QUEUE;
} else if (state & URWLOCK_READ_WAITERS) {
count = INT_MAX;
q = UMTX_SHARED_QUEUE;
}
} else {
if (state & URWLOCK_READ_WAITERS) {
count = INT_MAX;
q = UMTX_SHARED_QUEUE;
} else if (state & URWLOCK_WRITE_WAITERS) {
count = 1;
q = UMTX_EXCLUSIVE_QUEUE;
}
}
if (count) {
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_signal_queue(&uq->uq_key, count, q);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
}
out:
umtx_key_release(&uq->uq_key);
return (error);
}
static int
do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
{
struct umtx_q *uq;
struct timeval tv;
struct timespec cts, ets, tts;
uint32_t flags, count;
int error;
uq = td->td_umtxq;
flags = fuword32(&sem->_flags);
error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
if (error != 0)
return (error);
umtxq_lock(&uq->uq_key);
umtxq_busy(&uq->uq_key);
umtxq_insert(uq);
umtxq_unlock(&uq->uq_key);
if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
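/*
* Re-read the semaphore count after setting _has_waiters; if it is
* already non-zero we do not need to sleep.
*/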
count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
if (count != 0) {
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_remove(uq);
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (0);
}
umtxq_lock(&uq->uq_key);
umtxq_unbusy(&uq->uq_key);
umtxq_unlock(&uq->uq_key);
umtxq_lock(&uq->uq_key);
if (timeout == NULL) {
error = umtxq_sleep(uq, "usem", 0);
} else {
getnanouptime(&ets);
timespecadd(&ets, timeout);
TIMESPEC_TO_TIMEVAL(&tv, timeout);
for (;;) {
error = umtxq_sleep(uq, "usem", tvtohz(&tv));
if (error != ETIMEDOUT)
break;
getnanouptime(&cts);
if (timespeccmp(&cts, &ets, >=)) {
error = ETIMEDOUT;
break;
}
tts = ets;
timespecsub(&tts, &cts);
TIMESPEC_TO_TIMEVAL(&tv, &tts);
}
}
if ((uq->uq_flags & UQF_UMTXQ) == 0)
error = 0;
else {
umtxq_remove(uq);
if (error == ERESTART)
error = EINTR;
}
umtxq_unlock(&uq->uq_key);
umtx_key_release(&uq->uq_key);
return (error);
}
/*
* Wake up a waiter on a userland semaphore.
*/
static int
do_sem_wake(struct thread *td, struct _usem *sem)
{
struct umtx_key key;
int error, cnt, nwake;
uint32_t flags;
flags = fuword32(&sem->_flags);
if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
return (error);
umtxq_lock(&key);
umtxq_busy(&key);
cnt = umtxq_count(&key);
nwake = umtxq_signal(&key, 1);
if (cnt <= nwake) {
umtxq_unlock(&key);
error = suword32(
__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
umtxq_lock(&key);
}
umtxq_unbusy(&key);
umtxq_unlock(&key);
umtx_key_release(&key);
return (error);
}
int
-_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
+sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
/* struct umtx *umtx */
{
return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
}
int
-_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
+sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
/* struct umtx *umtx */
{
return do_unlock_umtx(td, uap->umtx, td->td_tid);
}
static int
__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_lock_umtx(td, uap->obj, uap->val, ts));
}
static int
__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
{
return (do_unlock_umtx(td, uap->obj, uap->val));
}
static int
__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 0, 0);
}
static int
__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 0);
}
static int
__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 1);
}
static int
__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
{
return (kern_umtx_wake(td, uap->obj, uap->val, 0));
}
#define BATCH_SIZE 128
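/*
* Wake all waiters on each of the uap->val private words, copying the
* userland pointer array in BATCH_SIZE chunks.
*/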
static int
__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
{
int count = uap->val;
void *uaddrs[BATCH_SIZE];
char **upp = (char **)uap->obj;
int tocopy;
int error = 0;
int i, pos = 0;
while (count > 0) {
tocopy = count;
if (tocopy > BATCH_SIZE)
tocopy = BATCH_SIZE;
error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
if (error != 0)
break;
for (i = 0; i < tocopy; ++i)
kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
count -= tocopy;
pos += tocopy;
}
return (error);
}
static int
__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
{
return (kern_umtx_wake(td, uap->obj, uap->val, 1));
}
static int
__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, 0);
}
static int
__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
}
static int
__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
}
static int
__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
{
return do_wake_umutex(td, uap->obj);
}
static int
__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
{
return do_unlock_umutex(td, uap->obj);
}
static int
__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
{
return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
}
static int
__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
}
static int
__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
{
return do_cv_signal(td, uap->obj);
}
static int
__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
{
return do_cv_broadcast(td, uap->obj);
}
static int
__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_rdlock(td, uap->obj, uap->val, 0);
} else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
}
return (error);
}
static int
__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_wrlock(td, uap->obj, 0);
} else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_wrlock2(td, uap->obj, &timeout);
}
return (error);
}
static int
__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
{
return do_rw_unlock(td, uap->obj);
}
static int
__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin(uap->uaddr2, &timeout,
sizeof(timeout));
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_sem_wait(td, uap->obj, ts));
}
static int
__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
{
return do_sem_wake(td, uap->obj);
}
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
static _umtx_op_func op_table[] = {
__umtx_op_lock_umtx, /* UMTX_OP_LOCK */
__umtx_op_unlock_umtx, /* UMTX_OP_UNLOCK */
__umtx_op_wait, /* UMTX_OP_WAIT */
__umtx_op_wake, /* UMTX_OP_WAKE */
__umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
__umtx_op_lock_umutex, /* UMTX_OP_MUTEX_LOCK */
__umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
__umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
__umtx_op_cv_wait, /* UMTX_OP_CV_WAIT*/
__umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
__umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
__umtx_op_wait_uint, /* UMTX_OP_WAIT_UINT */
__umtx_op_rw_rdlock, /* UMTX_OP_RW_RDLOCK */
__umtx_op_rw_wrlock, /* UMTX_OP_RW_WRLOCK */
__umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
__umtx_op_wait_uint_private, /* UMTX_OP_WAIT_UINT_PRIVATE */
__umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
__umtx_op_wait_umutex, /* UMTX_OP_UMUTEX_WAIT */
__umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
__umtx_op_sem_wait, /* UMTX_OP_SEM_WAIT */
__umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
__umtx_op_nwake_private /* UMTX_OP_NWAKE_PRIVATE */
};
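/* Top-level umtx system call: dispatch uap->op through op_table. */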
int
-_umtx_op(struct thread *td, struct _umtx_op_args *uap)
+sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
{
if ((unsigned)uap->op < UMTX_OP_MAX)
return (*op_table[uap->op])(td, uap);
return (EINVAL);
}
#ifdef COMPAT_FREEBSD32
int
freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
/* struct umtx *umtx */
{
return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
}
int
freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
/* struct umtx *umtx */
{
return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
}
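/* 32-bit layout of struct timespec used by the compat32 entry points. */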
struct timespec32 {
uint32_t tv_sec;
uint32_t tv_nsec;
};
static inline int
copyin_timeout32(void *addr, struct timespec *tsp)
{
struct timespec32 ts32;
int error;
error = copyin(addr, &ts32, sizeof(struct timespec32));
if (error == 0) {
tsp->tv_sec = ts32.tv_sec;
tsp->tv_nsec = ts32.tv_nsec;
}
return (error);
}
static int
__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
ts = &timeout;
}
return (do_lock_umtx32(td, uap->obj, uap->val, ts));
}
static int
__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
{
return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
}
static int
__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 0);
}
static int
__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, 0);
}
static int
__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
}
static int
__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
}
static int
__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_rdlock(td, uap->obj, uap->val, 0);
} else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
}
return (error);
}
static int
__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL) {
error = do_rw_wrlock(td, uap->obj, 0);
} else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0) {
return (EINVAL);
}
error = do_rw_wrlock2(td, uap->obj, &timeout);
}
return (error);
}
static int
__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return do_wait(td, uap->obj, uap->val, ts, 1, 1);
}
static int
__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
{
struct timespec *ts, timeout;
int error;
/* Allow a null timespec (wait forever). */
if (uap->uaddr2 == NULL)
ts = NULL;
else {
error = copyin_timeout32(uap->uaddr2, &timeout);
if (error != 0)
return (error);
if (timeout.tv_nsec >= 1000000000 ||
timeout.tv_nsec < 0)
return (EINVAL);
ts = &timeout;
}
return (do_sem_wait(td, uap->obj, ts));
}
static int
__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
{
int count = uap->val;
uint32_t uaddrs[BATCH_SIZE];
uint32_t **upp = (uint32_t **)uap->obj;
int tocopy;
int error = 0;
int i, pos = 0;
while (count > 0) {
tocopy = count;
if (tocopy > BATCH_SIZE)
tocopy = BATCH_SIZE;
error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
if (error != 0)
break;
for (i = 0; i < tocopy; ++i)
kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
INT_MAX, 1);
count -= tocopy;
pos += tocopy;
}
return (error);
}
static _umtx_op_func op_table_compat32[] = {
__umtx_op_lock_umtx_compat32, /* UMTX_OP_LOCK */
__umtx_op_unlock_umtx_compat32, /* UMTX_OP_UNLOCK */
__umtx_op_wait_compat32, /* UMTX_OP_WAIT */
__umtx_op_wake, /* UMTX_OP_WAKE */
__umtx_op_trylock_umutex, /* UMTX_OP_MUTEX_TRYLOCK */
__umtx_op_lock_umutex_compat32, /* UMTX_OP_MUTEX_LOCK */
__umtx_op_unlock_umutex, /* UMTX_OP_MUTEX_UNLOCK */
__umtx_op_set_ceiling, /* UMTX_OP_SET_CEILING */
__umtx_op_cv_wait_compat32, /* UMTX_OP_CV_WAIT*/
__umtx_op_cv_signal, /* UMTX_OP_CV_SIGNAL */
__umtx_op_cv_broadcast, /* UMTX_OP_CV_BROADCAST */
__umtx_op_wait_compat32, /* UMTX_OP_WAIT_UINT */
__umtx_op_rw_rdlock_compat32, /* UMTX_OP_RW_RDLOCK */
__umtx_op_rw_wrlock_compat32, /* UMTX_OP_RW_WRLOCK */
__umtx_op_rw_unlock, /* UMTX_OP_RW_UNLOCK */
__umtx_op_wait_uint_private_compat32, /* UMTX_OP_WAIT_UINT_PRIVATE */
__umtx_op_wake_private, /* UMTX_OP_WAKE_PRIVATE */
__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
__umtx_op_wake_umutex, /* UMTX_OP_UMUTEX_WAKE */
__umtx_op_sem_wait_compat32, /* UMTX_OP_SEM_WAIT */
__umtx_op_sem_wake, /* UMTX_OP_SEM_WAKE */
__umtx_op_nwake_private32 /* UMTX_OP_NWAKE_PRIVATE */
};
int
freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
{
if ((unsigned)uap->op < UMTX_OP_MAX)
return (*op_table_compat32[uap->op])(td,
(struct _umtx_op_args *)uap);
return (EINVAL);
}
#endif
void
umtx_thread_init(struct thread *td)
{
td->td_umtxq = umtxq_alloc();
td->td_umtxq->uq_thread = td;
}
void
umtx_thread_fini(struct thread *td)
{
umtxq_free(td->td_umtxq);
}
/*
* Called when a new thread is created, e.g. by fork().
*/
void
umtx_thread_alloc(struct thread *td)
{
struct umtx_q *uq;
uq = td->td_umtxq;
uq->uq_inherited_pri = PRI_MAX;
KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
KASSERT(uq->uq_thread == td, ("uq_thread != td"));
KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
}
/*
* exec() hook.
*/
static void
umtx_exec_hook(void *arg __unused, struct proc *p __unused,
struct image_params *imgp __unused)
{
umtx_thread_cleanup(curthread);
}
/*
* thread_exit() hook.
*/
void
umtx_thread_exit(struct thread *td)
{
umtx_thread_cleanup(td);
}
/*
* clean up umtx data.
*/
static void
umtx_thread_cleanup(struct thread *td)
{
struct umtx_q *uq;
struct umtx_pi *pi;
if ((uq = td->td_umtxq) == NULL)
return;
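/*
* Disown any priority-inheritance mutexes still contested by this
* thread and drop any inherited priority.
*/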
mtx_lock_spin(&umtx_lock);
uq->uq_inherited_pri = PRI_MAX;
while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
pi->pi_owner = NULL;
TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
}
mtx_unlock_spin(&umtx_lock);
thread_lock(td);
sched_lend_user_prio(td, PRI_MAX);
thread_unlock(td);
}
Index: head/sys/kern/kern_uuid.c
===================================================================
--- head/sys/kern/kern_uuid.c (revision 225616)
+++ head/sys/kern/kern_uuid.c (revision 225617)
@@ -1,369 +1,369 @@
/*-
* Copyright (c) 2002 Marcel Moolenaar
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sbuf.h>
#include <sys/socket.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/jail.h>
#include <sys/uuid.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/vnet.h>
/*
* See also:
* http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
* http://www.opengroup.org/onlinepubs/009629399/apdxa.htm
*
* Note that the generator state is itself a UUID, but the time and clock
* sequence fields are written in the native byte order.
*/
CTASSERT(sizeof(struct uuid) == 16);
/* We use an alternative, more convenient representation in the generator. */
struct uuid_private {
union {
uint64_t ll; /* internal. */
struct {
uint32_t low;
uint16_t mid;
uint16_t hi;
} x;
} time;
uint16_t seq; /* Big-endian. */
uint16_t node[UUID_NODE_LEN>>1];
};
CTASSERT(sizeof(struct uuid_private) == 16);
static struct uuid_private uuid_last;
static struct mtx uuid_mutex;
MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
/*
* Return the first MAC address we encounter or, if none was found,
* construct a sufficiently random multicast address. We don't try
* to return the same MAC address as previously returned. We always
* generate a new multicast address if no MAC address exists in the
* system.
* It would be nice to know if 'ifnet' or any of its sub-structures
* has been changed in any way. If not, we could simply skip the
* scan and safely return the MAC address we returned before.
*/
static void
uuid_node(uint16_t *node)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
int i;
CURVNET_SET(TD_TO_VNET(curthread));
IFNET_RLOCK_NOSLEEP();
TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
/* Walk the address list */
IF_ADDR_LOCK(ifp);
TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
sdl = (struct sockaddr_dl*)ifa->ifa_addr;
if (sdl != NULL && sdl->sdl_family == AF_LINK &&
sdl->sdl_type == IFT_ETHER) {
/* Got a MAC address. */
bcopy(LLADDR(sdl), node, UUID_NODE_LEN);
IF_ADDR_UNLOCK(ifp);
IFNET_RUNLOCK_NOSLEEP();
CURVNET_RESTORE();
return;
}
}
IF_ADDR_UNLOCK(ifp);
}
IFNET_RUNLOCK_NOSLEEP();
for (i = 0; i < (UUID_NODE_LEN>>1); i++)
node[i] = (uint16_t)arc4random();
*((uint8_t*)node) |= 0x01;
CURVNET_RESTORE();
}
/*
* Get the current time as a 60 bit count of 100-nanosecond intervals
* since 00:00:00.00, October 15, 1582. We apply a magic offset to convert
* the Unix time since 00:00:00.00, January 1, 1970 to the date of the
* Gregorian reform to the Christian calendar.
*/
static uint64_t
uuid_time(void)
{
struct bintime bt;
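/* 0x01B21DD213814000 = 100ns intervals from 1582-10-15 to 1970-01-01. */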
uint64_t time = 0x01B21DD213814000LL;
bintime(&bt);
time += (uint64_t)bt.sec * 10000000LL;
time += (10000000LL * (uint32_t)(bt.frac >> 32)) >> 32;
return (time & ((1LL << 60) - 1LL));
}
struct uuid *
kern_uuidgen(struct uuid *store, size_t count)
{
struct uuid_private uuid;
uint64_t time;
size_t n;
mtx_lock(&uuid_mutex);
uuid_node(uuid.node);
time = uuid_time();
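/*
* Pick a fresh random clock sequence if this is the first UUID or the
* node changed; bump the sequence if time did not move forward.
*/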
if (uuid_last.time.ll == 0LL || uuid_last.node[0] != uuid.node[0] ||
uuid_last.node[1] != uuid.node[1] ||
uuid_last.node[2] != uuid.node[2])
uuid.seq = (uint16_t)arc4random() & 0x3fff;
else if (uuid_last.time.ll >= time)
uuid.seq = (uuid_last.seq + 1) & 0x3fff;
else
uuid.seq = uuid_last.seq;
uuid_last = uuid;
uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
mtx_unlock(&uuid_mutex);
/* Set sequence and variant and deal with byte order. */
uuid.seq = htobe16(uuid.seq | 0x8000);
for (n = 0; n < count; n++) {
/* Set time and version (=1). */
uuid.time.x.low = (uint32_t)time;
uuid.time.x.mid = (uint16_t)(time >> 32);
uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
store[n] = *(struct uuid *)&uuid;
time++;
}
return (store);
}
#ifndef _SYS_SYSPROTO_H_
struct uuidgen_args {
struct uuid *store;
int count;
};
#endif
int
-uuidgen(struct thread *td, struct uuidgen_args *uap)
+sys_uuidgen(struct thread *td, struct uuidgen_args *uap)
{
struct uuid *store;
size_t count;
int error;
/*
* Limit the number of UUIDs that can be created at the same time
* to some arbitrary number. This isn't really necessary, but I
* like to have some sort of upper-bound that's less than 2G :-)
* XXX probably needs to be tunable.
*/
if (uap->count < 1 || uap->count > 2048)
return (EINVAL);
count = uap->count;
store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
kern_uuidgen(store, count);
error = copyout(store, uap->store, count * sizeof(struct uuid));
free(store, M_TEMP);
return (error);
}
int
snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
{
struct uuid_private *id;
int cnt;
id = (struct uuid_private *)uuid;
cnt = snprintf(buf, sz, "%08x-%04x-%04x-%04x-%04x%04x%04x",
id->time.x.low, id->time.x.mid, id->time.x.hi, be16toh(id->seq),
be16toh(id->node[0]), be16toh(id->node[1]), be16toh(id->node[2]));
return (cnt);
}
int
printf_uuid(struct uuid *uuid)
{
char buf[38];
snprintf_uuid(buf, sizeof(buf), uuid);
return (printf("%s", buf));
}
int
sbuf_printf_uuid(struct sbuf *sb, struct uuid *uuid)
{
char buf[38];
snprintf_uuid(buf, sizeof(buf), uuid);
return (sbuf_printf(sb, "%s", buf));
}
/*
* Encode/Decode UUID into byte-stream.
* http://www.opengroup.org/dce/info/draft-leach-uuids-guids-01.txt
*
* 0 1 2 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | time_low |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | time_mid | time_hi_and_version |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |clk_seq_hi_res | clk_seq_low | node (0-1) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | node (2-5) |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
void
le_uuid_enc(void *buf, struct uuid const *uuid)
{
u_char *p;
int i;
p = buf;
le32enc(p, uuid->time_low);
le16enc(p + 4, uuid->time_mid);
le16enc(p + 6, uuid->time_hi_and_version);
p[8] = uuid->clock_seq_hi_and_reserved;
p[9] = uuid->clock_seq_low;
for (i = 0; i < _UUID_NODE_LEN; i++)
p[10 + i] = uuid->node[i];
}
void
le_uuid_dec(void const *buf, struct uuid *uuid)
{
u_char const *p;
int i;
p = buf;
uuid->time_low = le32dec(p);
uuid->time_mid = le16dec(p + 4);
uuid->time_hi_and_version = le16dec(p + 6);
uuid->clock_seq_hi_and_reserved = p[8];
uuid->clock_seq_low = p[9];
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
void
be_uuid_enc(void *buf, struct uuid const *uuid)
{
u_char *p;
int i;
p = buf;
be32enc(p, uuid->time_low);
be16enc(p + 4, uuid->time_mid);
be16enc(p + 6, uuid->time_hi_and_version);
p[8] = uuid->clock_seq_hi_and_reserved;
p[9] = uuid->clock_seq_low;
for (i = 0; i < _UUID_NODE_LEN; i++)
p[10 + i] = uuid->node[i];
}
void
be_uuid_dec(void const *buf, struct uuid *uuid)
{
u_char const *p;
int i;
p = buf;
uuid->time_low = be32dec(p);
uuid->time_mid = be16dec(p + 4);
uuid->time_hi_and_version = be16dec(p + 6);
uuid->clock_seq_hi_and_reserved = p[8];
uuid->clock_seq_low = p[9];
for (i = 0; i < _UUID_NODE_LEN; i++)
uuid->node[i] = p[10 + i];
}
int
parse_uuid(const char *str, struct uuid *uuid)
{
u_int c[11];
int n;
/* An empty string represents a nil UUID. */
if (*str == '\0') {
bzero(uuid, sizeof(*uuid));
return (0);
}
/* The UUID string representation has a fixed length. */
if (strlen(str) != 36)
return (EINVAL);
/*
* We only work with "new" UUIDs. New UUIDs have the form:
* 01234567-89ab-cdef-0123-456789abcdef
* The so called "old" UUIDs, which we don't support, have the form:
* 0123456789ab.cd.ef.01.23.45.67.89.ab
*/
if (str[8] != '-')
return (EINVAL);
n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
/* Make sure we have all conversions. */
if (n != 11)
return (EINVAL);
/* Successful scan. Build the UUID. */
uuid->time_low = c[0];
uuid->time_mid = c[1];
uuid->time_hi_and_version = c[2];
uuid->clock_seq_hi_and_reserved = c[3];
uuid->clock_seq_low = c[4];
for (n = 0; n < 6; n++)
uuid->node[n] = c[n + 5];
/* Check semantics... */
return (((c[3] & 0x80) != 0x00 && /* variant 0? */
(c[3] & 0xc0) != 0x80 && /* variant 1? */
(c[3] & 0xe0) != 0xc0) ? EINVAL : 0); /* variant 2? */
}
Index: head/sys/kern/makesyscalls.sh
===================================================================
--- head/sys/kern/makesyscalls.sh (revision 225616)
+++ head/sys/kern/makesyscalls.sh (revision 225617)
@@ -1,620 +1,636 @@
#! /bin/sh -
# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
# $FreeBSD$
set -e
# name of compat options:
compat=COMPAT_43
compat4=COMPAT_FREEBSD4
compat6=COMPAT_FREEBSD6
compat7=COMPAT_FREEBSD7
# output files:
sysnames="syscalls.c"
sysproto="../sys/sysproto.h"
sysproto_h=_SYS_SYSPROTO_H_
syshdr="../sys/syscall.h"
sysmk="../sys/syscall.mk"
syssw="init_sysent.c"
syscallprefix="SYS_"
switchname="sysent"
namesname="syscallnames"
systrace="systrace_args.c"
# tmp files:
sysaue="sysent.aue.$$"
sysdcl="sysent.dcl.$$"
syscompat="sysent.compat.$$"
syscompatdcl="sysent.compatdcl.$$"
syscompat4="sysent.compat4.$$"
syscompat4dcl="sysent.compat4dcl.$$"
syscompat6="sysent.compat6.$$"
syscompat6dcl="sysent.compat6dcl.$$"
syscompat7="sysent.compat7.$$"
syscompat7dcl="sysent.compat7dcl.$$"
sysent="sysent.switch.$$"
sysinc="sysinc.switch.$$"
sysarg="sysarg.switch.$$"
sysprotoend="sysprotoend.$$"
systracetmp="systrace.$$"
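# Syscalls named in capabilities.conf get the SYF_CAPENABLED flag in the
# generated sysent table.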
if [ -r capabilities.conf ]; then
capenabled=`cat capabilities.conf | grep -v "^#" | grep -v "^$"`
capenabled=`echo $capenabled | sed 's/ /,/g'`
else
capenabled=""
fi
trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp" 0
touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $syscompat7 $syscompat7dcl $sysent $sysinc $sysarg $sysprotoend $systracetmp
case $# in
0) echo "usage: $0 input-file <config-file>" 1>&2
exit 1
;;
esac
if [ -n "$2" -a -f "$2" ]; then
. $2
fi
sed -e '
s/\$//g
:join
/\\$/{a\
N
s/\\\n//
b join
}
2,${
/^#/!s/\([{}()*,]\)/ \1 /g
}
' < $1 | awk "
BEGIN {
sysaue = \"$sysaue\"
sysdcl = \"$sysdcl\"
sysproto = \"$sysproto\"
sysprotoend = \"$sysprotoend\"
sysproto_h = \"$sysproto_h\"
syscompat = \"$syscompat\"
syscompatdcl = \"$syscompatdcl\"
syscompat4 = \"$syscompat4\"
syscompat4dcl = \"$syscompat4dcl\"
syscompat6 = \"$syscompat6\"
syscompat6dcl = \"$syscompat6dcl\"
syscompat7 = \"$syscompat7\"
syscompat7dcl = \"$syscompat7dcl\"
sysent = \"$sysent\"
syssw = \"$syssw\"
sysinc = \"$sysinc\"
sysarg = \"$sysarg\"
sysnames = \"$sysnames\"
syshdr = \"$syshdr\"
sysmk = \"$sysmk\"
systrace = \"$systrace\"
systracetmp = \"$systracetmp\"
compat = \"$compat\"
compat4 = \"$compat4\"
compat6 = \"$compat6\"
compat7 = \"$compat7\"
syscallprefix = \"$syscallprefix\"
switchname = \"$switchname\"
namesname = \"$namesname\"
infile = \"$1\"
capenabled_string = \"$capenabled\"
"'
split(capenabled_string, capenabled, ",");
printf "/*\n * System call switch table.\n *\n" > syssw
printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw
printf " * $%s$\n", "FreeBSD" > syssw
printf "/*\n * System call prototypes.\n *\n" > sysarg
printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg
printf " * $%s$\n", "FreeBSD" > sysarg
printf "\n#ifdef %s\n\n", compat > syscompat
printf "\n#ifdef %s\n\n", compat4 > syscompat4
printf "\n#ifdef %s\n\n", compat6 > syscompat6
printf "\n#ifdef %s\n\n", compat7 > syscompat7
printf "/*\n * System call names.\n *\n" > sysnames
printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
printf " * $%s$\n", "FreeBSD" > sysnames
printf "/*\n * System call numbers.\n *\n" > syshdr
printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
printf " * $%s$\n", "FreeBSD" > syshdr
printf "# FreeBSD system call names.\n" > sysmk
printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
printf "# $%s$\n", "FreeBSD" > sysmk
printf "/*\n * System call argument to DTrace register array conversion.\n *\n" > systrace
printf " * DO NOT EDIT-- this file is automatically generated.\n" > systrace
printf " * $%s$\n", "FreeBSD" > systrace
}
NR == 1 {
gsub("[$]FreeBSD: ", "", $0)
gsub(" [$]", "", $0)
printf " * created from%s\n */\n\n", $0 > syssw
printf "\n/* The casts are bogus but will do for now. */\n" > sysent
printf "struct sysent %s[] = {\n",switchname > sysent
printf " * created from%s\n */\n\n", $0 > sysarg
printf "#ifndef %s\n", sysproto_h > sysarg
printf "#define\t%s\n\n", sysproto_h > sysarg
printf "#include <sys/signal.h>\n" > sysarg
printf "#include <sys/acl.h>\n" > sysarg
printf "#include <sys/cpuset.h>\n" > sysarg
printf "#include <sys/_semaphore.h>\n" > sysarg
printf "#include <sys/ucontext.h>\n\n" > sysarg
printf "#include <bsm/audit_kevents.h>\n\n" > sysarg
printf "struct proc;\n\n" > sysarg
printf "struct thread;\n\n" > sysarg
printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg
printf "#if BYTE_ORDER == LITTLE_ENDIAN\n"> sysarg
printf "#define\tPADL_(t)\t0\n" > sysarg
printf "#define\tPADR_(t)\tPAD_(t)\n" > sysarg
printf "#else\n" > sysarg
printf "#define\tPADL_(t)\tPAD_(t)\n" > sysarg
printf "#define\tPADR_(t)\t0\n" > sysarg
printf "#endif\n\n" > sysarg
printf " * created from%s\n */\n\n", $0 > sysnames
printf "const char *%s[] = {\n", namesname > sysnames
printf " * created from%s\n */\n\n", $0 > syshdr
printf "# created from%s\nMIASM = ", $0 > sysmk
printf " * This file is part of the DTrace syscall provider.\n */\n\n" > systrace
printf "static void\nsystrace_args(int sysnum, void *params, uint64_t *uarg, int *n_args)\n{\n" > systrace
printf "\tint64_t *iarg = (int64_t *) uarg;\n" > systrace
printf "\tswitch (sysnum) {\n" > systrace
printf "static void\nsystrace_setargdesc(int sysnum, int ndx, char *desc, size_t descsz)\n{\n\tconst char *p = NULL;\n" > systracetmp
printf "\tswitch (sysnum) {\n" > systracetmp
next
}
NF == 0 || $1 ~ /^;/ {
next
}
$1 ~ /^#[ ]*include/ {
print > sysinc
next
}
$1 ~ /^#[ ]*if/ {
print > sysent
print > sysdcl
print > sysarg
print > syscompat
print > syscompat4
print > syscompat6
print > syscompat7
print > sysnames
print > systrace
print > systracetmp
savesyscall = syscall
next
}
$1 ~ /^#[ ]*else/ {
print > sysent
print > sysdcl
print > sysarg
print > syscompat
print > syscompat4
print > syscompat6
print > syscompat7
print > sysnames
print > systrace
print > systracetmp
syscall = savesyscall
next
}
$1 ~ /^#/ {
print > sysent
print > sysdcl
print > sysarg
print > syscompat
print > syscompat4
print > syscompat6
print > syscompat7
print > sysnames
print > systrace
print > systracetmp
next
}
syscall != $1 {
printf "%s: line %d: syscall number out of sync at %d\n",
infile, NR, syscall
printf "line is:\n"
print
exit 1
}
# Returns true if the type "name" is the first flag in the type field
function type(name, flags, n) {
n = split($3, flags, /\|/)
return (n > 0 && flags[1] == name)
}
# Returns true if the flag "name" is set in the type field
function flag(name, flags, i, n) {
n = split($3, flags, /\|/)
for (i = 1; i <= n; i++)
if (flags[i] == name)
return 1
return 0
}
function align_sysent_comment(column) {
printf("\t") > sysent
column = column + 8 - column % 8
while (column < 56) {
printf("\t") > sysent
column = column + 8
}
}
function parserr(was, wanted) {
printf "%s: line %d: unexpected %s (expected %s)\n",
infile, NR, was, wanted
exit 1
}
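# Parse one syscall definition line, filling in funcname, the argument
# type/name arrays and the argument-structure alias used for sysproto.h.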
function parseline() {
f=4 # toss number, type, audit event
argc= 0;
argssize = "0"
thr_flag = "SY_THR_STATIC"
if (flag("NOTSTATIC")) {
thr_flag = "SY_THR_ABSENT"
}
if ($NF != "}") {
funcalias=$(NF-2)
argalias=$(NF-1)
rettype=$NF
end=NF-3
} else {
funcalias=""
argalias=""
rettype="int"
end=NF
}
if (flag("NODEF")) {
auditev="AUE_NULL"
funcname=$4
argssize = "AS(" $6 ")"
return
}
if ($f != "{")
parserr($f, "{")
f++
if ($end != "}")
parserr($end, "}")
end--
if ($end != ";")
parserr($end, ";")
end--
if ($end != ")")
parserr($end, ")")
end--
f++ #function return type
funcname=$f
#
# We now know the func name, so define a flags field for it.
# Do this before any other processing as we may return early
# from it.
#
for (cap in capenabled) {
if (funcname == capenabled[cap]) {
flags = "SYF_CAPENABLED";
}
}
if (funcalias == "")
funcalias = funcname
if (argalias == "") {
argalias = funcname "_args"
if (flag("COMPAT"))
argalias = "o" argalias
if (flag("COMPAT4"))
argalias = "freebsd4_" argalias
if (flag("COMPAT6"))
argalias = "freebsd6_" argalias
if (flag("COMPAT7"))
argalias = "freebsd7_" argalias
}
f++
if ($f != "(")
parserr($f, "(")
f++
if (f == end) {
if ($f != "void")
parserr($f, "argument definition")
return
}
while (f <= end) {
argc++
argtype[argc]=""
oldf=""
while (f < end && $(f+1) != ",") {
if (argtype[argc] != "" && oldf != "*")
argtype[argc] = argtype[argc]" ";
argtype[argc] = argtype[argc]$f;
oldf = $f;
f++
}
if (argtype[argc] == "")
parserr($f, "argument definition")
argname[argc]=$f;
f += 2; # skip name, and any comma
}
if (argc != 0)
argssize = "AS(" argalias ")"
}
{ comment = $4
if (NF < 7)
for (i = 5; i <= NF; i++)
comment = comment " " $i
}
#
# The AUE_ audit event identifier.
#
{
auditev = $2;
}
#
# The flags, if any.
#
{
flags = "0";
}
type("STD") || type("NODEF") || type("NOARGS") || type("NOPROTO") \
|| type("NOSTD") {
parseline()
printf("\t/* %s */\n\tcase %d: {\n", funcname, syscall) > systrace
printf("\t/* %s */\n\tcase %d:\n", funcname, syscall) > systracetmp
if (argc > 0) {
printf("\t\tswitch(ndx) {\n") > systracetmp
printf("\t\tstruct %s *p = params;\n", argalias) > systrace
for (i = 1; i <= argc; i++) {
printf("\t\tcase %d:\n\t\t\tp = \"%s\";\n\t\t\tbreak;\n", i - 1, argtype[i]) > systracetmp
if (index(argtype[i], "*") > 0 || argtype[i] == "caddr_t")
printf("\t\tuarg[%d] = (intptr_t) p->%s; /* %s */\n", \
i - 1, \
argname[i], argtype[i]) > systrace
else if (substr(argtype[i], 1, 1) == "u" || argtype[i] == "size_t")
printf("\t\tuarg[%d] = p->%s; /* %s */\n", \
i - 1, \
argname[i], argtype[i]) > systrace
else
printf("\t\tiarg[%d] = p->%s; /* %s */\n", \
i - 1, \
argname[i], argtype[i]) > systrace
}
printf("\t\tdefault:\n\t\t\tbreak;\n\t\t};\n") > systracetmp
}
printf("\t\t*n_args = %d;\n\t\tbreak;\n\t}\n", argc) > systrace
printf("\t\tbreak;\n") > systracetmp
if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
!flag("NODEF")) {
printf("struct %s {\n", argalias) > sysarg
for (i = 1; i <= argc; i++)
printf("\tchar %s_l_[PADL_(%s)]; " \
"%s %s; char %s_r_[PADR_(%s)];\n",
argname[i], argtype[i],
argtype[i], argname[i],
argname[i], argtype[i]) > sysarg
printf("};\n") > sysarg
}
else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
printf("struct %s {\n\tregister_t dummy;\n};\n",
argalias) > sysarg
if (!flag("NOPROTO") && !flag("NODEF")) {
- printf("%s\t%s(struct thread *, struct %s *)",
- rettype, funcname, argalias) > sysdcl
+ if (funcname == "nosys" || funcname == "lkmnosys" ||
+ funcname == "sysarch" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s\t%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ } else {
+ printf("%s\tsys_%s(struct thread *, struct %s *)",
+ rettype, funcname, argalias) > sysdcl
+ }
printf(";\n") > sysdcl
printf("#define\t%sAUE_%s\t%s\n", syscallprefix,
funcalias, auditev) > sysaue
}
printf("\t{ %s, (sy_call_t *)", argssize) > sysent
column = 8 + 2 + length(argssize) + 15
if (flag("NOSTD")) {
printf("%s },", "lkmressys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT") > sysent
column = column + length("lkmressys") + length("AUE_NULL") + 3
} else {
- printf("%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
- column = column + length(funcname) + length(auditev) + length(flags) + 3
+ if (funcname == "nosys" || funcname == "sysarch" ||
+ funcname == "lkmnosys" || funcname ~ /^freebsd/ ||
+ funcname ~ /^linux/ || funcname ~ /^svr4/ ||
+ funcname ~ /^ibcs2/ || funcname ~ /^xenix/) {
+ printf("%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3
+ } else {
+ printf("sys_%s, %s, NULL, 0, 0, %s, %s },", funcname, auditev, flags, thr_flag) > sysent
+ column = column + length(funcname) + length(auditev) + length(flags) + 3 + 4
+ }
}
align_sysent_comment(column)
printf("/* %d = %s */\n", syscall, funcalias) > sysent
printf("\t\"%s\",\t\t\t/* %d = %s */\n",
funcalias, syscall, funcalias) > sysnames
if (!flag("NODEF")) {
printf("#define\t%s%s\t%d\n", syscallprefix,
funcalias, syscall) > syshdr
printf(" \\\n\t%s.o", funcalias) > sysmk
}
syscall++
next
}
type("COMPAT") || type("COMPAT4") || type("COMPAT6") || \
type("COMPAT7") {
if (flag("COMPAT")) {
ncompat++
out = syscompat
outdcl = syscompatdcl
wrap = "compat"
prefix = "o"
descr = "old"
} else if (flag("COMPAT4")) {
ncompat4++
out = syscompat4
outdcl = syscompat4dcl
wrap = "compat4"
prefix = "freebsd4_"
descr = "freebsd4"
} else if (flag("COMPAT6")) {
ncompat6++
out = syscompat6
outdcl = syscompat6dcl
wrap = "compat6"
prefix = "freebsd6_"
descr = "freebsd6"
} else if (flag("COMPAT7")) {
ncompat7++
out = syscompat7
outdcl = syscompat7dcl
wrap = "compat7"
prefix = "freebsd7_"
descr = "freebsd7"
}
parseline()
if (argc != 0 && !flag("NOARGS") && !flag("NOPROTO") && \
!flag("NODEF")) {
printf("struct %s {\n", argalias) > out
for (i = 1; i <= argc; i++)
printf("\tchar %s_l_[PADL_(%s)]; %s %s; " \
"char %s_r_[PADR_(%s)];\n",
argname[i], argtype[i],
argtype[i], argname[i],
argname[i], argtype[i]) > out
printf("};\n") > out
}
else if (!flag("NOARGS") && !flag("NOPROTO") && !flag("NODEF"))
printf("struct %s {\n\tregister_t dummy;\n};\n",
argalias) > sysarg
if (!flag("NOPROTO") && !flag("NODEF")) {
printf("%s\t%s%s(struct thread *, struct %s *);\n",
rettype, prefix, funcname, argalias) > outdcl
printf("#define\t%sAUE_%s%s\t%s\n", syscallprefix,
prefix, funcname, auditev) > sysaue
}
if (flag("NOSTD")) {
printf("\t{ %s, (sy_call_t *)%s, %s, NULL, 0, 0, 0, SY_THR_ABSENT },",
"0", "lkmressys", "AUE_NULL") > sysent
align_sysent_comment(8 + 2 + length("0") + 15 + \
length("lkmressys") + length("AUE_NULL") + 3)
} else {
printf("\t{ %s(%s,%s), %s, NULL, 0, 0, %s, %s },",
wrap, argssize, funcname, auditev, flags, thr_flag) > sysent
align_sysent_comment(8 + 9 + length(argssize) + 1 + \
length(funcname) + length(auditev) + \
length(flags) + 4)
}
printf("/* %d = %s %s */\n", syscall, descr, funcalias) > sysent
printf("\t\"%s.%s\",\t\t/* %d = %s %s */\n",
wrap, funcalias, syscall, descr, funcalias) > sysnames
if (flag("COMPAT")) {
printf("\t\t\t\t/* %d is old %s */\n",
syscall, funcalias) > syshdr
} else if (!flag("NODEF")) {
printf("#define\t%s%s%s\t%d\n", syscallprefix,
prefix, funcalias, syscall) > syshdr
printf(" \\\n\t%s%s.o", prefix, funcalias) > sysmk
}
syscall++
next
}
type("OBSOL") {
printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },") > sysent
align_sysent_comment(34)
printf("/* %d = obsolete %s */\n", syscall, comment) > sysent
printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
$4, syscall, comment) > sysnames
printf("\t\t\t\t/* %d is obsolete %s */\n",
syscall, comment) > syshdr
syscall++
next
}
type("UNIMPL") {
printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0, 0, SY_THR_ABSENT },\t\t\t/* %d = %s */\n",
syscall, comment) > sysent
printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
syscall, syscall, comment) > sysnames
syscall++
next
}
{
printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $3
exit 1
}
END {
printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc
if (ncompat != 0 || ncompat4 != 0 || ncompat6 != 0 || ncompat7 != 0)
printf "#include \"opt_compat.h\"\n\n" > syssw
if (ncompat != 0) {
printf "\n#ifdef %s\n", compat > sysinc
printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
if (ncompat4 != 0) {
printf "\n#ifdef %s\n", compat4 > sysinc
printf "#define compat4(n, name) n, (sy_call_t *)__CONCAT(freebsd4_,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat4(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
if (ncompat6 != 0) {
printf "\n#ifdef %s\n", compat6 > sysinc
printf "#define compat6(n, name) n, (sy_call_t *)__CONCAT(freebsd6_,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat6(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
if (ncompat7 != 0) {
printf "\n#ifdef %s\n", compat7 > sysinc
printf "#define compat7(n, name) n, (sy_call_t *)__CONCAT(freebsd7_,name)\n" > sysinc
printf "#else\n" > sysinc
printf "#define compat7(n, name) 0, (sy_call_t *)nosys\n" > sysinc
printf "#endif\n" > sysinc
}
printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
printf("\n#endif /* %s */\n\n", compat4) > syscompat4dcl
printf("\n#endif /* %s */\n\n", compat6) > syscompat6dcl
printf("\n#endif /* %s */\n\n", compat7) > syscompat7dcl
printf("\n#undef PAD_\n") > sysprotoend
printf("#undef PADL_\n") > sysprotoend
printf("#undef PADR_\n") > sysprotoend
printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend
printf("\n") > sysmk
printf("};\n") > sysent
printf("};\n") > sysnames
printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
> syshdr
printf "\tdefault:\n\t\t*n_args = 0;\n\t\tbreak;\n\t};\n}\n" > systrace
printf "\tdefault:\n\t\tbreak;\n\t};\n\tif (p != NULL)\n\t\tstrlcpy(desc, p, descsz);\n}\n" > systracetmp
} '
cat $sysinc $sysent >> $syssw
cat $sysarg $sysdcl \
$syscompat $syscompatdcl \
$syscompat4 $syscompat4dcl \
$syscompat6 $syscompat6dcl \
$syscompat7 $syscompat7dcl \
$sysaue $sysprotoend > $sysproto
cat $systracetmp >> $systrace
Index: head/sys/kern/p1003_1b.c
===================================================================
--- head/sys/kern/p1003_1b.c (revision 225616)
+++ head/sys/kern/p1003_1b.c (revision 225617)
@@ -1,315 +1,315 @@
/*-
* Copyright (c) 1996, 1997, 1998
* HD Associates, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by HD Associates, Inc
* 4. Neither the name of the author nor the names of any co-contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* p1003_1b: Real Time common code.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_posix.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
/* The system calls return ENOSYS if an entry is called that is not supported
* at run time. I am also logging, since some programs start to use these
* entries when they shouldn't. That will be removed if it becomes annoying.
*/
int
syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
{
log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
td->td_name, td->td_proc->p_pid, s);
/* a " return nosys(p, uap); " here causes a core dump.
*/
return ENOSYS;
}
#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
/* Not configured but loadable via a module:
*/
static int
sched_attach(void)
{
return 0;
}
SYSCALL_NOT_PRESENT_GEN(sched_setparam)
SYSCALL_NOT_PRESENT_GEN(sched_getparam)
SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
SYSCALL_NOT_PRESENT_GEN(sched_yield)
SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
#else
/* Configured in kernel version:
*/
static struct ksched *ksched;
static int
sched_attach(void)
{
int ret = ksched_attach(&ksched);
if (ret == 0)
p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 200112L);
return ret;
}
int
-sched_setparam(struct thread *td, struct sched_setparam_args *uap)
+sys_sched_setparam(struct thread *td, struct sched_setparam_args *uap)
{
struct thread *targettd;
struct proc *targetp;
int e;
struct sched_param sched_param;
e = copyin(uap->param, &sched_param, sizeof(sched_param));
if (e)
return (e);
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
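/* pfind() returns the target process with its lock held. */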
targetp = pfind(uap->pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansched(td, targetp);
if (e == 0) {
e = ksched_setparam(ksched, targettd,
(const struct sched_param *)&sched_param);
}
PROC_UNLOCK(targetp);
return (e);
}
int
-sched_getparam(struct thread *td, struct sched_getparam_args *uap)
+sys_sched_getparam(struct thread *td, struct sched_getparam_args *uap)
{
int e;
struct sched_param sched_param;
struct thread *targettd;
struct proc *targetp;
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
targetp = pfind(uap->pid);
if (targetp == NULL) {
return (ESRCH);
}
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansee(td, targetp);
if (e == 0) {
e = ksched_getparam(ksched, targettd, &sched_param);
}
PROC_UNLOCK(targetp);
if (e == 0)
e = copyout(&sched_param, uap->param, sizeof(sched_param));
return (e);
}
int
-sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
+sys_sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
{
int e;
struct sched_param sched_param;
struct thread *targettd;
struct proc *targetp;
/* Don't allow non root user to set a scheduler policy. */
e = priv_check(td, PRIV_SCHED_SET);
if (e)
return (e);
e = copyin(uap->param, &sched_param, sizeof(sched_param));
if (e)
return (e);
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
targetp = pfind(uap->pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansched(td, targetp);
if (e == 0) {
e = ksched_setscheduler(ksched, targettd,
uap->policy, (const struct sched_param *)&sched_param);
}
PROC_UNLOCK(targetp);
return (e);
}
int
-sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
+sys_sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
{
int e, policy;
struct thread *targettd;
struct proc *targetp;
if (uap->pid == 0) {
targetp = td->td_proc;
targettd = td;
PROC_LOCK(targetp);
} else {
targetp = pfind(uap->pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansee(td, targetp);
if (e == 0) {
e = ksched_getscheduler(ksched, targettd, &policy);
td->td_retval[0] = policy;
}
PROC_UNLOCK(targetp);
return (e);
}
int
-sched_yield(struct thread *td, struct sched_yield_args *uap)
+sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
{
sched_relinquish(curthread);
return 0;
}
int
-sched_get_priority_max(struct thread *td,
+sys_sched_get_priority_max(struct thread *td,
struct sched_get_priority_max_args *uap)
{
int error, prio;
error = ksched_get_priority_max(ksched, uap->policy, &prio);
td->td_retval[0] = prio;
return (error);
}
int
-sched_get_priority_min(struct thread *td,
+sys_sched_get_priority_min(struct thread *td,
struct sched_get_priority_min_args *uap)
{
int error, prio;
error = ksched_get_priority_min(ksched, uap->policy, &prio);
td->td_retval[0] = prio;
return (error);
}
int
-sched_rr_get_interval(struct thread *td,
+sys_sched_rr_get_interval(struct thread *td,
struct sched_rr_get_interval_args *uap)
{
struct timespec timespec;
int error;
error = kern_sched_rr_get_interval(td, uap->pid, &timespec);
if (error == 0)
error = copyout(&timespec, uap->interval, sizeof(timespec));
return (error);
}
int
kern_sched_rr_get_interval(struct thread *td, pid_t pid,
struct timespec *ts)
{
int e;
struct thread *targettd;
struct proc *targetp;
if (pid == 0) {
targettd = td;
targetp = td->td_proc;
PROC_LOCK(targetp);
} else {
targetp = pfind(pid);
if (targetp == NULL)
return (ESRCH);
targettd = FIRST_THREAD_IN_PROC(targetp);
}
e = p_cansee(td, targetp);
if (e == 0)
e = ksched_rr_get_interval(ksched, targettd, ts);
PROC_UNLOCK(targetp);
return (e);
}
#endif
static void
p31binit(void *notused)
{
(void) sched_attach();
p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
}
SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
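/*
 * Illustrative sketch (not part of this file): how the sys_sched_* entry
 * points above are reached from userland through the standard POSIX
 * interfaces.  Assumes a kernel built with _KPOSIX_PRIORITY_SCHEDULING;
 * otherwise the calls fail with ENOSYS via syscall_not_present() above.
 */
#if 0
#include <sched.h>
#include <stdio.h>

int
main(void)
{
	struct sched_param sp;

	sp.sched_priority = sched_get_priority_min(SCHED_FIFO);
	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		perror("sched_setscheduler");	/* EPERM without privilege */
	return (0);
}
#endif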
Index: head/sys/kern/subr_bus.c
===================================================================
--- head/sys/kern/subr_bus.c (revision 225616)
+++ head/sys/kern/subr_bus.c (revision 225617)
@@ -1,4742 +1,4742 @@
/*-
* Copyright (c) 1997,1998,2003 Doug Rabson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_bus.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/filio.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/condvar.h>
#include <sys/queue.h>
#include <machine/bus.h>
#include <sys/rman.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <machine/stdarg.h>
#include <vm/uma.h>
SYSCTL_NODE(_hw, OID_AUTO, bus, CTLFLAG_RW, NULL, NULL);
SYSCTL_NODE(, OID_AUTO, dev, CTLFLAG_RW, NULL, NULL);
/*
* Used to attach drivers to devclasses.
*/
typedef struct driverlink *driverlink_t;
struct driverlink {
kobj_class_t driver;
TAILQ_ENTRY(driverlink) link; /* list of drivers in devclass */
int pass;
TAILQ_ENTRY(driverlink) passlink;
};
/*
* Forward declarations
*/
typedef TAILQ_HEAD(devclass_list, devclass) devclass_list_t;
typedef TAILQ_HEAD(driver_list, driverlink) driver_list_t;
typedef TAILQ_HEAD(device_list, device) device_list_t;
struct devclass {
TAILQ_ENTRY(devclass) link;
devclass_t parent; /* parent in devclass hierarchy */
driver_list_t drivers; /* bus devclasses store drivers for bus */
char *name;
device_t *devices; /* array of devices indexed by unit */
int maxunit; /* size of devices array */
int flags;
#define DC_HAS_CHILDREN 1
struct sysctl_ctx_list sysctl_ctx;
struct sysctl_oid *sysctl_tree;
};
/**
* @brief Implementation of device.
*/
struct device {
/*
* A device is a kernel object. The first field must be the
* current ops table for the object.
*/
KOBJ_FIELDS;
/*
* Device hierarchy.
*/
TAILQ_ENTRY(device) link; /**< list of devices in parent */
TAILQ_ENTRY(device) devlink; /**< global device list membership */
device_t parent; /**< parent of this device */
device_list_t children; /**< list of child devices */
/*
* Details of this device.
*/
driver_t *driver; /**< current driver */
devclass_t devclass; /**< current device class */
int unit; /**< current unit number */
char* nameunit; /**< name+unit e.g. foodev0 */
char* desc; /**< driver specific description */
int busy; /**< count of calls to device_busy() */
device_state_t state; /**< current device state */
uint32_t devflags; /**< api level flags for device_get_flags() */
u_int flags; /**< internal device flags */
#define DF_ENABLED 0x01 /* device should be probed/attached */
#define DF_FIXEDCLASS 0x02 /* devclass specified at create time */
#define DF_WILDCARD 0x04 /* unit was originally wildcard */
#define DF_DESCMALLOCED 0x08 /* description was malloced */
#define DF_QUIET 0x10 /* don't print verbose attach message */
#define DF_DONENOMATCH 0x20 /* don't execute DEVICE_NOMATCH again */
#define DF_EXTERNALSOFTC 0x40 /* softc not allocated by us */
#define DF_REBID 0x80 /* Can rebid after attach */
u_int order; /**< order from device_add_child_ordered() */
void *ivars; /**< instance variables */
void *softc; /**< current driver's variables */
struct sysctl_ctx_list sysctl_ctx; /**< state for sysctl variables */
struct sysctl_oid *sysctl_tree; /**< state for sysctl variables */
};
static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc");
#ifdef BUS_DEBUG
static int bus_debug = 1;
TUNABLE_INT("bus.debug", &bus_debug);
SYSCTL_INT(_debug, OID_AUTO, bus_debug, CTLFLAG_RW, &bus_debug, 0,
"Debug bus code");
#define PDEBUG(a) if (bus_debug) {printf("%s:%d: ", __func__, __LINE__), printf a; printf("\n");}
#define DEVICENAME(d) ((d)? device_get_name(d): "no device")
#define DRIVERNAME(d) ((d)? d->name : "no driver")
#define DEVCLANAME(d) ((d)? d->name : "no devclass")
/**
* Produce the indenting, indent*2 spaces plus a '.' ahead of that to
* prevent syslog from deleting initial spaces
*/
#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while (0)
static void print_device_short(device_t dev, int indent);
static void print_device(device_t dev, int indent);
void print_device_tree_short(device_t dev, int indent);
void print_device_tree(device_t dev, int indent);
static void print_driver_short(driver_t *driver, int indent);
static void print_driver(driver_t *driver, int indent);
static void print_driver_list(driver_list_t drivers, int indent);
static void print_devclass_short(devclass_t dc, int indent);
static void print_devclass(devclass_t dc, int indent);
void print_devclass_list_short(void);
void print_devclass_list(void);
#else
/* Make the compiler ignore the function calls */
#define PDEBUG(a) /* nop */
#define DEVICENAME(d) /* nop */
#define DRIVERNAME(d) /* nop */
#define DEVCLANAME(d) /* nop */
#define print_device_short(d,i) /* nop */
#define print_device(d,i) /* nop */
#define print_device_tree_short(d,i) /* nop */
#define print_device_tree(d,i) /* nop */
#define print_driver_short(d,i) /* nop */
#define print_driver(d,i) /* nop */
#define print_driver_list(d,i) /* nop */
#define print_devclass_short(d,i) /* nop */
#define print_devclass(d,i) /* nop */
#define print_devclass_list_short() /* nop */
#define print_devclass_list() /* nop */
#endif
/*
* dev sysctl tree
*/
enum {
DEVCLASS_SYSCTL_PARENT,
};
static int
devclass_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
devclass_t dc = (devclass_t)arg1;
const char *value;
switch (arg2) {
case DEVCLASS_SYSCTL_PARENT:
value = dc->parent ? dc->parent->name : "";
break;
default:
return (EINVAL);
}
return (SYSCTL_OUT(req, value, strlen(value)));
}
static void
devclass_sysctl_init(devclass_t dc)
{
if (dc->sysctl_tree != NULL)
return;
sysctl_ctx_init(&dc->sysctl_ctx);
dc->sysctl_tree = SYSCTL_ADD_NODE(&dc->sysctl_ctx,
SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, dc->name,
CTLFLAG_RD, NULL, "");
SYSCTL_ADD_PROC(&dc->sysctl_ctx, SYSCTL_CHILDREN(dc->sysctl_tree),
OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
dc, DEVCLASS_SYSCTL_PARENT, devclass_sysctl_handler, "A",
"parent class");
}
enum {
DEVICE_SYSCTL_DESC,
DEVICE_SYSCTL_DRIVER,
DEVICE_SYSCTL_LOCATION,
DEVICE_SYSCTL_PNPINFO,
DEVICE_SYSCTL_PARENT,
};
static int
device_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
device_t dev = (device_t)arg1;
const char *value;
char *buf;
int error;
buf = NULL;
switch (arg2) {
case DEVICE_SYSCTL_DESC:
value = dev->desc ? dev->desc : "";
break;
case DEVICE_SYSCTL_DRIVER:
value = dev->driver ? dev->driver->name : "";
break;
case DEVICE_SYSCTL_LOCATION:
value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
bus_child_location_str(dev, buf, 1024);
break;
case DEVICE_SYSCTL_PNPINFO:
value = buf = malloc(1024, M_BUS, M_WAITOK | M_ZERO);
bus_child_pnpinfo_str(dev, buf, 1024);
break;
case DEVICE_SYSCTL_PARENT:
value = dev->parent ? dev->parent->nameunit : "";
break;
default:
return (EINVAL);
}
error = SYSCTL_OUT(req, value, strlen(value));
if (buf != NULL)
free(buf, M_BUS);
return (error);
}
static void
device_sysctl_init(device_t dev)
{
devclass_t dc = dev->devclass;
if (dev->sysctl_tree != NULL)
return;
devclass_sysctl_init(dc);
sysctl_ctx_init(&dev->sysctl_ctx);
dev->sysctl_tree = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
SYSCTL_CHILDREN(dc->sysctl_tree), OID_AUTO,
dev->nameunit + strlen(dc->name),
CTLFLAG_RD, NULL, "");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%desc", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_DESC, device_sysctl_handler, "A",
"device description");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%driver", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_DRIVER, device_sysctl_handler, "A",
"device driver name");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%location", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_LOCATION, device_sysctl_handler, "A",
"device location relative to parent");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%pnpinfo", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_PNPINFO, device_sysctl_handler, "A",
"device identification");
SYSCTL_ADD_PROC(&dev->sysctl_ctx, SYSCTL_CHILDREN(dev->sysctl_tree),
OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A",
"parent device");
}
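/*
 * Illustrative sketch (not part of this file): the nodes registered above
 * appear under the "dev" sysctl tree as dev.<class>.<unit>.%desc, %driver,
 * %location, %pnpinfo and %parent.  "em" and unit 0 below are hypothetical;
 * substitute any attached device.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	char desc[128];
	size_t len = sizeof(desc);

	if (sysctlbyname("dev.em.0.%desc", desc, &len, NULL, 0) == 0)
		printf("%s\n", desc);
	return (0);
}
#endif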
static void
device_sysctl_update(device_t dev)
{
devclass_t dc = dev->devclass;
if (dev->sysctl_tree == NULL)
return;
sysctl_rename_oid(dev->sysctl_tree, dev->nameunit + strlen(dc->name));
}
static void
device_sysctl_fini(device_t dev)
{
if (dev->sysctl_tree == NULL)
return;
sysctl_ctx_free(&dev->sysctl_ctx);
dev->sysctl_tree = NULL;
}
/*
* /dev/devctl implementation
*/
/*
* This design allows only one reader for /dev/devctl. This is not desirable
* in the long run, but will get a lot of hair out of this implementation.
* Maybe we should make this device a clonable device.
*
* Also note: we specifically do not attach a device to the device_t tree
* to avoid potential chicken and egg problems. One could argue that all
* of this belongs to the root node. One could also further argue that the
* sysctl interface that we have now might more properly be an ioctl
* interface, but at this stage of the game, I'm not inclined to rock that
* boat.
*
* I'm also not sure that the SIGIO support is done correctly or not, as
* I copied it from a driver that had SIGIO support that likely hasn't been
* tested since 3.4 or 2.2.8!
*/
/* Deprecated way to adjust queue length */
static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS);
/* XXX Need to support old-style tunable "hw.bus.devctl_disable" */
SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW, NULL,
0, sysctl_devctl_disable, "I", "devctl disable -- deprecated");
#define DEVCTL_DEFAULT_QUEUE_LEN 1000
static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS);
static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
TUNABLE_INT("hw.bus.devctl_queue", &devctl_queue_length);
SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RW, NULL,
0, sysctl_devctl_queue, "I", "devctl queue length");
static d_open_t devopen;
static d_close_t devclose;
static d_read_t devread;
static d_ioctl_t devioctl;
static d_poll_t devpoll;
static struct cdevsw dev_cdevsw = {
.d_version = D_VERSION,
.d_flags = D_NEEDGIANT,
.d_open = devopen,
.d_close = devclose,
.d_read = devread,
.d_ioctl = devioctl,
.d_poll = devpoll,
.d_name = "devctl",
};
struct dev_event_info
{
char *dei_data;
TAILQ_ENTRY(dev_event_info) dei_link;
};
TAILQ_HEAD(devq, dev_event_info);
static struct dev_softc
{
int inuse;
int nonblock;
int queued;
struct mtx mtx;
struct cv cv;
struct selinfo sel;
struct devq devq;
struct proc *async_proc;
} devsoftc;
static struct cdev *devctl_dev;
static void
devinit(void)
{
devctl_dev = make_dev_credf(MAKEDEV_ETERNAL, &dev_cdevsw, 0, NULL,
UID_ROOT, GID_WHEEL, 0600, "devctl");
mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF);
cv_init(&devsoftc.cv, "dev cv");
TAILQ_INIT(&devsoftc.devq);
}
static int
devopen(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
if (devsoftc.inuse)
return (EBUSY);
/* move to init */
devsoftc.inuse = 1;
devsoftc.nonblock = 0;
devsoftc.async_proc = NULL;
return (0);
}
static int
devclose(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
devsoftc.inuse = 0;
mtx_lock(&devsoftc.mtx);
cv_broadcast(&devsoftc.cv);
mtx_unlock(&devsoftc.mtx);
devsoftc.async_proc = NULL;
return (0);
}
/*
* The read channel for this device is used to report changes to
* userland in realtime. We are required to free the data as well as
* the n1 object because we allocate them separately. Also note that
* we return one record at a time. If you try to read this device a
* character at a time, you will lose the rest of the data. Listening
* programs are expected to cope.
*/
static int
devread(struct cdev *dev, struct uio *uio, int ioflag)
{
struct dev_event_info *n1;
int rv;
mtx_lock(&devsoftc.mtx);
while (TAILQ_EMPTY(&devsoftc.devq)) {
if (devsoftc.nonblock) {
mtx_unlock(&devsoftc.mtx);
return (EAGAIN);
}
rv = cv_wait_sig(&devsoftc.cv, &devsoftc.mtx);
if (rv) {
/*
* Need to translate ERESTART to EINTR here? -- jake
*/
mtx_unlock(&devsoftc.mtx);
return (rv);
}
}
n1 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
devsoftc.queued--;
mtx_unlock(&devsoftc.mtx);
rv = uiomove(n1->dei_data, strlen(n1->dei_data), uio);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
return (rv);
}
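/*
 * Illustrative sketch (not part of this file): a minimal userland consumer of
 * /dev/devctl in the spirit of devd(8).  Each read() returns exactly one
 * event record, as noted above; a partial read loses the rest of the record.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[1025];
	ssize_t n;
	int fd;

	fd = open("/dev/devctl", O_RDONLY);	/* only one reader is allowed */
	if (fd == -1)
		return (1);
	while ((n = read(fd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);		/* one event per read() */
	}
	close(fd);
	return (0);
}
#endif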
static int
devioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
{
switch (cmd) {
case FIONBIO:
if (*(int*)data)
devsoftc.nonblock = 1;
else
devsoftc.nonblock = 0;
return (0);
case FIOASYNC:
if (*(int*)data)
devsoftc.async_proc = td->td_proc;
else
devsoftc.async_proc = NULL;
return (0);
/* (un)Support for other fcntl() calls. */
case FIOCLEX:
case FIONCLEX:
case FIONREAD:
case FIOSETOWN:
case FIOGETOWN:
default:
break;
}
return (ENOTTY);
}
static int
devpoll(struct cdev *dev, int events, struct thread *td)
{
int revents = 0;
mtx_lock(&devsoftc.mtx);
if (events & (POLLIN | POLLRDNORM)) {
if (!TAILQ_EMPTY(&devsoftc.devq))
revents = events & (POLLIN | POLLRDNORM);
else
selrecord(td, &devsoftc.sel);
}
mtx_unlock(&devsoftc.mtx);
return (revents);
}
/**
* @brief Return whether the userland process is running
*/
boolean_t
devctl_process_running(void)
{
return (devsoftc.inuse == 1);
}
/**
* @brief Queue data to be read from the devctl device
*
* Generic interface to queue data to the devctl device. It is
* assumed that @p data is properly formatted. It is further assumed
* that @p data is allocated using the M_BUS malloc type.
*/
void
devctl_queue_data_f(char *data, int flags)
{
struct dev_event_info *n1 = NULL, *n2 = NULL;
struct proc *p;
if (strlen(data) == 0)
goto out;
if (devctl_queue_length == 0)
goto out;
n1 = malloc(sizeof(*n1), M_BUS, flags);
if (n1 == NULL)
goto out;
n1->dei_data = data;
mtx_lock(&devsoftc.mtx);
if (devctl_queue_length == 0) {
mtx_unlock(&devsoftc.mtx);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
return;
}
/* Leave at least one spot in the queue... */
while (devsoftc.queued > devctl_queue_length - 1) {
n2 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n2, dei_link);
free(n2->dei_data, M_BUS);
free(n2, M_BUS);
devsoftc.queued--;
}
TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link);
devsoftc.queued++;
cv_broadcast(&devsoftc.cv);
mtx_unlock(&devsoftc.mtx);
selwakeup(&devsoftc.sel);
p = devsoftc.async_proc;
if (p != NULL) {
PROC_LOCK(p);
- psignal(p, SIGIO);
+ kern_psignal(p, SIGIO);
PROC_UNLOCK(p);
}
return;
out:
/*
* We have to free data on all error paths since the caller
* assumes it will be free'd when this item is dequeued.
*/
free(data, M_BUS);
return;
}
void
devctl_queue_data(char *data)
{
devctl_queue_data_f(data, M_NOWAIT);
}
/**
* @brief Send a 'notification' to userland, using standard ways
*/
void
devctl_notify_f(const char *system, const char *subsystem, const char *type,
const char *data, int flags)
{
int len = 0;
char *msg;
if (system == NULL)
return; /* BOGUS! Must specify system. */
if (subsystem == NULL)
return; /* BOGUS! Must specify subsystem. */
if (type == NULL)
return; /* BOGUS! Must specify type. */
len += strlen(" system=") + strlen(system);
len += strlen(" subsystem=") + strlen(subsystem);
len += strlen(" type=") + strlen(type);
/* add in the data message plus newline. */
if (data != NULL)
len += strlen(data);
len += 3; /* '!', '\n', and NUL */
msg = malloc(len, M_BUS, flags);
if (msg == NULL)
return; /* Drop it on the floor */
if (data != NULL)
snprintf(msg, len, "!system=%s subsystem=%s type=%s %s\n",
system, subsystem, type, data);
else
snprintf(msg, len, "!system=%s subsystem=%s type=%s\n",
system, subsystem, type);
devctl_queue_data_f(msg, flags);
}
void
devctl_notify(const char *system, const char *subsystem, const char *type,
const char *data)
{
devctl_notify_f(system, subsystem, type, data, M_NOWAIT);
}
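/*
 * Illustrative only (not part of this file): a driver announcing an event.
 * The strings are made-up examples; given the format above, the line queued
 * to /dev/devctl would read
 * "!system=EXAMPLE subsystem=port type=overrun unit=3".
 */
#if 0
	devctl_notify("EXAMPLE", "port", "overrun", "unit=3");
#endif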
/*
* Common routine that tries to make sending messages as easy as possible.
* We allocate memory for the data, copy strings into that, but do not
* free it unless there's an error. The dequeue part of the driver should
* free the data. We don't send data when the device is disabled. We do
* send data, even when we have no listeners, because we wish to avoid
* races relating to startup and restart of listening applications.
*
* devaddq is designed to string together the type of event, with the
* object of that event, plus the plug and play info and location info
* for that event. This is likely most useful for devices, but less
* useful for other consumers of this interface. Those should use
* the devctl_queue_data() interface instead.
*/
static void
devaddq(const char *type, const char *what, device_t dev)
{
char *data = NULL;
char *loc = NULL;
char *pnp = NULL;
const char *parstr;
if (!devctl_queue_length)/* Rare race, but lost races safely discard */
return;
data = malloc(1024, M_BUS, M_NOWAIT);
if (data == NULL)
goto bad;
/* get the bus specific location of this device */
loc = malloc(1024, M_BUS, M_NOWAIT);
if (loc == NULL)
goto bad;
*loc = '\0';
bus_child_location_str(dev, loc, 1024);
/* Get the bus specific pnp info of this device */
pnp = malloc(1024, M_BUS, M_NOWAIT);
if (pnp == NULL)
goto bad;
*pnp = '\0';
bus_child_pnpinfo_str(dev, pnp, 1024);
/* Get the parent of this device, or / if high enough in the tree. */
if (device_get_parent(dev) == NULL)
parstr = "."; /* Or '/' ? */
else
parstr = device_get_nameunit(device_get_parent(dev));
/* String it all together. */
snprintf(data, 1024, "%s%s at %s %s on %s\n", type, what, loc, pnp,
parstr);
free(loc, M_BUS);
free(pnp, M_BUS);
devctl_queue_data(data);
return;
bad:
free(pnp, M_BUS);
free(loc, M_BUS);
free(data, M_BUS);
return;
}
/*
* A device was added to the tree. We are called just after it successfully
* attaches (that is, probe and attach success for this device). No call
* is made if a device is merely parented into the tree. See devnomatch
* if probe fails. If attach fails, no notification is sent (but maybe
* we should have a different message for this).
*/
static void
devadded(device_t dev)
{
devaddq("+", device_get_nameunit(dev), dev);
}
/*
* A device was removed from the tree. We are called just before this
* happens.
*/
static void
devremoved(device_t dev)
{
devaddq("-", device_get_nameunit(dev), dev);
}
/*
* Called when there's no match for this device. This is only called
* the first time that no match happens, so we don't keep getting this
* message. Should that prove to be undesirable, we can change it.
* This is called when all drivers that can attach to a given bus
* decline to accept this device. Other errors may not be detected.
*/
static void
devnomatch(device_t dev)
{
devaddq("?", "", dev);
}
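/*
 * For reference, the three helpers above queue strings in the devaddq()
 * format "<type><what> at <location> <pnpinfo> on <parent>\n", e.g. (with
 * hypothetical names):
 *	+foo0 at <location> <pnpinfo> on bar0	(attach)
 *	-foo0 at <location> <pnpinfo> on bar0	(detach)
 *	? at <location> <pnpinfo> on bar0	(no driver matched)
 */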
static int
sysctl_devctl_disable(SYSCTL_HANDLER_ARGS)
{
struct dev_event_info *n1;
int dis, error;
dis = devctl_queue_length == 0;
error = sysctl_handle_int(oidp, &dis, 0, req);
if (error || !req->newptr)
return (error);
mtx_lock(&devsoftc.mtx);
if (dis) {
while (!TAILQ_EMPTY(&devsoftc.devq)) {
n1 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
}
devsoftc.queued = 0;
devctl_queue_length = 0;
} else {
devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
}
mtx_unlock(&devsoftc.mtx);
return (0);
}
static int
sysctl_devctl_queue(SYSCTL_HANDLER_ARGS)
{
struct dev_event_info *n1;
int q, error;
q = devctl_queue_length;
error = sysctl_handle_int(oidp, &q, 0, req);
if (error || !req->newptr)
return (error);
if (q < 0)
return (EINVAL);
mtx_lock(&devsoftc.mtx);
devctl_queue_length = q;
while (devsoftc.queued > devctl_queue_length) {
n1 = TAILQ_FIRST(&devsoftc.devq);
TAILQ_REMOVE(&devsoftc.devq, n1, dei_link);
free(n1->dei_data, M_BUS);
free(n1, M_BUS);
devsoftc.queued--;
}
mtx_unlock(&devsoftc.mtx);
return (0);
}
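/*
 * Illustrative usage from userland (nodes as defined above):
 *	sysctl hw.bus.devctl_queue=0	# drain the queue and disable queueing
 *	sysctl hw.bus.devctl_queue=1000	# restore the default length
 * The deprecated hw.bus.devctl_disable handler maps 1/0 onto the same two
 * states.
 */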
/* End of /dev/devctl code */
static TAILQ_HEAD(,device) bus_data_devices;
static int bus_data_generation = 1;
static kobj_method_t null_methods[] = {
KOBJMETHOD_END
};
DEFINE_CLASS(null, null_methods, 0);
/*
* Bus pass implementation
*/
static driver_list_t passes = TAILQ_HEAD_INITIALIZER(passes);
int bus_current_pass = BUS_PASS_ROOT;
/**
* @internal
* @brief Register the pass level of a new driver attachment
*
* Register a new driver attachment's pass level. If no driver
* attachment with the same pass level has been added, then @p new
* will be added to the global passes list.
*
* @param new the new driver attachment
*/
static void
driver_register_pass(struct driverlink *new)
{
struct driverlink *dl;
/* We only consider pass numbers during boot. */
if (bus_current_pass == BUS_PASS_DEFAULT)
return;
/*
* Walk the passes list. If we already know about this pass
* then there is nothing to do. If we don't, then insert this
* driver link into the list.
*/
TAILQ_FOREACH(dl, &passes, passlink) {
if (dl->pass < new->pass)
continue;
if (dl->pass == new->pass)
return;
TAILQ_INSERT_BEFORE(dl, new, passlink);
return;
}
TAILQ_INSERT_TAIL(&passes, new, passlink);
}
/**
* @brief Raise the current bus pass
*
* Raise the current bus pass level to @p pass. Call the BUS_NEW_PASS()
* method on the root bus to kick off a new device tree scan for each
* new pass level that has at least one driver.
*/
void
bus_set_pass(int pass)
{
struct driverlink *dl;
if (bus_current_pass > pass)
panic("Attempt to lower bus pass level");
TAILQ_FOREACH(dl, &passes, passlink) {
/* Skip pass values below the current pass level. */
if (dl->pass <= bus_current_pass)
continue;
/*
* Bail once we hit a driver with a pass level that is
* too high.
*/
if (dl->pass > pass)
break;
/*
* Raise the pass level to the next level and rescan
* the tree.
*/
bus_current_pass = dl->pass;
BUS_NEW_PASS(root_bus);
}
/*
* If there isn't a driver registered for the requested pass,
* then bus_current_pass might still be less than 'pass'. Set
* it to 'pass' in that case.
*/
if (bus_current_pass < pass)
bus_current_pass = pass;
KASSERT(bus_current_pass == pass, ("Failed to update bus pass level"));
}
/*
* Devclass implementation
*/
static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses);
/**
* @internal
* @brief Find or create a device class
*
* If a device class with the name @p classname exists, return it,
* otherwise if @p create is non-zero create and return a new device
* class.
*
* If @p parentname is non-NULL, the parent of the devclass is set to
* the devclass of that name.
*
* @param classname the devclass name to find or create
* @param parentname the parent devclass name or @c NULL
* @param create non-zero to create a devclass
*/
static devclass_t
devclass_find_internal(const char *classname, const char *parentname,
int create)
{
devclass_t dc;
PDEBUG(("looking for %s", classname));
if (!classname)
return (NULL);
TAILQ_FOREACH(dc, &devclasses, link) {
if (!strcmp(dc->name, classname))
break;
}
if (create && !dc) {
PDEBUG(("creating %s", classname));
dc = malloc(sizeof(struct devclass) + strlen(classname) + 1,
M_BUS, M_NOWAIT | M_ZERO);
if (!dc)
return (NULL);
dc->parent = NULL;
dc->name = (char*) (dc + 1);
strcpy(dc->name, classname);
TAILQ_INIT(&dc->drivers);
TAILQ_INSERT_TAIL(&devclasses, dc, link);
bus_data_generation_update();
}
/*
* If a parent class is specified, then set that as our parent so
* that this devclass will support drivers for the parent class as
* well. If the parent class has the same name don't do this though
* as it creates a cycle that can trigger an infinite loop in
* device_probe_child() if a device exists for which there is no
* suitable driver.
*/
if (parentname && dc && !dc->parent &&
strcmp(classname, parentname) != 0) {
dc->parent = devclass_find_internal(parentname, NULL, TRUE);
dc->parent->flags |= DC_HAS_CHILDREN;
}
return (dc);
}
/**
* @brief Create a device class
*
* If a device class with the name @p classname exists, return it,
* otherwise create and return a new device class.
*
* @param classname the devclass name to find or create
*/
devclass_t
devclass_create(const char *classname)
{
return (devclass_find_internal(classname, NULL, TRUE));
}
/**
* @brief Find a device class
*
* If a device class with the name @p classname exists, return it,
* otherwise return @c NULL.
*
* @param classname the devclass name to find
*/
devclass_t
devclass_find(const char *classname)
{
return (devclass_find_internal(classname, NULL, FALSE));
}
/**
* @brief Register that a device driver has been added to a devclass
*
* Register that a device driver has been added to a devclass. This
* is called by devclass_add_driver to accomplish the recursive
* notification of all the children classes of dc, as well as dc.
* Each layer will have BUS_DRIVER_ADDED() called for all instances of
* the devclass.
*
* We do a full search here of the devclass list at each iteration
* level to save storing children-lists in the devclass structure. If
* we ever move beyond a few dozen devices doing this, we may need to
* reevaluate...
*
* @param dc the devclass to edit
* @param driver the driver that was just added
*/
static void
devclass_driver_added(devclass_t dc, driver_t *driver)
{
devclass_t parent;
int i;
/*
* Call BUS_DRIVER_ADDED for any existing busses in this class.
*/
for (i = 0; i < dc->maxunit; i++)
if (dc->devices[i] && device_is_attached(dc->devices[i]))
BUS_DRIVER_ADDED(dc->devices[i], driver);
/*
* Walk through the children classes. Since we only keep a
* single parent pointer around, we walk the entire list of
* devclasses looking for children. We set the
* DC_HAS_CHILDREN flag when a child devclass is created on
* the parent, so we only walk the list for those devclasses
* that have children.
*/
if (!(dc->flags & DC_HAS_CHILDREN))
return;
parent = dc;
TAILQ_FOREACH(dc, &devclasses, link) {
if (dc->parent == parent)
devclass_driver_added(dc, driver);
}
}
/**
* @brief Add a device driver to a device class
*
* Add a device driver to a devclass. This is normally called
* automatically by DRIVER_MODULE(). The BUS_DRIVER_ADDED() method of
* all devices in the devclass will be called to allow them to attempt
* to re-probe any unmatched children.
*
* @param dc the devclass to edit
* @param driver the driver to register
*/
int
devclass_add_driver(devclass_t dc, driver_t *driver, int pass, devclass_t *dcp)
{
driverlink_t dl;
const char *parentname;
PDEBUG(("%s", DRIVERNAME(driver)));
/* Don't allow invalid pass values. */
if (pass <= BUS_PASS_ROOT)
return (EINVAL);
dl = malloc(sizeof *dl, M_BUS, M_NOWAIT|M_ZERO);
if (!dl)
return (ENOMEM);
/*
* Compile the driver's methods. Also increase the reference count
* so that the class doesn't get freed when the last instance
* goes. This means we can safely use static methods and avoids a
* double-free in devclass_delete_driver.
*/
kobj_class_compile((kobj_class_t) driver);
/*
* If the driver has any base classes, make the
* devclass inherit from the devclass of the driver's
* first base class. This will allow the system to
* search for drivers in both devclasses for children
* of a device using this driver.
*/
if (driver->baseclasses)
parentname = driver->baseclasses[0]->name;
else
parentname = NULL;
*dcp = devclass_find_internal(driver->name, parentname, TRUE);
dl->driver = driver;
TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
driver->refs++; /* XXX: kobj_mtx */
dl->pass = pass;
driver_register_pass(dl);
devclass_driver_added(dc, driver);
bus_data_generation_update();
return (0);
}
/**
* @brief Register that a device driver has been deleted from a devclass
*
* Register that a device driver has been removed from a devclass.
* This is called by devclass_delete_driver to accomplish the
* recursive notification of all the children classes of busclass, as
* well as busclass. Each layer will attempt to detach the driver
* from any devices that are children of the bus's devclass. The function
* will return an error if a device fails to detach.
*
* We do a full search here of the devclass list at each iteration
* level to save storing children-lists in the devclass structure. If
* we ever move beyond a few dozen devices doing this, we may need to
* reevaluate...
*
* @param busclass the devclass of the parent bus
* @param dc the devclass of the driver being deleted
* @param driver the driver being deleted
*/
static int
devclass_driver_deleted(devclass_t busclass, devclass_t dc, driver_t *driver)
{
devclass_t parent;
device_t dev;
int error, i;
/*
* Disassociate from any devices. We iterate through all the
* devices in the devclass of the driver and detach any which are
* using the driver and which have a parent in the devclass which
* we are deleting from.
*
* Note that since a driver can be in multiple devclasses, we
* should not detach devices which are not children of devices in
* the affected devclass.
*/
for (i = 0; i < dc->maxunit; i++) {
if (dc->devices[i]) {
dev = dc->devices[i];
if (dev->driver == driver && dev->parent &&
dev->parent->devclass == busclass) {
if ((error = device_detach(dev)) != 0)
return (error);
device_set_driver(dev, NULL);
BUS_PROBE_NOMATCH(dev->parent, dev);
devnomatch(dev);
dev->flags |= DF_DONENOMATCH;
}
}
}
/*
* Walk through the children classes. Since we only keep a
* single parent pointer around, we walk the entire list of
* devclasses looking for children. We set the
* DC_HAS_CHILDREN flag when a child devclass is created on
* the parent, so we only walk the list for those devclasses
* that have children.
*/
if (!(busclass->flags & DC_HAS_CHILDREN))
return (0);
parent = busclass;
TAILQ_FOREACH(busclass, &devclasses, link) {
if (busclass->parent == parent) {
error = devclass_driver_deleted(busclass, dc, driver);
if (error)
return (error);
}
}
return (0);
}
/**
* @brief Delete a device driver from a device class
*
* Delete a device driver from a devclass. This is normally called
* automatically by DRIVER_MODULE().
*
* If the driver is currently attached to any devices,
* devclass_delete_driver() will first attempt to detach from each
* device. If one of the detach calls fails, the driver will not be
* deleted.
*
* @param dc the devclass to edit
* @param driver the driver to unregister
*/
int
devclass_delete_driver(devclass_t busclass, driver_t *driver)
{
devclass_t dc = devclass_find(driver->name);
driverlink_t dl;
int error;
PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
if (!dc)
return (0);
/*
* Find the link structure in the bus' list of drivers.
*/
TAILQ_FOREACH(dl, &busclass->drivers, link) {
if (dl->driver == driver)
break;
}
if (!dl) {
PDEBUG(("%s not found in %s list", driver->name,
busclass->name));
return (ENOENT);
}
error = devclass_driver_deleted(busclass, dc, driver);
if (error != 0)
return (error);
TAILQ_REMOVE(&busclass->drivers, dl, link);
free(dl, M_BUS);
/* XXX: kobj_mtx */
driver->refs--;
if (driver->refs == 0)
kobj_class_free((kobj_class_t) driver);
bus_data_generation_update();
return (0);
}
/**
* @brief Quiesces a set of device drivers from a device class
*
* Quiesce a device driver from a devclass. This is normally called
* automatically by DRIVER_MODULE().
*
* If the driver is currently attached to any devices,
* devclass_quiesce_driver() will first attempt to quiesce each
* device.
*
* @param dc the devclass to edit
* @param driver the driver to unregister
*/
static int
devclass_quiesce_driver(devclass_t busclass, driver_t *driver)
{
devclass_t dc = devclass_find(driver->name);
driverlink_t dl;
device_t dev;
int i;
int error;
PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass)));
if (!dc)
return (0);
/*
* Find the link structure in the bus' list of drivers.
*/
TAILQ_FOREACH(dl, &busclass->drivers, link) {
if (dl->driver == driver)
break;
}
if (!dl) {
PDEBUG(("%s not found in %s list", driver->name,
busclass->name));
return (ENOENT);
}
/*
* Quiesce all devices. We iterate through all the devices in
* the devclass of the driver and quiesce any which are using
* the driver and which have a parent in the devclass which we
* are quiescing.
*
* Note that since a driver can be in multiple devclasses, we
* should not quiesce devices which are not children of
* devices in the affected devclass.
*/
for (i = 0; i < dc->maxunit; i++) {
if (dc->devices[i]) {
dev = dc->devices[i];
if (dev->driver == driver && dev->parent &&
dev->parent->devclass == busclass) {
if ((error = device_quiesce(dev)) != 0)
return (error);
}
}
}
return (0);
}
/**
* @internal
*/
static driverlink_t
devclass_find_driver_internal(devclass_t dc, const char *classname)
{
driverlink_t dl;
PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc)));
TAILQ_FOREACH(dl, &dc->drivers, link) {
if (!strcmp(dl->driver->name, classname))
return (dl);
}
PDEBUG(("not found"));
return (NULL);
}
/**
* @brief Return the name of the devclass
*/
const char *
devclass_get_name(devclass_t dc)
{
return (dc->name);
}
/**
* @brief Find a device given a unit number
*
* @param dc the devclass to search
* @param unit the unit number to search for
*
* @returns the device with the given unit number or @c
* NULL if there is no such device
*/
device_t
devclass_get_device(devclass_t dc, int unit)
{
if (dc == NULL || unit < 0 || unit >= dc->maxunit)
return (NULL);
return (dc->devices[unit]);
}
/**
* @brief Find the softc field of a device given a unit number
*
* @param dc the devclass to search
* @param unit the unit number to search for
*
* @returns the softc field of the device with the given
* unit number or @c NULL if there is no such
* device
*/
void *
devclass_get_softc(devclass_t dc, int unit)
{
device_t dev;
dev = devclass_get_device(dc, unit);
if (!dev)
return (NULL);
return (device_get_softc(dev));
}
/**
* @brief Get a list of devices in the devclass
*
* An array containing a list of all the devices in the given devclass
* is allocated and returned in @p *devlistp. The number of devices
* in the array is returned in @p *devcountp. The caller should free
* the array using @c free(p, M_TEMP), even if @p *devcountp is 0.
*
* @param dc the devclass to examine
* @param devlistp points at location for array pointer return
* value
* @param devcountp points at location for array size return value
*
* @retval 0 success
* @retval ENOMEM the array allocation failed
*/
int
devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp)
{
int count, i;
device_t *list;
count = devclass_get_count(dc);
list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
if (!list)
return (ENOMEM);
count = 0;
for (i = 0; i < dc->maxunit; i++) {
if (dc->devices[i]) {
list[count] = dc->devices[i];
count++;
}
}
*devlistp = list;
*devcountp = count;
return (0);
}
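/*
 * Illustrative sketch (not part of this file): walking every device in a
 * devclass.  "foo" is a hypothetical devclass name; note the M_TEMP free,
 * as documented above.
 */
#if 0
static void
example_walk_devclass(void)
{
	devclass_t dc;
	device_t *devs;
	int devcount, i;

	dc = devclass_find("foo");
	if (dc == NULL || devclass_get_devices(dc, &devs, &devcount) != 0)
		return;
	for (i = 0; i < devcount; i++)
		device_printf(devs[i], "unit %d\n", device_get_unit(devs[i]));
	free(devs, M_TEMP);
}
#endif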
/**
* @brief Get a list of drivers in the devclass
*
* An array containing a list of pointers to all the drivers in the
* given devclass is allocated and returned in @p *listp. The number
* of drivers in the array is returned in @p *countp. The caller should
* free the array using @c free(p, M_TEMP).
*
* @param dc the devclass to examine
* @param listp gives location for array pointer return value
* @param countp gives location for number of array elements
* return value
*
* @retval 0 success
* @retval ENOMEM the array allocation failed
*/
int
devclass_get_drivers(devclass_t dc, driver_t ***listp, int *countp)
{
driverlink_t dl;
driver_t **list;
int count;
count = 0;
TAILQ_FOREACH(dl, &dc->drivers, link)
count++;
list = malloc(count * sizeof(driver_t *), M_TEMP, M_NOWAIT);
if (list == NULL)
return (ENOMEM);
count = 0;
TAILQ_FOREACH(dl, &dc->drivers, link) {
list[count] = dl->driver;
count++;
}
*listp = list;
*countp = count;
return (0);
}
/**
* @brief Get the number of devices in a devclass
*
* @param dc the devclass to examine
*/
int
devclass_get_count(devclass_t dc)
{
int count, i;
count = 0;
for (i = 0; i < dc->maxunit; i++)
if (dc->devices[i])
count++;
return (count);
}
/**
* @brief Get the maximum unit number used in a devclass
*
* Note that this is one greater than the highest currently-allocated
* unit. If a null devclass_t is passed in, -1 is returned to indicate
* that not even the devclass has been allocated yet.
*
* @param dc the devclass to examine
*/
int
devclass_get_maxunit(devclass_t dc)
{
if (dc == NULL)
return (-1);
return (dc->maxunit);
}
/**
* @brief Find a free unit number in a devclass
*
* This function searches for the first unused unit number greater
* than or equal to @p unit.
*
* @param dc the devclass to examine
* @param unit the first unit number to check
*/
int
devclass_find_free_unit(devclass_t dc, int unit)
{
if (dc == NULL)
return (unit);
while (unit < dc->maxunit && dc->devices[unit] != NULL)
unit++;
return (unit);
}
/**
* @brief Set the parent of a devclass
*
* The parent class is normally initialised automatically by
* DRIVER_MODULE().
*
* @param dc the devclass to edit
* @param pdc the new parent devclass
*/
void
devclass_set_parent(devclass_t dc, devclass_t pdc)
{
dc->parent = pdc;
}
/**
* @brief Get the parent of a devclass
*
* @param dc the devclass to examine
*/
devclass_t
devclass_get_parent(devclass_t dc)
{
return (dc->parent);
}
struct sysctl_ctx_list *
devclass_get_sysctl_ctx(devclass_t dc)
{
return (&dc->sysctl_ctx);
}
struct sysctl_oid *
devclass_get_sysctl_tree(devclass_t dc)
{
return (dc->sysctl_tree);
}
/**
* @internal
* @brief Allocate a unit number
*
* On entry, @p *unitp is the desired unit number (or @c -1 if any
* will do). The allocated unit number is returned in @p *unitp.
* @param dc the devclass to allocate from
* @param unitp points at the location for the allocated unit
* number
*
* @retval 0 success
* @retval EEXIST the requested unit number is already allocated
* @retval ENOMEM memory allocation failure
*/
static int
devclass_alloc_unit(devclass_t dc, device_t dev, int *unitp)
{
const char *s;
int unit = *unitp;
PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc)));
/* Ask the parent bus if it wants to wire this device. */
if (unit == -1)
BUS_HINT_DEVICE_UNIT(device_get_parent(dev), dev, dc->name,
&unit);
/* If we were given a wired unit number, check for existing device */
/* XXX imp XXX */
if (unit != -1) {
if (unit >= 0 && unit < dc->maxunit &&
dc->devices[unit] != NULL) {
if (bootverbose)
printf("%s: %s%d already exists; skipping it\n",
dc->name, dc->name, *unitp);
return (EEXIST);
}
} else {
/* Unwired device, find the next available slot for it */
unit = 0;
for (unit = 0;; unit++) {
/* If there is an "at" hint for a unit then skip it. */
if (resource_string_value(dc->name, unit, "at", &s) ==
0)
continue;
/* If this device slot is already in use, skip it. */
if (unit < dc->maxunit && dc->devices[unit] != NULL)
continue;
break;
}
}
/*
* We've selected a unit beyond the length of the table, so let's
* extend the table to make room for all units up to and including
* this one.
*/
if (unit >= dc->maxunit) {
device_t *newlist, *oldlist;
int newsize;
oldlist = dc->devices;
newsize = roundup((unit + 1), MINALLOCSIZE / sizeof(device_t));
newlist = malloc(sizeof(device_t) * newsize, M_BUS, M_NOWAIT);
if (!newlist)
return (ENOMEM);
if (oldlist != NULL)
bcopy(oldlist, newlist, sizeof(device_t) * dc->maxunit);
bzero(newlist + dc->maxunit,
sizeof(device_t) * (newsize - dc->maxunit));
dc->devices = newlist;
dc->maxunit = newsize;
if (oldlist != NULL)
free(oldlist, M_BUS);
}
PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc)));
*unitp = unit;
return (0);
}
/**
* @internal
* @brief Add a device to a devclass
*
* A unit number is allocated for the device (using the device's
* preferred unit number if any) and the device is registered in the
* devclass. This allows the device to be looked up by its unit
* number, e.g. by decoding a dev_t minor number.
*
* @param dc the devclass to add to
* @param dev the device to add
*
* @retval 0 success
* @retval EEXIST the requested unit number is already allocated
* @retval ENOMEM memory allocation failure
*/
static int
devclass_add_device(devclass_t dc, device_t dev)
{
int buflen, error;
PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
buflen = snprintf(NULL, 0, "%s%d$", dc->name, INT_MAX);
if (buflen < 0)
return (ENOMEM);
dev->nameunit = malloc(buflen, M_BUS, M_NOWAIT|M_ZERO);
if (!dev->nameunit)
return (ENOMEM);
if ((error = devclass_alloc_unit(dc, dev, &dev->unit)) != 0) {
free(dev->nameunit, M_BUS);
dev->nameunit = NULL;
return (error);
}
dc->devices[dev->unit] = dev;
dev->devclass = dc;
snprintf(dev->nameunit, buflen, "%s%d", dc->name, dev->unit);
return (0);
}
/**
* @internal
* @brief Delete a device from a devclass
*
* The device is removed from the devclass's device list and its unit
* number is freed.
* @param dc the devclass to delete from
* @param dev the device to delete
*
* @retval 0 success
*/
static int
devclass_delete_device(devclass_t dc, device_t dev)
{
if (!dc || !dev)
return (0);
PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc)));
if (dev->devclass != dc || dc->devices[dev->unit] != dev)
panic("devclass_delete_device: inconsistent device class");
dc->devices[dev->unit] = NULL;
if (dev->flags & DF_WILDCARD)
dev->unit = -1;
dev->devclass = NULL;
free(dev->nameunit, M_BUS);
dev->nameunit = NULL;
return (0);
}
/**
* @internal
* @brief Make a new device and add it as a child of @p parent
*
* @param parent the parent of the new device
* @param name the devclass name of the new device or @c NULL
* to leave the devclass unspecified
* @param unit the unit number of the new device or @c -1 to
* leave the unit number unspecified
*
* @returns the new device
*/
static device_t
make_device(device_t parent, const char *name, int unit)
{
device_t dev;
devclass_t dc;
PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
if (name) {
dc = devclass_find_internal(name, NULL, TRUE);
if (!dc) {
printf("make_device: can't find device class %s\n",
name);
return (NULL);
}
} else {
dc = NULL;
}
dev = malloc(sizeof(struct device), M_BUS, M_NOWAIT|M_ZERO);
if (!dev)
return (NULL);
dev->parent = parent;
TAILQ_INIT(&dev->children);
kobj_init((kobj_t) dev, &null_class);
dev->driver = NULL;
dev->devclass = NULL;
dev->unit = unit;
dev->nameunit = NULL;
dev->desc = NULL;
dev->busy = 0;
dev->devflags = 0;
dev->flags = DF_ENABLED;
dev->order = 0;
if (unit == -1)
dev->flags |= DF_WILDCARD;
if (name) {
dev->flags |= DF_FIXEDCLASS;
if (devclass_add_device(dc, dev)) {
kobj_delete((kobj_t) dev, M_BUS);
return (NULL);
}
}
dev->ivars = NULL;
dev->softc = NULL;
dev->state = DS_NOTPRESENT;
TAILQ_INSERT_TAIL(&bus_data_devices, dev, devlink);
bus_data_generation_update();
return (dev);
}
/**
* @internal
* @brief Print a description of a device.
*/
static int
device_print_child(device_t dev, device_t child)
{
int retval = 0;
if (device_is_alive(child))
retval += BUS_PRINT_CHILD(dev, child);
else
retval += device_printf(child, " not found\n");
return (retval);
}
/**
* @brief Create a new device
*
* This creates a new device and adds it as a child of an existing
* parent device. The new device will be added after the last existing
* child with order zero.
*
* @param dev the device which will be the parent of the
* new child device
* @param name devclass name for new device or @c NULL if not
* specified
* @param unit unit number for new device or @c -1 if not
* specified
*
* @returns the new device
*/
device_t
device_add_child(device_t dev, const char *name, int unit)
{
return (device_add_child_ordered(dev, 0, name, unit));
}
/**
* @brief Create a new device
*
* This creates a new device and adds it as a child of an existing
* parent device. The new device will be added after the last existing
* child with the same order.
*
* @param dev the device which will be the parent of the
* new child device
* @param order a value which is used to partially sort the
* children of @p dev - devices created using
* lower values of @p order appear first in @p
* dev's list of children
* @param name devclass name for new device or @c NULL if not
* specified
* @param unit unit number for new device or @c -1 if not
* specified
*
* @returns the new device
*/
device_t
device_add_child_ordered(device_t dev, u_int order, const char *name, int unit)
{
device_t child;
device_t place;
PDEBUG(("%s at %s with order %u as unit %d",
name, DEVICENAME(dev), order, unit));
child = make_device(dev, name, unit);
if (child == NULL)
return (child);
child->order = order;
TAILQ_FOREACH(place, &dev->children, link) {
if (place->order > order)
break;
}
if (place) {
/*
* The device 'place' is the first device whose order is
* greater than the new child.
*/
TAILQ_INSERT_BEFORE(place, child, link);
} else {
/*
* The new child's order is greater than or equal to the order of
* any existing device. Add the child to the tail of the list.
*/
TAILQ_INSERT_TAIL(&dev->children, child, link);
}
bus_data_generation_update();
return (child);
}
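/*
 * Illustrative sketch (not part of this file): a bus attach routine adding
 * ordered children.  The devclass names are hypothetical; a unit of -1
 * requests a wildcard unit, and children with lower orders appear first in
 * the parent's child list, as described above.
 */
#if 0
static int
examplebus_attach(device_t dev)
{

	device_add_child_ordered(dev, 0, "early", 0);
	device_add_child_ordered(dev, 10, "late", -1);
	return (bus_generic_attach(dev));
}
#endif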
/**
* @brief Delete a device
*
* This function deletes a device along with all of its children. If
* the device currently has a driver attached to it, the device is
* detached first using device_detach().
*
* @param dev the parent device
* @param child the device to delete
*
* @retval 0 success
* @retval non-zero a unit error code describing the error
*/
int
device_delete_child(device_t dev, device_t child)
{
int error;
device_t grandchild;
PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
/* remove children first */
while ( (grandchild = TAILQ_FIRST(&child->children)) ) {
error = device_delete_child(child, grandchild);
if (error)
return (error);
}
if ((error = device_detach(child)) != 0)
return (error);
if (child->devclass)
devclass_delete_device(child->devclass, child);
TAILQ_REMOVE(&dev->children, child, link);
TAILQ_REMOVE(&bus_data_devices, child, devlink);
kobj_delete((kobj_t) child, M_BUS);
bus_data_generation_update();
return (0);
}
/**
* @brief Find a device given a unit number
*
* This is similar to devclass_get_devices() but only searches for
* devices which have @p dev as a parent.
*
* @param dev the parent device to search
* @param unit the unit number to search for. If the unit is -1,
* return the first child of @p dev which has name
* @p classname (that is, the one with the lowest unit.)
*
* @returns the device with the given unit number or @c
* NULL if there is no such device
*/
device_t
device_find_child(device_t dev, const char *classname, int unit)
{
devclass_t dc;
device_t child;
dc = devclass_find(classname);
if (!dc)
return (NULL);
if (unit != -1) {
child = devclass_get_device(dc, unit);
if (child && child->parent == dev)
return (child);
} else {
for (unit = 0; unit < devclass_get_maxunit(dc); unit++) {
child = devclass_get_device(dc, unit);
if (child && child->parent == dev)
return (child);
}
}
return (NULL);
}
/**
* @internal
*/
static driverlink_t
first_matching_driver(devclass_t dc, device_t dev)
{
if (dev->devclass)
return (devclass_find_driver_internal(dc, dev->devclass->name));
return (TAILQ_FIRST(&dc->drivers));
}
/**
* @internal
*/
static driverlink_t
next_matching_driver(devclass_t dc, device_t dev, driverlink_t last)
{
if (dev->devclass) {
driverlink_t dl;
for (dl = TAILQ_NEXT(last, link); dl; dl = TAILQ_NEXT(dl, link))
if (!strcmp(dev->devclass->name, dl->driver->name))
return (dl);
return (NULL);
}
return (TAILQ_NEXT(last, link));
}
/**
* @internal
*/
int
device_probe_child(device_t dev, device_t child)
{
devclass_t dc;
driverlink_t best = NULL;
driverlink_t dl;
int result, pri = 0;
int hasclass = (child->devclass != NULL);
GIANT_REQUIRED;
dc = dev->devclass;
if (!dc)
panic("device_probe_child: parent device has no devclass");
/*
* If the state is already probed, then return. However, don't
* return if we can rebid this object.
*/
if (child->state == DS_ALIVE && (child->flags & DF_REBID) == 0)
return (0);
for (; dc; dc = dc->parent) {
for (dl = first_matching_driver(dc, child);
dl;
dl = next_matching_driver(dc, child, dl)) {
/* If this driver's pass is too high, then ignore it. */
if (dl->pass > bus_current_pass)
continue;
PDEBUG(("Trying %s", DRIVERNAME(dl->driver)));
device_set_driver(child, dl->driver);
if (!hasclass) {
if (device_set_devclass(child, dl->driver->name)) {
printf("driver bug: Unable to set devclass (devname: %s)\n",
(child ? device_get_name(child) :
"no device"));
device_set_driver(child, NULL);
continue;
}
}
/* Fetch any flags for the device before probing. */
resource_int_value(dl->driver->name, child->unit,
"flags", &child->devflags);
result = DEVICE_PROBE(child);
/* Reset flags and devclass before the next probe. */
child->devflags = 0;
if (!hasclass)
device_set_devclass(child, NULL);
/*
* If the driver returns SUCCESS, there can be
* no higher match for this device.
*/
if (result == 0) {
best = dl;
pri = 0;
break;
}
/*
* The driver returned an error so it
* certainly doesn't match.
*/
if (result > 0) {
device_set_driver(child, NULL);
continue;
}
/*
* A priority lower than SUCCESS, remember the
* best matching driver. Initialise the value
* of pri for the first match.
*/
if (best == NULL || result > pri) {
/*
* Probes that return BUS_PROBE_NOWILDCARD
* or lower only match when they are set
* in stone by the parent bus.
*/
if (result <= BUS_PROBE_NOWILDCARD &&
child->flags & DF_WILDCARD)
continue;
best = dl;
pri = result;
continue;
}
}
/*
* If we have an unambiguous match in this devclass,
* don't look in the parent.
*/
if (best && pri == 0)
break;
}
/*
* If we found a driver, change state and initialise the devclass.
*/
/* XXX What happens if we rebid and got no best? */
if (best) {
/*
* If this device was attached, and we were asked to
* rescan, and it is a different driver, then we have
* to detach the old driver and reattach this new one.
* Note, we don't have to check for DF_REBID here
* because if the state is > DS_ALIVE, we know it must
* be.
*
* This assumes that all DF_REBID drivers can have
* their probe routine called at any time and that
* they are idempotent as well as completely benign in
* normal operations.
*
* We also have to make sure that the detach
* succeeded, otherwise we fail the operation (or
* maybe it should just fail silently? I'm torn).
*/
if (child->state > DS_ALIVE && best->driver != child->driver)
if ((result = device_detach(dev)) != 0)
return (result);
/* Set the winning driver, devclass, and flags. */
if (!child->devclass) {
result = device_set_devclass(child, best->driver->name);
if (result != 0)
return (result);
}
device_set_driver(child, best->driver);
resource_int_value(best->driver->name, child->unit,
"flags", &child->devflags);
if (pri < 0) {
/*
* A bit bogus. Call the probe method again to make
* sure that we have the right description.
*/
DEVICE_PROBE(child);
#if 0
child->flags |= DF_REBID;
#endif
} else
child->flags &= ~DF_REBID;
child->state = DS_ALIVE;
bus_data_generation_update();
return (0);
}
return (ENXIO);
}
/**
* @brief Return the parent of a device
*/
device_t
device_get_parent(device_t dev)
{
return (dev->parent);
}
/**
* @brief Get a list of children of a device
*
* An array containing a list of all the children of the given device
* is allocated and returned in @p *devlistp. The number of devices
* in the array is returned in @p *devcountp. The caller should free
* the array using @c free(p, M_TEMP).
*
* @param dev the device to examine
* @param devlistp points at location for array pointer return
* value
* @param devcountp points at location for array size return value
*
* @retval 0 success
* @retval ENOMEM the array allocation failed
*/
int
device_get_children(device_t dev, device_t **devlistp, int *devcountp)
{
int count;
device_t child;
device_t *list;
count = 0;
TAILQ_FOREACH(child, &dev->children, link) {
count++;
}
list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT|M_ZERO);
if (!list)
return (ENOMEM);
count = 0;
TAILQ_FOREACH(child, &dev->children, link) {
list[count] = child;
count++;
}
*devlistp = list;
*devcountp = count;
return (0);
}
/**
* @brief Return the current driver for the device or @c NULL if there
* is no driver currently attached
*/
driver_t *
device_get_driver(device_t dev)
{
return (dev->driver);
}
/**
* @brief Return the current devclass for the device or @c NULL if
* there is none.
*/
devclass_t
device_get_devclass(device_t dev)
{
return (dev->devclass);
}
/**
* @brief Return the name of the device's devclass or @c NULL if there
* is none.
*/
const char *
device_get_name(device_t dev)
{
if (dev != NULL && dev->devclass)
return (devclass_get_name(dev->devclass));
return (NULL);
}
/**
* @brief Return a string containing the device's devclass name
* followed by an ascii representation of the device's unit number
* (e.g. @c "foo2").
*/
const char *
device_get_nameunit(device_t dev)
{
return (dev->nameunit);
}
/**
* @brief Return the device's unit number.
*/
int
device_get_unit(device_t dev)
{
return (dev->unit);
}
/**
* @brief Return the device's description string
*/
const char *
device_get_desc(device_t dev)
{
return (dev->desc);
}
/**
* @brief Return the device's flags
*/
uint32_t
device_get_flags(device_t dev)
{
return (dev->devflags);
}
struct sysctl_ctx_list *
device_get_sysctl_ctx(device_t dev)
{
return (&dev->sysctl_ctx);
}
struct sysctl_oid *
device_get_sysctl_tree(device_t dev)
{
return (dev->sysctl_tree);
}
/**
* @brief Print the name of the device followed by a colon and a space
*
* @returns the number of characters printed
*/
int
device_print_prettyname(device_t dev)
{
const char *name = device_get_name(dev);
if (name == NULL)
return (printf("unknown: "));
return (printf("%s%d: ", name, device_get_unit(dev)));
}
/**
* @brief Print the name of the device followed by a colon, a space
* and the result of calling vprintf() with the value of @p fmt and
* the following arguments.
*
* @returns the number of characters printed
*/
int
device_printf(device_t dev, const char * fmt, ...)
{
va_list ap;
int retval;
retval = device_print_prettyname(dev);
va_start(ap, fmt);
retval += vprintf(fmt, ap);
va_end(ap);
return (retval);
}
/**
* @internal
*/
static void
device_set_desc_internal(device_t dev, const char* desc, int copy)
{
if (dev->desc && (dev->flags & DF_DESCMALLOCED)) {
free(dev->desc, M_BUS);
dev->flags &= ~DF_DESCMALLOCED;
dev->desc = NULL;
}
if (copy && desc) {
dev->desc = malloc(strlen(desc) + 1, M_BUS, M_NOWAIT);
if (dev->desc) {
strcpy(dev->desc, desc);
dev->flags |= DF_DESCMALLOCED;
}
} else {
/* Avoid a -Wcast-qual warning */
dev->desc = (char *)(uintptr_t) desc;
}
bus_data_generation_update();
}
/**
* @brief Set the device's description
*
* The value of @c desc should be a string constant that will not
* change (at least until the description is changed in a subsequent
* call to device_set_desc() or device_set_desc_copy()).
*/
void
device_set_desc(device_t dev, const char* desc)
{
device_set_desc_internal(dev, desc, FALSE);
}
/**
* @brief Set the device's description
*
* The string pointed to by @c desc is copied. Use this function if
* the device description is generated, (e.g. with sprintf()).
*/
void
device_set_desc_copy(device_t dev, const char* desc)
{
device_set_desc_internal(dev, desc, TRUE);
}
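/*
 * Hypothetical sketch: a driver that generates its description at
 * probe time should use device_set_desc_copy(), since the buffer may
 * be transient (e.g. on the stack). The value "rev" stands for some
 * number read from the hardware and is only illustrative.
 *
 * @code
 *	char desc[64];
 *
 *	snprintf(desc, sizeof(desc), "Example controller rev %u", rev);
 *	device_set_desc_copy(dev, desc);
 * @endcode
 */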
/**
* @brief Set the device's flags
*/
void
device_set_flags(device_t dev, uint32_t flags)
{
dev->devflags = flags;
}
/**
* @brief Return the device's softc field
*
* The softc is allocated and zeroed when a driver is attached, based
* on the size field of the driver.
*/
void *
device_get_softc(device_t dev)
{
return (dev->softc);
}
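/*
 * Typical (hypothetical) usage: because the softc is allocated and
 * zeroed based on driver_t.size, an attach routine can fetch it
 * directly. "struct foo_softc" is illustrative only.
 *
 * @code
 *	struct foo_softc *sc;
 *
 *	sc = device_get_softc(dev);	// already zeroed by the framework
 *	sc->foo_dev = dev;
 * @endcode
 */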
/**
* @brief Set the device's softc field
*
* Most drivers do not need to use this since the softc is allocated
* automatically when the driver is attached.
*/
void
device_set_softc(device_t dev, void *softc)
{
if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC))
free(dev->softc, M_BUS_SC);
dev->softc = softc;
if (dev->softc)
dev->flags |= DF_EXTERNALSOFTC;
else
dev->flags &= ~DF_EXTERNALSOFTC;
}
/**
* @brief Get the device's ivars field
*
* The ivars field is used by the parent device to store per-device
* state (e.g. the physical location of the device or a list of
* resources).
*/
void *
device_get_ivars(device_t dev)
{
KASSERT(dev != NULL, ("device_get_ivars(NULL, ...)"));
return (dev->ivars);
}
/**
* @brief Set the device's ivars field
*/
void
device_set_ivars(device_t dev, void * ivars)
{
KASSERT(dev != NULL, ("device_set_ivars(NULL, ...)"));
dev->ivars = ivars;
}
/**
* @brief Return the device's state
*/
device_state_t
device_get_state(device_t dev)
{
return (dev->state);
}
/**
* @brief Set the DF_ENABLED flag for the device
*/
void
device_enable(device_t dev)
{
dev->flags |= DF_ENABLED;
}
/**
* @brief Clear the DF_ENABLED flag for the device
*/
void
device_disable(device_t dev)
{
dev->flags &= ~DF_ENABLED;
}
/**
* @brief Increment the busy counter for the device
*/
void
device_busy(device_t dev)
{
if (dev->state < DS_ATTACHED)
panic("device_busy: called for unattached device");
if (dev->busy == 0 && dev->parent)
device_busy(dev->parent);
dev->busy++;
dev->state = DS_BUSY;
}
/**
* @brief Decrement the busy counter for the device
*/
void
device_unbusy(device_t dev)
{
if (dev->state != DS_BUSY)
panic("device_unbusy: called for non-busy device %s",
device_get_nameunit(dev));
dev->busy--;
if (dev->busy == 0) {
if (dev->parent)
device_unbusy(dev->parent);
dev->state = DS_ATTACHED;
}
}
/**
* @brief Set the DF_QUIET flag for the device
*/
void
device_quiet(device_t dev)
{
dev->flags |= DF_QUIET;
}
/**
* @brief Clear the DF_QUIET flag for the device
*/
void
device_verbose(device_t dev)
{
dev->flags &= ~DF_QUIET;
}
/**
* @brief Return non-zero if the DF_QUIET flag is set on the device
*/
int
device_is_quiet(device_t dev)
{
return ((dev->flags & DF_QUIET) != 0);
}
/**
* @brief Return non-zero if the DF_ENABLED flag is set on the device
*/
int
device_is_enabled(device_t dev)
{
return ((dev->flags & DF_ENABLED) != 0);
}
/**
* @brief Return non-zero if the device was successfully probed
*/
int
device_is_alive(device_t dev)
{
return (dev->state >= DS_ALIVE);
}
/**
* @brief Return non-zero if the device currently has a driver
* attached to it
*/
int
device_is_attached(device_t dev)
{
return (dev->state >= DS_ATTACHED);
}
/**
* @brief Set the devclass of a device
* @see devclass_add_device().
*/
int
device_set_devclass(device_t dev, const char *classname)
{
devclass_t dc;
int error;
if (!classname) {
if (dev->devclass)
devclass_delete_device(dev->devclass, dev);
return (0);
}
if (dev->devclass) {
printf("device_set_devclass: device class already set\n");
return (EINVAL);
}
dc = devclass_find_internal(classname, NULL, TRUE);
if (!dc)
return (ENOMEM);
error = devclass_add_device(dc, dev);
bus_data_generation_update();
return (error);
}
/**
* @brief Set the driver of a device
*
* @retval 0 success
* @retval EBUSY the device already has a driver attached
* @retval ENOMEM a memory allocation failure occurred
*/
int
device_set_driver(device_t dev, driver_t *driver)
{
if (dev->state >= DS_ATTACHED)
return (EBUSY);
if (dev->driver == driver)
return (0);
if (dev->softc && !(dev->flags & DF_EXTERNALSOFTC)) {
free(dev->softc, M_BUS_SC);
dev->softc = NULL;
}
kobj_delete((kobj_t) dev, NULL);
dev->driver = driver;
if (driver) {
kobj_init((kobj_t) dev, (kobj_class_t) driver);
if (!(dev->flags & DF_EXTERNALSOFTC) && driver->size > 0) {
dev->softc = malloc(driver->size, M_BUS_SC,
M_NOWAIT | M_ZERO);
if (!dev->softc) {
kobj_delete((kobj_t) dev, NULL);
kobj_init((kobj_t) dev, &null_class);
dev->driver = NULL;
return (ENOMEM);
}
}
} else {
kobj_init((kobj_t) dev, &null_class);
}
bus_data_generation_update();
return (0);
}
/**
 * @brief Probe a device and return its status.
*
* This function is the core of the device autoconfiguration
* system. Its purpose is to select a suitable driver for a device and
* then call that driver to initialise the hardware appropriately. The
* driver is selected by calling the DEVICE_PROBE() method of a set of
* candidate drivers and then choosing the driver which returned the
* best value. This driver is then attached to the device using
* device_attach().
*
* The set of suitable drivers is taken from the list of drivers in
* the parent device's devclass. If the device was originally created
* with a specific class name (see device_add_child()), only drivers
* with that name are probed, otherwise all drivers in the devclass
* are probed. If no drivers return successful probe values in the
* parent devclass, the search continues in the parent of that
* devclass (see devclass_get_parent()) if any.
*
* @param dev the device to initialise
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
* @retval -1 Device already attached
*/
int
device_probe(device_t dev)
{
int error;
GIANT_REQUIRED;
if (dev->state >= DS_ALIVE && (dev->flags & DF_REBID) == 0)
return (-1);
if (!(dev->flags & DF_ENABLED)) {
if (bootverbose && device_get_name(dev) != NULL) {
device_print_prettyname(dev);
printf("not probed (disabled)\n");
}
return (-1);
}
if ((error = device_probe_child(dev->parent, dev)) != 0) {
if (bus_current_pass == BUS_PASS_DEFAULT &&
!(dev->flags & DF_DONENOMATCH)) {
BUS_PROBE_NOMATCH(dev->parent, dev);
devnomatch(dev);
dev->flags |= DF_DONENOMATCH;
}
return (error);
}
return (0);
}
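/*
 * Hypothetical sketch of a DEVICE_PROBE() method following the
 * return-value convention used above: 0 is an unbeatable match,
 * negative values (e.g. BUS_PROBE_DEFAULT) are progressively weaker
 * bids, and a positive errno rejects the device. FOO_VENDOR_ID and
 * the pci attachment are purely illustrative.
 *
 * @code
 *	static int
 *	foo_probe(device_t dev)
 *	{
 *		if (pci_get_vendor(dev) != FOO_VENDOR_ID)
 *			return (ENXIO);			// not our device
 *		device_set_desc(dev, "Foo example controller");
 *		return (BUS_PROBE_DEFAULT);		// ordinary bid
 *	}
 * @endcode
 */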
/**
* @brief Probe a device and attach a driver if possible
*
 * Calls device_probe() and, if the probe succeeds, attaches the device.
*/
int
device_probe_and_attach(device_t dev)
{
int error;
GIANT_REQUIRED;
error = device_probe(dev);
if (error == -1)
return (0);
else if (error != 0)
return (error);
return (device_attach(dev));
}
/**
* @brief Attach a device driver to a device
*
* This function is a wrapper around the DEVICE_ATTACH() driver
* method. In addition to calling DEVICE_ATTACH(), it initialises the
* device's sysctl tree, optionally prints a description of the device
* and queues a notification event for user-based device management
* services.
*
* Normally this function is only called internally from
* device_probe_and_attach().
*
* @param dev the device to initialise
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
*/
int
device_attach(device_t dev)
{
int error;
device_sysctl_init(dev);
if (!device_is_quiet(dev))
device_print_child(dev->parent, dev);
if ((error = DEVICE_ATTACH(dev)) != 0) {
printf("device_attach: %s%d attach returned %d\n",
dev->driver->name, dev->unit, error);
/* Unset the class; set in device_probe_child */
if (dev->devclass == NULL)
device_set_devclass(dev, NULL);
device_set_driver(dev, NULL);
device_sysctl_fini(dev);
dev->state = DS_NOTPRESENT;
return (error);
}
device_sysctl_update(dev);
dev->state = DS_ATTACHED;
dev->flags &= ~DF_DONENOMATCH;
devadded(dev);
return (0);
}
/**
* @brief Detach a driver from a device
*
* This function is a wrapper around the DEVICE_DETACH() driver
* method. If the call to DEVICE_DETACH() succeeds, it calls
* BUS_CHILD_DETACHED() for the parent of @p dev, queues a
* notification event for user-based device management services and
* cleans up the device's sysctl tree.
*
* @param dev the device to un-initialise
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
*/
int
device_detach(device_t dev)
{
int error;
GIANT_REQUIRED;
PDEBUG(("%s", DEVICENAME(dev)));
if (dev->state == DS_BUSY)
return (EBUSY);
if (dev->state != DS_ATTACHED)
return (0);
if ((error = DEVICE_DETACH(dev)) != 0)
return (error);
devremoved(dev);
if (!device_is_quiet(dev))
device_printf(dev, "detached\n");
if (dev->parent)
BUS_CHILD_DETACHED(dev->parent, dev);
if (!(dev->flags & DF_FIXEDCLASS))
devclass_delete_device(dev->devclass, dev);
dev->state = DS_NOTPRESENT;
device_set_driver(dev, NULL);
device_set_desc(dev, NULL);
device_sysctl_fini(dev);
return (0);
}
/**
* @brief Tells a driver to quiesce itself.
*
* This function is a wrapper around the DEVICE_QUIESCE() driver
 * method. If the call to DEVICE_QUIESCE() succeeds, the driver has
 * indicated that the device can later be detached safely.
*
* @param dev the device to quiesce
*
* @retval 0 success
* @retval ENXIO no driver was found
* @retval ENOMEM memory allocation failure
* @retval non-zero some other unix error code
*/
int
device_quiesce(device_t dev)
{
PDEBUG(("%s", DEVICENAME(dev)));
if (dev->state == DS_BUSY)
return (EBUSY);
if (dev->state != DS_ATTACHED)
return (0);
return (DEVICE_QUIESCE(dev));
}
/**
* @brief Notify a device of system shutdown
*
* This function calls the DEVICE_SHUTDOWN() driver method if the
* device currently has an attached driver.
*
* @returns the value returned by DEVICE_SHUTDOWN()
*/
int
device_shutdown(device_t dev)
{
if (dev->state < DS_ATTACHED)
return (0);
return (DEVICE_SHUTDOWN(dev));
}
/**
* @brief Set the unit number of a device
*
* This function can be used to override the unit number used for a
* device (e.g. to wire a device to a pre-configured unit number).
*/
int
device_set_unit(device_t dev, int unit)
{
devclass_t dc;
int err;
dc = device_get_devclass(dev);
if (unit < dc->maxunit && dc->devices[unit])
return (EBUSY);
err = devclass_delete_device(dc, dev);
if (err)
return (err);
dev->unit = unit;
err = devclass_add_device(dc, dev);
if (err)
return (err);
bus_data_generation_update();
return (0);
}
/*======================================*/
/*
* Some useful method implementations to make life easier for bus drivers.
*/
/**
* @brief Initialise a resource list.
*
* @param rl the resource list to initialise
*/
void
resource_list_init(struct resource_list *rl)
{
STAILQ_INIT(rl);
}
/**
* @brief Reclaim memory used by a resource list.
*
* This function frees the memory for all resource entries on the list
* (if any).
*
* @param rl the resource list to free
*/
void
resource_list_free(struct resource_list *rl)
{
struct resource_list_entry *rle;
while ((rle = STAILQ_FIRST(rl)) != NULL) {
if (rle->res)
panic("resource_list_free: resource entry is busy");
STAILQ_REMOVE_HEAD(rl, link);
free(rle, M_BUS);
}
}
/**
* @brief Add a resource entry.
*
* This function adds a resource entry using the given @p type, @p
* start, @p end and @p count values. A rid value is chosen by
* searching sequentially for the first unused rid starting at zero.
*
* @param rl the resource list to edit
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param start the start address of the resource
* @param end the end address of the resource
* @param count XXX end-start+1
*/
int
resource_list_add_next(struct resource_list *rl, int type, u_long start,
u_long end, u_long count)
{
int rid;
rid = 0;
while (resource_list_find(rl, type, rid) != NULL)
rid++;
resource_list_add(rl, type, rid, start, end, count);
return (rid);
}
/**
* @brief Add or modify a resource entry.
*
 * If an entry with the same type and rid already exists, it will be
* modified using the given values of @p start, @p end and @p
* count. If no entry exists, a new one will be created using the
* given values. The resource list entry that matches is then returned.
*
* @param rl the resource list to edit
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
* @param start the start address of the resource
* @param end the end address of the resource
* @param count XXX end-start+1
*/
struct resource_list_entry *
resource_list_add(struct resource_list *rl, int type, int rid,
u_long start, u_long end, u_long count)
{
struct resource_list_entry *rle;
rle = resource_list_find(rl, type, rid);
if (!rle) {
rle = malloc(sizeof(struct resource_list_entry), M_BUS,
M_NOWAIT);
if (!rle)
panic("resource_list_add: can't record entry");
STAILQ_INSERT_TAIL(rl, rle, link);
rle->type = type;
rle->rid = rid;
rle->res = NULL;
rle->flags = 0;
}
if (rle->res)
panic("resource_list_add: resource entry is busy");
rle->start = start;
rle->end = end;
rle->count = count;
return (rle);
}
/**
* @brief Determine if a resource entry is busy.
*
* Returns true if a resource entry is busy meaning that it has an
* associated resource that is not an unallocated "reserved" resource.
*
* @param rl the resource list to search
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*
* @returns Non-zero if the entry is busy, zero otherwise.
*/
int
resource_list_busy(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle;
rle = resource_list_find(rl, type, rid);
if (rle == NULL || rle->res == NULL)
return (0);
if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) == RLE_RESERVED) {
KASSERT(!(rman_get_flags(rle->res) & RF_ACTIVE),
("reserved resource is active"));
return (0);
}
return (1);
}
/**
* @brief Determine if a resource entry is reserved.
*
* Returns true if a resource entry is reserved meaning that it has an
* associated "reserved" resource. The resource can either be
* allocated or unallocated.
*
* @param rl the resource list to search
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*
* @returns Non-zero if the entry is reserved, zero otherwise.
*/
int
resource_list_reserved(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle;
rle = resource_list_find(rl, type, rid);
if (rle != NULL && rle->flags & RLE_RESERVED)
return (1);
return (0);
}
/**
* @brief Find a resource entry by type and rid.
*
* @param rl the resource list to search
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*
* @returns the resource entry pointer or NULL if there is no such
* entry.
*/
struct resource_list_entry *
resource_list_find(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle;
STAILQ_FOREACH(rle, rl, link) {
if (rle->type == type && rle->rid == rid)
return (rle);
}
return (NULL);
}
/**
* @brief Delete a resource entry.
*
* @param rl the resource list to edit
* @param type the resource entry type (e.g. SYS_RES_MEMORY)
* @param rid the resource identifier
*/
void
resource_list_delete(struct resource_list *rl, int type, int rid)
{
struct resource_list_entry *rle = resource_list_find(rl, type, rid);
if (rle) {
if (rle->res != NULL)
panic("resource_list_delete: resource has not been released");
STAILQ_REMOVE(rl, rle, resource_list_entry, link);
free(rle, M_BUS);
}
}
/**
* @brief Allocate a reserved resource
*
* This can be used by busses to force the allocation of resources
* that are always active in the system even if they are not allocated
* by a driver (e.g. PCI BARs). This function is usually called when
* adding a new child to the bus. The resource is allocated from the
* parent bus when it is reserved. The resource list entry is marked
* with RLE_RESERVED to note that it is a reserved resource.
*
* Subsequent attempts to allocate the resource with
* resource_list_alloc() will succeed the first time and will set
* RLE_ALLOCATED to note that it has been allocated. When a reserved
* resource that has been allocated is released with
* resource_list_release() the resource RLE_ALLOCATED is cleared, but
* the actual resource remains allocated. The resource can be released to
* the parent bus by calling resource_list_unreserve().
*
* @param rl the resource list to allocate from
* @param bus the parent device of @p child
* @param child the device for which the resource is being reserved
* @param type the type of resource to allocate
* @param rid a pointer to the resource identifier
* @param start hint at the start of the resource range - pass
* @c 0UL for any start address
* @param end hint at the end of the resource range - pass
* @c ~0UL for any end address
* @param count hint at the size of range required - pass @c 1
* for any size
* @param flags any extra flags to control the resource
* allocation - see @c RF_XXX flags in
* <sys/rman.h> for details
*
* @returns the resource which was allocated or @c NULL if no
* resource could be allocated
*/
struct resource *
resource_list_reserve(struct resource_list *rl, device_t bus, device_t child,
int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
struct resource *r;
if (passthrough)
panic(
"resource_list_reserve() should only be called for direct children");
if (flags & RF_ACTIVE)
panic(
"resource_list_reserve() should only reserve inactive resources");
r = resource_list_alloc(rl, bus, child, type, rid, start, end, count,
flags);
if (r != NULL) {
rle = resource_list_find(rl, type, *rid);
rle->flags |= RLE_RESERVED;
}
return (r);
}
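/*
 * Hypothetical sketch: a bus driver reserving a memory window for a
 * freshly added child so that the range stays allocated from the
 * parent even while no driver is attached (the PCI BAR pattern
 * described above). The rid, start, end and count values are
 * illustrative only.
 *
 * @code
 *	int rid = 0x10;
 *
 *	resource_list_reserve(rl, bus, child, SYS_RES_MEMORY, &rid,
 *	    start, end, count, 0);
 * @endcode
 */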
/**
* @brief Helper function for implementing BUS_ALLOC_RESOURCE()
*
* Implement BUS_ALLOC_RESOURCE() by looking up a resource from the list
* and passing the allocation up to the parent of @p bus. This assumes
* that the first entry of @c device_get_ivars(child) is a struct
* resource_list. This also handles 'passthrough' allocations where a
* child is a remote descendant of bus by passing the allocation up to
* the parent of bus.
*
* Typically, a bus driver would store a list of child resources
* somewhere in the child device's ivars (see device_get_ivars()) and
* its implementation of BUS_ALLOC_RESOURCE() would find that list and
* then call resource_list_alloc() to perform the allocation.
*
* @param rl the resource list to allocate from
* @param bus the parent device of @p child
* @param child the device which is requesting an allocation
* @param type the type of resource to allocate
* @param rid a pointer to the resource identifier
* @param start hint at the start of the resource range - pass
* @c 0UL for any start address
* @param end hint at the end of the resource range - pass
* @c ~0UL for any end address
* @param count hint at the size of range required - pass @c 1
* for any size
* @param flags any extra flags to control the resource
* allocation - see @c RF_XXX flags in
* <sys/rman.h> for details
*
* @returns the resource which was allocated or @c NULL if no
* resource could be allocated
*/
struct resource *
resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
int isdefault = (start == 0UL && end == ~0UL);
if (passthrough) {
return (BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
type, rid, start, end, count, flags));
}
rle = resource_list_find(rl, type, *rid);
if (!rle)
return (NULL); /* no resource of that type/rid */
if (rle->res) {
if (rle->flags & RLE_RESERVED) {
if (rle->flags & RLE_ALLOCATED)
return (NULL);
if ((flags & RF_ACTIVE) &&
bus_activate_resource(child, type, *rid,
rle->res) != 0)
return (NULL);
rle->flags |= RLE_ALLOCATED;
return (rle->res);
}
panic("resource_list_alloc: resource entry is busy");
}
if (isdefault) {
start = rle->start;
count = ulmax(count, rle->count);
end = ulmax(rle->end, start + count - 1);
}
rle->res = BUS_ALLOC_RESOURCE(device_get_parent(bus), child,
type, rid, start, end, count, flags);
/*
* Record the new range.
*/
if (rle->res) {
rle->start = rman_get_start(rle->res);
rle->end = rman_get_end(rle->res);
rle->count = count;
}
return (rle->res);
}
/**
* @brief Helper function for implementing BUS_RELEASE_RESOURCE()
*
* Implement BUS_RELEASE_RESOURCE() using a resource list. Normally
* used with resource_list_alloc().
*
* @param rl the resource list which was allocated from
* @param bus the parent device of @p child
* @param child the device which is requesting a release
* @param type the type of resource to release
* @param rid the resource identifier
* @param res the resource to release
*
* @retval 0 success
* @retval non-zero a standard unix error code indicating what
* error condition prevented the operation
*/
int
resource_list_release(struct resource_list *rl, device_t bus, device_t child,
int type, int rid, struct resource *res)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
int error;
if (passthrough) {
return (BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
type, rid, res));
}
rle = resource_list_find(rl, type, rid);
if (!rle)
panic("resource_list_release: can't find resource");
if (!rle->res)
panic("resource_list_release: resource entry is not busy");
if (rle->flags & RLE_RESERVED) {
if (rle->flags & RLE_ALLOCATED) {
if (rman_get_flags(res) & RF_ACTIVE) {
error = bus_deactivate_resource(child, type,
rid, res);
if (error)
return (error);
}
rle->flags &= ~RLE_ALLOCATED;
return (0);
}
return (EINVAL);
}
error = BUS_RELEASE_RESOURCE(device_get_parent(bus), child,
type, rid, res);
if (error)
return (error);
rle->res = NULL;
return (0);
}
/**
* @brief Fully release a reserved resource
*
 * Fully releases a resource reserved via resource_list_reserve().
*
* @param rl the resource list which was allocated from
* @param bus the parent device of @p child
* @param child the device whose reserved resource is being released
* @param type the type of resource to release
* @param rid the resource identifier
* @param res the resource to release
*
* @retval 0 success
* @retval non-zero a standard unix error code indicating what
* error condition prevented the operation
*/
int
resource_list_unreserve(struct resource_list *rl, device_t bus, device_t child,
int type, int rid)
{
struct resource_list_entry *rle = NULL;
int passthrough = (device_get_parent(child) != bus);
if (passthrough)
panic(
"resource_list_unreserve() should only be called for direct children");
rle = resource_list_find(rl, type, rid);
if (!rle)
panic("resource_list_unreserve: can't find resource");
if (!(rle->flags & RLE_RESERVED))
return (EINVAL);
if (rle->flags & RLE_ALLOCATED)
return (EBUSY);
rle->flags &= ~RLE_RESERVED;
return (resource_list_release(rl, bus, child, type, rid, rle->res));
}
/**
* @brief Print a description of resources in a resource list
*
* Print all resources of a specified type, for use in BUS_PRINT_CHILD().
* The name is printed if at least one resource of the given type is available.
* The format is used to print resource start and end.
*
* @param rl the resource list to print
* @param name the name of @p type, e.g. @c "memory"
 * @param type the type of resource entry to print
* @param format printf(9) format string to print resource
* start and end values
*
* @returns the number of characters printed
*/
int
resource_list_print_type(struct resource_list *rl, const char *name, int type,
const char *format)
{
struct resource_list_entry *rle;
int printed, retval;
printed = 0;
retval = 0;
/* Yes, this is kinda cheating */
STAILQ_FOREACH(rle, rl, link) {
if (rle->type == type) {
if (printed == 0)
retval += printf(" %s ", name);
else
retval += printf(",");
printed++;
retval += printf(format, rle->start);
if (rle->count > 1) {
retval += printf("-");
retval += printf(format, rle->start +
rle->count - 1);
}
}
}
return (retval);
}
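/*
 * Hypothetical usage from a bus's BUS_PRINT_CHILD() method; the format
 * string controls how the start and end values are rendered:
 *
 * @code
 *	retval += resource_list_print_type(rl, "port", SYS_RES_IOPORT, "%#lx");
 *	retval += resource_list_print_type(rl, "mem", SYS_RES_MEMORY, "%#lx");
 *	retval += resource_list_print_type(rl, "irq", SYS_RES_IRQ, "%ld");
 * @endcode
 */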
/**
* @brief Releases all the resources in a list.
*
* @param rl The resource list to purge.
*
* @returns nothing
*/
void
resource_list_purge(struct resource_list *rl)
{
struct resource_list_entry *rle;
while ((rle = STAILQ_FIRST(rl)) != NULL) {
if (rle->res)
bus_release_resource(rman_get_device(rle->res),
rle->type, rle->rid, rle->res);
STAILQ_REMOVE_HEAD(rl, link);
free(rle, M_BUS);
}
}
device_t
bus_generic_add_child(device_t dev, u_int order, const char *name, int unit)
{
return (device_add_child_ordered(dev, order, name, unit));
}
/**
* @brief Helper function for implementing DEVICE_PROBE()
*
* This function can be used to help implement the DEVICE_PROBE() for
* a bus (i.e. a device which has other devices attached to it). It
* calls the DEVICE_IDENTIFY() method of each driver in the device's
* devclass.
*/
int
bus_generic_probe(device_t dev)
{
devclass_t dc = dev->devclass;
driverlink_t dl;
TAILQ_FOREACH(dl, &dc->drivers, link) {
/*
* If this driver's pass is too high, then ignore it.
* For most drivers in the default pass, this will
* never be true. For early-pass drivers they will
* only call the identify routines of eligible drivers
* when this routine is called. Drivers for later
* passes should have their identify routines called
* on early-pass busses during BUS_NEW_PASS().
*/
if (dl->pass > bus_current_pass)
continue;
DEVICE_IDENTIFY(dl->driver, dev);
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_ATTACH()
*
* This function can be used to help implement the DEVICE_ATTACH() for
* a bus. It calls device_probe_and_attach() for each of the device's
* children.
*/
int
bus_generic_attach(device_t dev)
{
device_t child;
TAILQ_FOREACH(child, &dev->children, link) {
device_probe_and_attach(child);
}
return (0);
}
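/*
 * Hypothetical sketch of a simple bus DEVICE_ATTACH() method combining
 * the two generic helpers above: ask the eligible drivers to identify
 * children, then probe and attach whatever was added.
 *
 * @code
 *	static int
 *	foobus_attach(device_t dev)
 *	{
 *		bus_generic_probe(dev);
 *		bus_generic_attach(dev);
 *		return (0);
 *	}
 * @endcode
 */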
/**
* @brief Helper function for implementing DEVICE_DETACH()
*
* This function can be used to help implement the DEVICE_DETACH() for
* a bus. It calls device_detach() for each of the device's
* children.
*/
int
bus_generic_detach(device_t dev)
{
device_t child;
int error;
if (dev->state != DS_ATTACHED)
return (EBUSY);
TAILQ_FOREACH(child, &dev->children, link) {
if ((error = device_detach(child)) != 0)
return (error);
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_SHUTDOWN()
*
* This function can be used to help implement the DEVICE_SHUTDOWN()
* for a bus. It calls device_shutdown() for each of the device's
* children.
*/
int
bus_generic_shutdown(device_t dev)
{
device_t child;
TAILQ_FOREACH(child, &dev->children, link) {
device_shutdown(child);
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_SUSPEND()
*
* This function can be used to help implement the DEVICE_SUSPEND()
* for a bus. It calls DEVICE_SUSPEND() for each of the device's
* children. If any call to DEVICE_SUSPEND() fails, the suspend
* operation is aborted and any devices which were suspended are
* resumed immediately by calling their DEVICE_RESUME() methods.
*/
int
bus_generic_suspend(device_t dev)
{
int error;
device_t child, child2;
TAILQ_FOREACH(child, &dev->children, link) {
error = DEVICE_SUSPEND(child);
if (error) {
for (child2 = TAILQ_FIRST(&dev->children);
child2 && child2 != child;
child2 = TAILQ_NEXT(child2, link))
DEVICE_RESUME(child2);
return (error);
}
}
return (0);
}
/**
* @brief Helper function for implementing DEVICE_RESUME()
*
* This function can be used to help implement the DEVICE_RESUME() for
* a bus. It calls DEVICE_RESUME() on each of the device's children.
*/
int
bus_generic_resume(device_t dev)
{
device_t child;
TAILQ_FOREACH(child, &dev->children, link) {
DEVICE_RESUME(child);
/* if resume fails, there's nothing we can usefully do... */
}
return (0);
}
/**
* @brief Helper function for implementing BUS_PRINT_CHILD().
*
* This function prints the first part of the ascii representation of
* @p child, including its name, unit and description (if any - see
* device_set_desc()).
*
* @returns the number of characters printed
*/
int
bus_print_child_header(device_t dev, device_t child)
{
int retval = 0;
if (device_get_desc(child)) {
retval += device_printf(child, "<%s>", device_get_desc(child));
} else {
retval += printf("%s", device_get_nameunit(child));
}
return (retval);
}
/**
* @brief Helper function for implementing BUS_PRINT_CHILD().
*
* This function prints the last part of the ascii representation of
* @p child, which consists of the string @c " on " followed by the
* name and unit of the @p dev.
*
* @returns the number of characters printed
*/
int
bus_print_child_footer(device_t dev, device_t child)
{
return (printf(" on %s\n", device_get_nameunit(dev)));
}
/**
* @brief Helper function for implementing BUS_PRINT_CHILD().
*
* This function simply calls bus_print_child_header() followed by
* bus_print_child_footer().
*
* @returns the number of characters printed
*/
int
bus_generic_print_child(device_t dev, device_t child)
{
int retval = 0;
retval += bus_print_child_header(dev, child);
retval += bus_print_child_footer(dev, child);
return (retval);
}
/**
* @brief Stub function for implementing BUS_READ_IVAR().
*
* @returns ENOENT
*/
int
bus_generic_read_ivar(device_t dev, device_t child, int index,
uintptr_t * result)
{
return (ENOENT);
}
/**
* @brief Stub function for implementing BUS_WRITE_IVAR().
*
* @returns ENOENT
*/
int
bus_generic_write_ivar(device_t dev, device_t child, int index,
uintptr_t value)
{
return (ENOENT);
}
/**
* @brief Stub function for implementing BUS_GET_RESOURCE_LIST().
*
* @returns NULL
*/
struct resource_list *
bus_generic_get_resource_list(device_t dev, device_t child)
{
return (NULL);
}
/**
* @brief Helper function for implementing BUS_DRIVER_ADDED().
*
* This implementation of BUS_DRIVER_ADDED() simply calls the driver's
* DEVICE_IDENTIFY() method to allow it to add new children to the bus
* and then calls device_probe_and_attach() for each unattached child.
*/
void
bus_generic_driver_added(device_t dev, driver_t *driver)
{
device_t child;
DEVICE_IDENTIFY(driver, dev);
TAILQ_FOREACH(child, &dev->children, link) {
if (child->state == DS_NOTPRESENT ||
(child->flags & DF_REBID))
device_probe_and_attach(child);
}
}
/**
* @brief Helper function for implementing BUS_NEW_PASS().
*
 * This implementation of BUS_NEW_PASS() first calls the identify
* routines for any drivers that probe at the current pass. Then it
* walks the list of devices for this bus. If a device is already
* attached, then it calls BUS_NEW_PASS() on that device. If the
* device is not already attached, it attempts to attach a driver to
* it.
*/
void
bus_generic_new_pass(device_t dev)
{
driverlink_t dl;
devclass_t dc;
device_t child;
dc = dev->devclass;
TAILQ_FOREACH(dl, &dc->drivers, link) {
if (dl->pass == bus_current_pass)
DEVICE_IDENTIFY(dl->driver, dev);
}
TAILQ_FOREACH(child, &dev->children, link) {
if (child->state >= DS_ATTACHED)
BUS_NEW_PASS(child);
else if (child->state == DS_NOTPRESENT)
device_probe_and_attach(child);
}
}
/**
* @brief Helper function for implementing BUS_SETUP_INTR().
*
* This simple implementation of BUS_SETUP_INTR() simply calls the
* BUS_SETUP_INTR() method of the parent of @p dev.
*/
int
bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg,
void **cookiep)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
filter, intr, arg, cookiep));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_TEARDOWN_INTR().
*
* This simple implementation of BUS_TEARDOWN_INTR() simply calls the
* BUS_TEARDOWN_INTR() method of the parent of @p dev.
*/
int
bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq,
void *cookie)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_ADJUST_RESOURCE().
*
* This simple implementation of BUS_ADJUST_RESOURCE() simply calls the
* BUS_ADJUST_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_adjust_resource(device_t dev, device_t child, int type,
struct resource *r, u_long start, u_long end)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_ADJUST_RESOURCE(dev->parent, child, type, r, start,
end));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_ALLOC_RESOURCE().
*
* This simple implementation of BUS_ALLOC_RESOURCE() simply calls the
* BUS_ALLOC_RESOURCE() method of the parent of @p dev.
*/
struct resource *
bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid,
u_long start, u_long end, u_long count, u_int flags)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid,
start, end, count, flags));
return (NULL);
}
/**
* @brief Helper function for implementing BUS_RELEASE_RESOURCE().
*
* This simple implementation of BUS_RELEASE_RESOURCE() simply calls the
* BUS_RELEASE_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_release_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid,
r));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_ACTIVATE_RESOURCE().
*
* This simple implementation of BUS_ACTIVATE_RESOURCE() simply calls the
* BUS_ACTIVATE_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_activate_resource(device_t dev, device_t child, int type, int rid,
struct resource *r)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid,
r));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_DEACTIVATE_RESOURCE().
*
* This simple implementation of BUS_DEACTIVATE_RESOURCE() simply calls the
* BUS_DEACTIVATE_RESOURCE() method of the parent of @p dev.
*/
int
bus_generic_deactivate_resource(device_t dev, device_t child, int type,
int rid, struct resource *r)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid,
r));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_BIND_INTR().
*
* This simple implementation of BUS_BIND_INTR() simply calls the
* BUS_BIND_INTR() method of the parent of @p dev.
*/
int
bus_generic_bind_intr(device_t dev, device_t child, struct resource *irq,
int cpu)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_BIND_INTR(dev->parent, child, irq, cpu));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_CONFIG_INTR().
*
* This simple implementation of BUS_CONFIG_INTR() simply calls the
* BUS_CONFIG_INTR() method of the parent of @p dev.
*/
int
bus_generic_config_intr(device_t dev, int irq, enum intr_trigger trig,
enum intr_polarity pol)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_CONFIG_INTR(dev->parent, irq, trig, pol));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_DESCRIBE_INTR().
*
* This simple implementation of BUS_DESCRIBE_INTR() simply calls the
* BUS_DESCRIBE_INTR() method of the parent of @p dev.
*/
int
bus_generic_describe_intr(device_t dev, device_t child, struct resource *irq,
void *cookie, const char *descr)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent)
return (BUS_DESCRIBE_INTR(dev->parent, child, irq, cookie,
descr));
return (EINVAL);
}
/**
* @brief Helper function for implementing BUS_GET_DMA_TAG().
*
* This simple implementation of BUS_GET_DMA_TAG() simply calls the
* BUS_GET_DMA_TAG() method of the parent of @p dev.
*/
bus_dma_tag_t
bus_generic_get_dma_tag(device_t dev, device_t child)
{
/* Propagate up the bus hierarchy until someone handles it. */
if (dev->parent != NULL)
return (BUS_GET_DMA_TAG(dev->parent, child));
return (NULL);
}
/**
* @brief Helper function for implementing BUS_GET_RESOURCE().
*
* This implementation of BUS_GET_RESOURCE() uses the
* resource_list_find() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list to
* search.
*/
int
bus_generic_rl_get_resource(device_t dev, device_t child, int type, int rid,
u_long *startp, u_long *countp)
{
struct resource_list * rl = NULL;
struct resource_list_entry * rle = NULL;
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (EINVAL);
rle = resource_list_find(rl, type, rid);
if (!rle)
return (ENOENT);
if (startp)
*startp = rle->start;
if (countp)
*countp = rle->count;
return (0);
}
/**
* @brief Helper function for implementing BUS_SET_RESOURCE().
*
* This implementation of BUS_SET_RESOURCE() uses the
* resource_list_add() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list to
* edit.
*/
int
bus_generic_rl_set_resource(device_t dev, device_t child, int type, int rid,
u_long start, u_long count)
{
struct resource_list * rl = NULL;
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (EINVAL);
resource_list_add(rl, type, rid, start, (start + count - 1), count);
return (0);
}
/**
* @brief Helper function for implementing BUS_DELETE_RESOURCE().
*
* This implementation of BUS_DELETE_RESOURCE() uses the
* resource_list_delete() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list to
* edit.
*/
void
bus_generic_rl_delete_resource(device_t dev, device_t child, int type, int rid)
{
struct resource_list * rl = NULL;
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return;
resource_list_delete(rl, type, rid);
return;
}
/**
* @brief Helper function for implementing BUS_RELEASE_RESOURCE().
*
* This implementation of BUS_RELEASE_RESOURCE() uses the
* resource_list_release() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list.
*/
int
bus_generic_rl_release_resource(device_t dev, device_t child, int type,
int rid, struct resource *r)
{
struct resource_list * rl = NULL;
if (device_get_parent(child) != dev)
return (BUS_RELEASE_RESOURCE(device_get_parent(dev), child,
type, rid, r));
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (EINVAL);
return (resource_list_release(rl, dev, child, type, rid, r));
}
/**
* @brief Helper function for implementing BUS_ALLOC_RESOURCE().
*
* This implementation of BUS_ALLOC_RESOURCE() uses the
* resource_list_alloc() function to do most of the work. It calls
* BUS_GET_RESOURCE_LIST() to find a suitable resource list.
*/
struct resource *
bus_generic_rl_alloc_resource(device_t dev, device_t child, int type,
int *rid, u_long start, u_long end, u_long count, u_int flags)
{
struct resource_list * rl = NULL;
if (device_get_parent(child) != dev)
return (BUS_ALLOC_RESOURCE(device_get_parent(dev), child,
type, rid, start, end, count, flags));
rl = BUS_GET_RESOURCE_LIST(dev, child);
if (!rl)
return (NULL);
return (resource_list_alloc(rl, dev, child, type, rid,
start, end, count, flags));
}
/**
* @brief Helper function for implementing BUS_CHILD_PRESENT().
*
* This simple implementation of BUS_CHILD_PRESENT() simply calls the
* BUS_CHILD_PRESENT() method of the parent of @p dev.
*/
int
bus_generic_child_present(device_t dev, device_t child)
{
return (BUS_CHILD_PRESENT(device_get_parent(dev), dev));
}
/*
* Some convenience functions to make it easier for drivers to use the
* resource-management functions. All these really do is hide the
* indirection through the parent's method table, making for slightly
* less-wordy code. In the future, it might make sense for this code
* to maintain some sort of a list of resources allocated by each device.
*/
int
bus_alloc_resources(device_t dev, struct resource_spec *rs,
struct resource **res)
{
int i;
for (i = 0; rs[i].type != -1; i++)
res[i] = NULL;
for (i = 0; rs[i].type != -1; i++) {
res[i] = bus_alloc_resource_any(dev,
rs[i].type, &rs[i].rid, rs[i].flags);
if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) {
bus_release_resources(dev, rs, res);
return (ENXIO);
}
}
return (0);
}
void
bus_release_resources(device_t dev, const struct resource_spec *rs,
struct resource **res)
{
int i;
for (i = 0; rs[i].type != -1; i++)
if (res[i] != NULL) {
bus_release_resource(
dev, rs[i].type, rs[i].rid, res[i]);
res[i] = NULL;
}
}
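/*
 * Hypothetical sketch of the resource_spec pattern supported by the
 * two helpers above: declare the resources a driver needs once, then
 * allocate and release them as a group. The BAR and IRQ choices are
 * illustrative only.
 *
 * @code
 *	static struct resource_spec foo_res_spec[] = {
 *		{ SYS_RES_MEMORY, PCIR_BAR(0), RF_ACTIVE },
 *		{ SYS_RES_IRQ, 0, RF_ACTIVE | RF_SHAREABLE },
 *		{ -1, 0, 0 }
 *	};
 *	static struct resource *foo_res[2];
 *
 *	if (bus_alloc_resources(dev, foo_res_spec, foo_res) != 0)
 *		return (ENXIO);
 *	// ... use foo_res[0] and foo_res[1] ...
 *	bus_release_resources(dev, foo_res_spec, foo_res);
 * @endcode
 */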
/**
* @brief Wrapper function for BUS_ALLOC_RESOURCE().
*
* This function simply calls the BUS_ALLOC_RESOURCE() method of the
* parent of @p dev.
*/
struct resource *
bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
u_long count, u_int flags)
{
if (dev->parent == NULL)
return (NULL);
return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
count, flags));
}
/**
* @brief Wrapper function for BUS_ADJUST_RESOURCE().
*
* This function simply calls the BUS_ADJUST_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_adjust_resource(device_t dev, int type, struct resource *r, u_long start,
u_long end)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_ADJUST_RESOURCE(dev->parent, dev, type, r, start, end));
}
/**
* @brief Wrapper function for BUS_ACTIVATE_RESOURCE().
*
* This function simply calls the BUS_ACTIVATE_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
}
/**
* @brief Wrapper function for BUS_DEACTIVATE_RESOURCE().
*
* This function simply calls the BUS_DEACTIVATE_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
}
/**
* @brief Wrapper function for BUS_RELEASE_RESOURCE().
*
* This function simply calls the BUS_RELEASE_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_release_resource(device_t dev, int type, int rid, struct resource *r)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r));
}
/**
* @brief Wrapper function for BUS_SETUP_INTR().
*
* This function simply calls the BUS_SETUP_INTR() method of the
* parent of @p dev.
*/
int
bus_setup_intr(device_t dev, struct resource *r, int flags,
driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep)
{
int error;
if (dev->parent == NULL)
return (EINVAL);
error = BUS_SETUP_INTR(dev->parent, dev, r, flags, filter, handler,
arg, cookiep);
if (error != 0)
return (error);
if (handler != NULL && !(flags & INTR_MPSAFE))
device_printf(dev, "[GIANT-LOCKED]\n");
return (0);
}
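/*
 * Hypothetical sketch: a driver hooking up an MPSAFE interrupt handler
 * through the wrapper above. The irq resource would normally have been
 * obtained with bus_alloc_resource_any(); foo_intr and sc are purely
 * illustrative.
 *
 * @code
 *	error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC | INTR_MPSAFE,
 *	    NULL, foo_intr, sc, &sc->intrhand);
 *	if (error != 0)
 *		device_printf(dev, "could not set up interrupt\n");
 * @endcode
 */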
/**
* @brief Wrapper function for BUS_TEARDOWN_INTR().
*
* This function simply calls the BUS_TEARDOWN_INTR() method of the
* parent of @p dev.
*/
int
bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie));
}
/**
* @brief Wrapper function for BUS_BIND_INTR().
*
* This function simply calls the BUS_BIND_INTR() method of the
* parent of @p dev.
*/
int
bus_bind_intr(device_t dev, struct resource *r, int cpu)
{
if (dev->parent == NULL)
return (EINVAL);
return (BUS_BIND_INTR(dev->parent, dev, r, cpu));
}
/**
* @brief Wrapper function for BUS_DESCRIBE_INTR().
*
* This function first formats the requested description into a
* temporary buffer and then calls the BUS_DESCRIBE_INTR() method of
* the parent of @p dev.
*/
int
bus_describe_intr(device_t dev, struct resource *irq, void *cookie,
const char *fmt, ...)
{
va_list ap;
char descr[MAXCOMLEN + 1];
if (dev->parent == NULL)
return (EINVAL);
va_start(ap, fmt);
vsnprintf(descr, sizeof(descr), fmt, ap);
va_end(ap);
return (BUS_DESCRIBE_INTR(dev->parent, dev, irq, cookie, descr));
}
/**
* @brief Wrapper function for BUS_SET_RESOURCE().
*
* This function simply calls the BUS_SET_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_set_resource(device_t dev, int type, int rid,
u_long start, u_long count)
{
return (BUS_SET_RESOURCE(device_get_parent(dev), dev, type, rid,
start, count));
}
/**
* @brief Wrapper function for BUS_GET_RESOURCE().
*
* This function simply calls the BUS_GET_RESOURCE() method of the
* parent of @p dev.
*/
int
bus_get_resource(device_t dev, int type, int rid,
u_long *startp, u_long *countp)
{
return (BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
startp, countp));
}
/**
* @brief Wrapper function for BUS_GET_RESOURCE().
*
* This function simply calls the BUS_GET_RESOURCE() method of the
* parent of @p dev and returns the start value.
*/
u_long
bus_get_resource_start(device_t dev, int type, int rid)
{
u_long start, count;
int error;
error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
&start, &count);
if (error)
return (0);
return (start);
}
/**
* @brief Wrapper function for BUS_GET_RESOURCE().
*
* This function simply calls the BUS_GET_RESOURCE() method of the
* parent of @p dev and returns the count value.
*/
u_long
bus_get_resource_count(device_t dev, int type, int rid)
{
u_long start, count;
int error;
error = BUS_GET_RESOURCE(device_get_parent(dev), dev, type, rid,
&start, &count);
if (error)
return (0);
return (count);
}
/**
* @brief Wrapper function for BUS_DELETE_RESOURCE().
*
* This function simply calls the BUS_DELETE_RESOURCE() method of the
* parent of @p dev.
*/
void
bus_delete_resource(device_t dev, int type, int rid)
{
BUS_DELETE_RESOURCE(device_get_parent(dev), dev, type, rid);
}
/**
* @brief Wrapper function for BUS_CHILD_PRESENT().
*
* This function simply calls the BUS_CHILD_PRESENT() method of the
* parent of @p dev.
*/
int
bus_child_present(device_t child)
{
return (BUS_CHILD_PRESENT(device_get_parent(child), child));
}
/**
* @brief Wrapper function for BUS_CHILD_PNPINFO_STR().
*
* This function simply calls the BUS_CHILD_PNPINFO_STR() method of the
* parent of @p dev.
*/
int
bus_child_pnpinfo_str(device_t child, char *buf, size_t buflen)
{
device_t parent;
parent = device_get_parent(child);
if (parent == NULL) {
*buf = '\0';
return (0);
}
return (BUS_CHILD_PNPINFO_STR(parent, child, buf, buflen));
}
/**
* @brief Wrapper function for BUS_CHILD_LOCATION_STR().
*
* This function simply calls the BUS_CHILD_LOCATION_STR() method of the
* parent of @p dev.
*/
int
bus_child_location_str(device_t child, char *buf, size_t buflen)
{
device_t parent;
parent = device_get_parent(child);
if (parent == NULL) {
*buf = '\0';
return (0);
}
return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen));
}
/**
* @brief Wrapper function for BUS_GET_DMA_TAG().
*
* This function simply calls the BUS_GET_DMA_TAG() method of the
* parent of @p dev.
*/
bus_dma_tag_t
bus_get_dma_tag(device_t dev)
{
device_t parent;
parent = device_get_parent(dev);
if (parent == NULL)
return (NULL);
return (BUS_GET_DMA_TAG(parent, dev));
}
/* Resume all devices and then notify userland that we're up again. */
static int
root_resume(device_t dev)
{
int error;
error = bus_generic_resume(dev);
if (error == 0)
devctl_notify("kern", "power", "resume", NULL);
return (error);
}
static int
root_print_child(device_t dev, device_t child)
{
int retval = 0;
retval += bus_print_child_header(dev, child);
retval += printf("\n");
return (retval);
}
static int
root_setup_intr(device_t dev, device_t child, struct resource *irq, int flags,
driver_filter_t *filter, driver_intr_t *intr, void *arg, void **cookiep)
{
/*
* If an interrupt mapping gets to here something bad has happened.
*/
panic("root_setup_intr");
}
/*
 * If we get here, assume that the device is permanent and really is
* present in the system. Removable bus drivers are expected to intercept
* this call long before it gets here. We return -1 so that drivers that
* really care can check vs -1 or some ERRNO returned higher in the food
* chain.
*/
static int
root_child_present(device_t dev, device_t child)
{
return (-1);
}
static kobj_method_t root_methods[] = {
/* Device interface */
KOBJMETHOD(device_shutdown, bus_generic_shutdown),
KOBJMETHOD(device_suspend, bus_generic_suspend),
KOBJMETHOD(device_resume, root_resume),
/* Bus interface */
KOBJMETHOD(bus_print_child, root_print_child),
KOBJMETHOD(bus_read_ivar, bus_generic_read_ivar),
KOBJMETHOD(bus_write_ivar, bus_generic_write_ivar),
KOBJMETHOD(bus_setup_intr, root_setup_intr),
KOBJMETHOD(bus_child_present, root_child_present),
KOBJMETHOD_END
};
static driver_t root_driver = {
"root",
root_methods,
1, /* no softc */
};
device_t root_bus;
devclass_t root_devclass;
static int
root_bus_module_handler(module_t mod, int what, void* arg)
{
switch (what) {
case MOD_LOAD:
TAILQ_INIT(&bus_data_devices);
kobj_class_compile((kobj_class_t) &root_driver);
root_bus = make_device(NULL, "root", 0);
root_bus->desc = "System root bus";
kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
root_bus->driver = &root_driver;
root_bus->state = DS_ATTACHED;
root_devclass = devclass_find_internal("root", NULL, FALSE);
devinit();
return (0);
case MOD_SHUTDOWN:
device_shutdown(root_bus);
return (0);
default:
return (EOPNOTSUPP);
}
return (0);
}
static moduledata_t root_bus_mod = {
"rootbus",
root_bus_module_handler,
NULL
};
DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
/**
* @brief Automatically configure devices
*
* This function begins the autoconfiguration process by calling
* device_probe_and_attach() for each child of the @c root0 device.
*/
void
root_bus_configure(void)
{
PDEBUG(("."));
/* Eventually this will be split up, but this is sufficient for now. */
bus_set_pass(BUS_PASS_DEFAULT);
}
/**
* @brief Module handler for registering device drivers
*
* This module handler is used to automatically register device
* drivers when modules are loaded. If @p what is MOD_LOAD, it calls
* devclass_add_driver() for the driver described by the
* driver_module_data structure pointed to by @p arg
*/
int
driver_module_handler(module_t mod, int what, void *arg)
{
struct driver_module_data *dmd;
devclass_t bus_devclass;
kobj_class_t driver;
int error, pass;
dmd = (struct driver_module_data *)arg;
bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE);
error = 0;
switch (what) {
case MOD_LOAD:
if (dmd->dmd_chainevh)
error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
pass = dmd->dmd_pass;
driver = dmd->dmd_driver;
PDEBUG(("Loading module: driver %s on bus %s (pass %d)",
DRIVERNAME(driver), dmd->dmd_busname, pass));
error = devclass_add_driver(bus_devclass, driver, pass,
dmd->dmd_devclass);
break;
case MOD_UNLOAD:
PDEBUG(("Unloading module: driver %s from bus %s",
DRIVERNAME(dmd->dmd_driver),
dmd->dmd_busname));
error = devclass_delete_driver(bus_devclass,
dmd->dmd_driver);
if (!error && dmd->dmd_chainevh)
error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
break;
case MOD_QUIESCE:
PDEBUG(("Quiesce module: driver %s from bus %s",
DRIVERNAME(dmd->dmd_driver),
dmd->dmd_busname));
error = devclass_quiesce_driver(bus_devclass,
dmd->dmd_driver);
if (!error && dmd->dmd_chainevh)
error = dmd->dmd_chainevh(mod,what,dmd->dmd_chainarg);
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
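/*
 * Drivers normally reach this handler through the DRIVER_MODULE()
 * macro rather than calling it directly. A hypothetical registration
 * of a driver "foo" on the pci bus would look like:
 *
 * @code
 *	static devclass_t foo_devclass;
 *
 *	DRIVER_MODULE(foo, pci, foo_driver, foo_devclass, NULL, NULL);
 * @endcode
 */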
/**
* @brief Enumerate all hinted devices for this bus.
*
* Walks through the hints for this bus and calls the bus_hinted_child
 * routine for each one it finds. It searches first for the specific
 * bus that's being probed for hinted children (e.g. isa0), and then for
 * generic children (e.g. isa).
*
* @param dev bus device to enumerate
*/
void
bus_enumerate_hinted_children(device_t bus)
{
int i;
const char *dname, *busname;
int dunit;
/*
* enumerate all devices on the specific bus
*/
busname = device_get_nameunit(bus);
i = 0;
while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
BUS_HINTED_CHILD(bus, dname, dunit);
/*
* and all the generic ones.
*/
busname = device_get_name(bus);
i = 0;
while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
BUS_HINTED_CHILD(bus, dname, dunit);
}
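/*
 * The hints consumed above come from device.hints(5). A hypothetical
 * entry wiring a child "foo0" to isa0 would look like:
 *
 * @code
 *	hint.foo.0.at="isa0"
 *	hint.foo.0.port="0x300"
 * @endcode
 */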
#ifdef BUS_DEBUG
/* the _short versions avoid iteration by not calling anything that prints
* more than oneliners. I love oneliners.
*/
static void
print_device_short(device_t dev, int indent)
{
if (!dev)
return;
indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%s%s,%sivars,%ssoftc,busy=%d\n",
dev->unit, dev->desc,
(dev->parent? "":"no "),
(TAILQ_EMPTY(&dev->children)? "no ":""),
(dev->flags&DF_ENABLED? "enabled,":"disabled,"),
(dev->flags&DF_FIXEDCLASS? "fixed,":""),
(dev->flags&DF_WILDCARD? "wildcard,":""),
(dev->flags&DF_DESCMALLOCED? "descmalloced,":""),
(dev->flags&DF_REBID? "rebiddable,":""),
(dev->ivars? "":"no "),
(dev->softc? "":"no "),
dev->busy));
}
static void
print_device(device_t dev, int indent)
{
if (!dev)
return;
print_device_short(dev, indent);
indentprintf(("Parent:\n"));
print_device_short(dev->parent, indent+1);
indentprintf(("Driver:\n"));
print_driver_short(dev->driver, indent+1);
indentprintf(("Devclass:\n"));
print_devclass_short(dev->devclass, indent+1);
}
void
print_device_tree_short(device_t dev, int indent)
/* print the device and all its children (indented) */
{
device_t child;
if (!dev)
return;
print_device_short(dev, indent);
TAILQ_FOREACH(child, &dev->children, link) {
print_device_tree_short(child, indent+1);
}
}
void
print_device_tree(device_t dev, int indent)
/* print the device and all its children (indented) */
{
device_t child;
if (!dev)
return;
print_device(dev, indent);
TAILQ_FOREACH(child, &dev->children, link) {
print_device_tree(child, indent+1);
}
}
static void
print_driver_short(driver_t *driver, int indent)
{
if (!driver)
return;
indentprintf(("driver %s: softc size = %zd\n",
driver->name, driver->size));
}
static void
print_driver(driver_t *driver, int indent)
{
if (!driver)
return;
print_driver_short(driver, indent);
}
static void
print_driver_list(driver_list_t drivers, int indent)
{
driverlink_t driver;
TAILQ_FOREACH(driver, &drivers, link) {
print_driver(driver->driver, indent);
}
}
static void
print_devclass_short(devclass_t dc, int indent)
{
if ( !dc )
return;
indentprintf(("devclass %s: max units = %d\n", dc->name, dc->maxunit));
}
static void
print_devclass(devclass_t dc, int indent)
{
int i;
if ( !dc )
return;
print_devclass_short(dc, indent);
indentprintf(("Drivers:\n"));
print_driver_list(dc->drivers, indent+1);
indentprintf(("Devices:\n"));
for (i = 0; i < dc->maxunit; i++)
if (dc->devices[i])
print_device(dc->devices[i], indent+1);
}
void
print_devclass_list_short(void)
{
devclass_t dc;
printf("Short listing of devclasses, drivers & devices:\n");
TAILQ_FOREACH(dc, &devclasses, link) {
print_devclass_short(dc, 0);
}
}
void
print_devclass_list(void)
{
devclass_t dc;
printf("Full listing of devclasses, drivers & devices:\n");
TAILQ_FOREACH(dc, &devclasses, link) {
print_devclass(dc, 0);
}
}
#endif
/*
* User-space access to the device tree.
*
* We implement a small set of nodes:
*
* hw.bus Single integer read method to obtain the
* current generation count.
* hw.bus.devices Reads the entire device tree in flat space.
* hw.bus.rman Resource manager interface
*
* We might like to add the ability to scan devclasses and/or drivers to
* determine what else is currently loaded/available.
*/
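/*
 * Illustrative sketch of how a userland consumer is expected to use these
 * nodes (devinfo(8)/libdevinfo work roughly this way): read the bus info
 * node to obtain the current generation count, then iterate hw.bus.devices
 * with {generation, index} name pairs until ENOENT is returned; if the
 * generation changes mid-scan, the snapshot is stale and the walk should
 * restart from the top.
 */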
static int
sysctl_bus(SYSCTL_HANDLER_ARGS)
{
struct u_businfo ubus;
ubus.ub_version = BUS_USER_VERSION;
ubus.ub_generation = bus_data_generation;
return (SYSCTL_OUT(req, &ubus, sizeof(ubus)));
}
SYSCTL_NODE(_hw_bus, OID_AUTO, info, CTLFLAG_RW, sysctl_bus,
"bus-related data");
static int
sysctl_devices(SYSCTL_HANDLER_ARGS)
{
int *name = (int *)arg1;
u_int namelen = arg2;
int index;
struct device *dev;
struct u_device udev; /* XXX this is a bit big */
int error;
if (namelen != 2)
return (EINVAL);
if (bus_data_generation_check(name[0]))
return (EINVAL);
index = name[1];
/*
* Scan the list of devices, looking for the requested index.
*/
TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
if (index-- == 0)
break;
}
if (dev == NULL)
return (ENOENT);
/*
* Populate the return array.
*/
bzero(&udev, sizeof(udev));
udev.dv_handle = (uintptr_t)dev;
udev.dv_parent = (uintptr_t)dev->parent;
if (dev->nameunit != NULL)
strlcpy(udev.dv_name, dev->nameunit, sizeof(udev.dv_name));
if (dev->desc != NULL)
strlcpy(udev.dv_desc, dev->desc, sizeof(udev.dv_desc));
if (dev->driver != NULL && dev->driver->name != NULL)
strlcpy(udev.dv_drivername, dev->driver->name,
sizeof(udev.dv_drivername));
bus_child_pnpinfo_str(dev, udev.dv_pnpinfo, sizeof(udev.dv_pnpinfo));
bus_child_location_str(dev, udev.dv_location, sizeof(udev.dv_location));
udev.dv_devflags = dev->devflags;
udev.dv_flags = dev->flags;
udev.dv_state = dev->state;
error = SYSCTL_OUT(req, &udev, sizeof(udev));
return (error);
}
SYSCTL_NODE(_hw_bus, OID_AUTO, devices, CTLFLAG_RD, sysctl_devices,
"system device tree");
int
bus_data_generation_check(int generation)
{
if (generation != bus_data_generation)
return (1);
/* XXX generate optimised lists here? */
return (0);
}
void
bus_data_generation_update(void)
{
bus_data_generation++;
}
int
bus_free_resource(device_t dev, int type, struct resource *r)
{
if (r == NULL)
return (0);
return (bus_release_resource(dev, type, rman_get_rid(r), r));
}
Index: head/sys/kern/subr_prof.c
===================================================================
--- head/sys/kern/subr_prof.c (revision 225616)
+++ head/sys/kern/subr_prof.c (revision 225617)
@@ -1,589 +1,589 @@
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <machine/cpu.h>
#ifdef GPROF
#include <sys/malloc.h>
#include <sys/gmon.h>
#undef MCOUNT
static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
static void kmstartup(void *);
SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL);
struct gmonparam _gmonparam = { GMON_PROF_OFF };
#ifdef GUPROF
void
nullfunc_loop_profiled()
{
int i;
for (i = 0; i < CALIB_SCALE; i++)
nullfunc_profiled();
}
#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */
void
nullfunc_profiled()
{
}
#endif /* GUPROF */
/*
* Update the histograms to support extending the text region arbitrarily.
* This is done slightly naively (no sparse regions), so it will waste small
* amounts of memory, but overall it works well enough to allow profiling
* of KLDs.
*/
void
kmupetext(uintfptr_t nhighpc)
{
struct gmonparam np; /* slightly large */
struct gmonparam *p = &_gmonparam;
char *cp;
GIANT_REQUIRED;
bcopy(p, &np, sizeof(*p));
np.highpc = ROUNDUP(nhighpc, HISTFRACTION * sizeof(HISTCOUNTER));
if (np.highpc <= p->highpc)
return;
np.textsize = np.highpc - p->lowpc;
np.kcountsize = np.textsize / HISTFRACTION;
np.hashfraction = HASHFRACTION;
np.fromssize = np.textsize / HASHFRACTION;
np.tolimit = np.textsize * ARCDENSITY / 100;
if (np.tolimit < MINARCS)
np.tolimit = MINARCS;
else if (np.tolimit > MAXARCS)
np.tolimit = MAXARCS;
np.tossize = np.tolimit * sizeof(struct tostruct);
cp = malloc(np.kcountsize + np.fromssize + np.tossize,
M_GPROF, M_WAITOK);
/*
* Check for something else extending highpc while we slept.
*/
if (np.highpc <= p->highpc) {
free(cp, M_GPROF);
return;
}
np.tos = (struct tostruct *)cp;
cp += np.tossize;
np.kcount = (HISTCOUNTER *)cp;
cp += np.kcountsize;
np.froms = (u_short *)cp;
#ifdef GUPROF
/* Reinitialize pointers to overhead counters. */
np.cputime_count = &KCOUNT(&np, PC_TO_I(&np, cputime));
np.mcount_count = &KCOUNT(&np, PC_TO_I(&np, mcount));
np.mexitcount_count = &KCOUNT(&np, PC_TO_I(&np, mexitcount));
#endif
critical_enter();
bcopy(p->tos, np.tos, p->tossize);
bzero((char *)np.tos + p->tossize, np.tossize - p->tossize);
bcopy(p->kcount, np.kcount, p->kcountsize);
bzero((char *)np.kcount + p->kcountsize, np.kcountsize -
p->kcountsize);
bcopy(p->froms, np.froms, p->fromssize);
bzero((char *)np.froms + p->fromssize, np.fromssize - p->fromssize);
cp = (char *)p->tos;
bcopy(&np, p, sizeof(*p));
critical_exit();
free(cp, M_GPROF);
}
static void
kmstartup(dummy)
void *dummy;
{
char *cp;
struct gmonparam *p = &_gmonparam;
#ifdef GUPROF
int cputime_overhead;
int empty_loop_time;
int i;
int mcount_overhead;
int mexitcount_overhead;
int nullfunc_loop_overhead;
int nullfunc_loop_profiled_time;
uintfptr_t tmp_addr;
#endif
/*
* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER));
p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
p->textsize = p->highpc - p->lowpc;
printf("Profiling kernel, textsize=%lu [%jx..%jx]\n",
p->textsize, (uintmax_t)p->lowpc, (uintmax_t)p->highpc);
p->kcountsize = p->textsize / HISTFRACTION;
p->hashfraction = HASHFRACTION;
p->fromssize = p->textsize / HASHFRACTION;
p->tolimit = p->textsize * ARCDENSITY / 100;
if (p->tolimit < MINARCS)
p->tolimit = MINARCS;
else if (p->tolimit > MAXARCS)
p->tolimit = MAXARCS;
p->tossize = p->tolimit * sizeof(struct tostruct);
cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize,
M_GPROF, M_WAITOK | M_ZERO);
p->tos = (struct tostruct *)cp;
cp += p->tossize;
p->kcount = (HISTCOUNTER *)cp;
cp += p->kcountsize;
p->froms = (u_short *)cp;
p->histcounter_type = FUNCTION_ALIGNMENT / HISTFRACTION * NBBY;
#ifdef GUPROF
/* Signed counters. */
p->histcounter_type = -p->histcounter_type;
/* Initialize pointers to overhead counters. */
p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime));
p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount));
p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount));
/*
* Disable interrupts to avoid interference while we calibrate
* things.
*/
critical_enter();
/*
* Determine overheads.
* XXX this needs to be repeated for each useful timer/counter.
*/
cputime_overhead = 0;
startguprof(p);
for (i = 0; i < CALIB_SCALE; i++)
cputime_overhead += cputime();
empty_loop();
startguprof(p);
empty_loop();
empty_loop_time = cputime();
nullfunc_loop_profiled();
/*
* Start profiling. There won't be any normal function calls since
* interrupts are disabled, but we will call the profiling routines
* directly to determine their overheads.
*/
p->state = GMON_PROF_HIRES;
startguprof(p);
nullfunc_loop_profiled();
startguprof(p);
for (i = 0; i < CALIB_SCALE; i++)
- MCOUNT_OVERHEAD(profil);
- mcount_overhead = KCOUNT(p, PC_TO_I(p, profil));
+ MCOUNT_OVERHEAD(sys_profil);
+ mcount_overhead = KCOUNT(p, PC_TO_I(p, sys_profil));
startguprof(p);
for (i = 0; i < CALIB_SCALE; i++)
MEXITCOUNT_OVERHEAD();
MEXITCOUNT_OVERHEAD_GETLABEL(tmp_addr);
mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr));
p->state = GMON_PROF_OFF;
stopguprof(p);
critical_exit();
nullfunc_loop_profiled_time = 0;
for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled;
tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end;
tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER))
nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr));
#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE)
#define c2n(count, freq) ((int)((count) * 1000000000LL / freq))
printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n",
CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)),
CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)),
CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)),
CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)),
CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate)));
cputime_overhead -= empty_loop_time;
mcount_overhead -= empty_loop_time;
mexitcount_overhead -= empty_loop_time;
/*-
* Profiling overheads are determined by the times between the
* following events:
* MC1: mcount() is called
* MC2: cputime() (called from mcount()) latches the timer
* MC3: mcount() completes
* ME1: mexitcount() is called
* ME2: cputime() (called from mexitcount()) latches the timer
* ME3: mexitcount() completes.
* The times between the events vary slightly depending on instruction
* combination and cache misses, etc. Attempt to determine the
* minimum times. These can be subtracted from the profiling times
* without much risk of reducing the profiling times below what they
* would be when profiling is not configured. Abbreviate:
* ab = minimum time between MC1 and MC3
* a = minimum time between MC1 and MC2
* b = minimum time between MC2 and MC3
* cd = minimum time between ME1 and ME3
* c = minimum time between ME1 and ME2
* d = minimum time between ME2 and ME3.
* These satisfy the relations:
* ab <= mcount_overhead (just measured)
* a + b <= ab
* cd <= mexitcount_overhead (just measured)
* c + d <= cd
* a + d <= nullfunc_loop_profiled_time (just measured)
* a >= 0, b >= 0, c >= 0, d >= 0.
* Assume that ab and cd are equal to the minimums.
*/
p->cputime_overhead = CALIB_DOSCALE(cputime_overhead);
p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead);
p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead
- cputime_overhead);
nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time;
p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead
- nullfunc_loop_overhead)
/ 4);
p->mexitcount_pre_overhead = p->mexitcount_overhead
+ p->cputime_overhead
- p->mexitcount_post_overhead;
p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead)
- p->mexitcount_post_overhead;
p->mcount_post_overhead = p->mcount_overhead
+ p->cputime_overhead
- p->mcount_pre_overhead;
printf(
"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n",
c2n(p->cputime_overhead, p->profrate),
c2n(p->mcount_overhead, p->profrate),
c2n(p->mcount_pre_overhead, p->profrate),
c2n(p->mcount_post_overhead, p->profrate),
c2n(p->cputime_overhead, p->profrate),
c2n(p->mexitcount_overhead, p->profrate),
c2n(p->mexitcount_pre_overhead, p->profrate),
c2n(p->mexitcount_post_overhead, p->profrate));
printf(
"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n",
p->cputime_overhead, p->mcount_overhead,
p->mcount_pre_overhead, p->mcount_post_overhead,
p->cputime_overhead, p->mexitcount_overhead,
p->mexitcount_pre_overhead, p->mexitcount_post_overhead);
#endif /* GUPROF */
}
/*
* Return kernel profiling information.
*/
static int
sysctl_kern_prof(SYSCTL_HANDLER_ARGS)
{
int *name = (int *) arg1;
u_int namelen = arg2;
struct gmonparam *gp = &_gmonparam;
int error;
int state;
/* all sysctl names at this level are terminal */
if (namelen != 1)
return (ENOTDIR); /* overloaded */
switch (name[0]) {
case GPROF_STATE:
state = gp->state;
error = sysctl_handle_int(oidp, &state, 0, req);
if (error)
return (error);
if (!req->newptr)
return (0);
if (state == GMON_PROF_OFF) {
gp->state = state;
PROC_LOCK(&proc0);
stopprofclock(&proc0);
PROC_UNLOCK(&proc0);
stopguprof(gp);
} else if (state == GMON_PROF_ON) {
gp->state = GMON_PROF_OFF;
stopguprof(gp);
gp->profrate = profhz;
PROC_LOCK(&proc0);
startprofclock(&proc0);
PROC_UNLOCK(&proc0);
gp->state = state;
#ifdef GUPROF
} else if (state == GMON_PROF_HIRES) {
gp->state = GMON_PROF_OFF;
PROC_LOCK(&proc0);
stopprofclock(&proc0);
PROC_UNLOCK(&proc0);
startguprof(gp);
gp->state = state;
#endif
} else if (state != gp->state)
return (EINVAL);
return (0);
case GPROF_COUNT:
return (sysctl_handle_opaque(oidp,
gp->kcount, gp->kcountsize, req));
case GPROF_FROMS:
return (sysctl_handle_opaque(oidp,
gp->froms, gp->fromssize, req));
case GPROF_TOS:
return (sysctl_handle_opaque(oidp,
gp->tos, gp->tossize, req));
case GPROF_GMONPARAM:
return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, "");
#endif /* GPROF */
/*
* Profiling system call.
*
* The scale factor is a fixed point number with 16 bits of fraction, so that
* 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
*/
#ifndef _SYS_SYSPROTO_H_
struct profil_args {
caddr_t samples;
size_t size;
size_t offset;
u_int scale;
};
#endif
/* ARGSUSED */
int
-profil(struct thread *td, struct profil_args *uap)
+sys_profil(struct thread *td, struct profil_args *uap)
{
struct uprof *upp;
struct proc *p;
if (uap->scale > (1 << 16))
return (EINVAL);
p = td->td_proc;
if (uap->scale == 0) {
PROC_LOCK(p);
stopprofclock(p);
PROC_UNLOCK(p);
return (0);
}
PROC_LOCK(p);
upp = &td->td_proc->p_stats->p_prof;
PROC_SLOCK(p);
upp->pr_off = uap->offset;
upp->pr_scale = uap->scale;
upp->pr_base = uap->samples;
upp->pr_size = uap->size;
PROC_SUNLOCK(p);
startprofclock(p);
PROC_UNLOCK(p);
return (0);
}
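/*
 * Minimal userland sketch of driving this system call (illustrative only;
 * see profil(2) for the authoritative interface).  With a scale of 0x10000
 * (1.0) the sample buffer shadows the text region byte for byte, so each
 * 16-bit counter accumulates ticks for two bytes of text.  TEXTSIZE,
 * text_start and run_workload() are hypothetical:
 *
 *	unsigned short samples[TEXTSIZE / 2];
 *
 *	memset(samples, 0, sizeof(samples));
 *	profil((char *)samples, sizeof(samples), text_start, 0x10000);
 *	run_workload();
 *	profil(NULL, 0, 0, 0);		(scale 0 turns profiling back off)
 */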
/*
* Scale is a fixed-point number with the binary point 16 bits
* into the value, and is <= 1.0. pc is at most 32 bits, so the
* intermediate result is at most 48 bits.
*/
#define PC_TO_INDEX(pc, prof) \
((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
(u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
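/*
 * Worked example of PC_TO_INDEX() (illustrative values): with pr_off =
 * 0x1000, pr_scale = 0x8000 (0.5) and pc = 0x1234, the intermediate
 * product is 0x234 * 0x8000 = 0x11a0000; shifting right by 16 gives 0x11a,
 * which is already even, so the tick is added to the 16-bit counter at
 * byte offset 0x11a from pr_base.
 */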
/*
* Collect user-level profiling statistics; called on a profiling tick,
* when a process is running in user-mode. This routine may be called
* from an interrupt context. We try to update the user profiling buffers
* cheaply with fuswintr() and suswintr(). If that fails, we revert to
* an AST that will vector us to trap() with a context in which copyin
* and copyout will work. Trap will then call addupc_task().
*
* Note that we may (rarely) not get around to the AST soon enough, and
* lose profile ticks when the next tick overwrites this one, but in this
* case the system is overloaded and the profile is probably already
* inaccurate.
*/
void
addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks)
{
struct uprof *prof;
caddr_t addr;
u_int i;
int v;
if (ticks == 0)
return;
prof = &td->td_proc->p_stats->p_prof;
PROC_SLOCK(td->td_proc);
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
PROC_SUNLOCK(td->td_proc);
return; /* out of range; ignore */
}
addr = prof->pr_base + i;
PROC_SUNLOCK(td->td_proc);
if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
td->td_profil_addr = pc;
td->td_profil_ticks = ticks;
td->td_pflags |= TDP_OWEUPC;
thread_lock(td);
td->td_flags |= TDF_ASTPENDING;
thread_unlock(td);
}
}
/*
* Much like before, but we can afford to take faults here. If the
* update fails, we simply turn off profiling.
*/
void
addupc_task(struct thread *td, uintfptr_t pc, u_int ticks)
{
struct proc *p = td->td_proc;
struct uprof *prof;
caddr_t addr;
u_int i;
u_short v;
int stop = 0;
if (ticks == 0)
return;
PROC_LOCK(p);
if (!(p->p_flag & P_PROFIL)) {
PROC_UNLOCK(p);
return;
}
p->p_profthreads++;
prof = &p->p_stats->p_prof;
PROC_SLOCK(p);
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
PROC_SUNLOCK(p);
goto out;
}
addr = prof->pr_base + i;
PROC_SUNLOCK(p);
PROC_UNLOCK(p);
if (copyin(addr, &v, sizeof(v)) == 0) {
v += ticks;
if (copyout(&v, addr, sizeof(v)) == 0) {
PROC_LOCK(p);
goto out;
}
}
stop = 1;
PROC_LOCK(p);
out:
if (--p->p_profthreads == 0) {
if (p->p_flag & P_STOPPROF) {
wakeup(&p->p_profthreads);
stop = 0;
}
}
if (stop)
stopprofclock(p);
PROC_UNLOCK(p);
}
#if (defined(__amd64__) || defined(__i386__)) && \
defined(__GNUCLIKE_CTOR_SECTION_HANDLING)
/*
* Support for "--test-coverage --profile-arcs" in GCC.
*
* We need to call all the functions in the .ctor section, in order
* to get all the counter-arrays strung into a list.
*
* XXX: the .ctors call __bb_init_func which is located over in
* XXX: i386/i386/support.s for historical reasons. There is probably
* XXX: no reason for that to be assembler anymore, but doing it right
* XXX: in MI C code requires one to reverse-engineer the type-selection
* XXX: inside GCC. Have fun.
*
* XXX: Worrisome perspective: Calling the .ctors may make C++ in the
* XXX: kernel feasible. Don't.
*/
typedef void (*ctor_t)(void);
extern ctor_t _start_ctors, _stop_ctors;
static void
tcov_init(void *foo __unused)
{
ctor_t *p, q;
for (p = &_start_ctors; p < &_stop_ctors; p++) {
q = *p;
q();
}
}
SYSINIT(tcov_init, SI_SUB_KPROF, SI_ORDER_SECOND, tcov_init, NULL);
/*
* GCC contains magic to recognize calls to, for instance, execve() and
* inserts calls to this function to preserve the profile counters.
* XXX: Put zinging punchline here.
*/
void __bb_fork_func(void);
void
__bb_fork_func(void)
{
}
#endif
Index: head/sys/kern/subr_trap.c
===================================================================
--- head/sys/kern/subr_trap.c (revision 225616)
+++ head/sys/kern/subr_trap.c (revision 225617)
@@ -1,265 +1,265 @@
/*-
* Copyright (C) 1994, David Greenman
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
* Copyright (c) 2007 The FreeBSD Foundation
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Portions of this software were developed by A. Joseph Koshy under
* sponsorship from the FreeBSD Foundation and Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_ktrace.h"
#include "opt_kdtrace.h"
#include "opt_sched.h"
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/pmckern.h>
#include <sys/proc.h>
#include <sys/ktr.h>
#include <sys/pioctl.h>
#include <sys/ptrace.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <security/audit/audit.h>
#include <machine/cpu.h>
#ifdef VIMAGE
#include <net/vnet.h>
#endif
#ifdef XEN
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#endif
#include <security/mac/mac_framework.h>
/*
* Define the code needed before returning to user mode, for trap and
* syscall.
*/
void
userret(struct thread *td, struct trapframe *frame)
{
struct proc *p = td->td_proc;
CTR3(KTR_SYSC, "userret: thread %p (pid %d, %s)", td, p->p_pid,
td->td_name);
#if 0
#ifdef DIAGNOSTIC
/* Check that we called signotify() enough. */
PROC_LOCK(p);
thread_lock(td);
if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 ||
(td->td_flags & TDF_ASTPENDING) == 0))
printf("failed to set signal flags properly for ast()\n");
thread_unlock(td);
PROC_UNLOCK(p);
#endif
#endif
#ifdef KTRACE
KTRUSERRET(td);
#endif
/*
* If this thread tickled GEOM, we need to wait for the giggling to
* stop before we return to userland
*/
if (td->td_pflags & TDP_GEOM)
g_waitidle();
/*
* Charge system time if profiling.
*/
if (p->p_flag & P_PROFIL)
addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
/*
* Let the scheduler adjust our priority etc.
*/
sched_userret(td);
KASSERT(td->td_locks == 0,
("userret: Returning with %d locks held.", td->td_locks));
#ifdef VIMAGE
/* Unfortunately td_vnet_lpush needs VNET_DEBUG. */
VNET_ASSERT(curvnet == NULL,
("%s: Returning on td %p (pid %d, %s) with vnet %p set in %s",
__func__, td, p->p_pid, td->td_name, curvnet,
(td->td_vnet_lpush != NULL) ? td->td_vnet_lpush : "N/A"));
#endif
#ifdef XEN
PT_UPDATES_FLUSH();
#endif
}
/*
* Process an asynchronous software trap.
* This is relatively easy.
* This function will return with preemption disabled.
*/
void
ast(struct trapframe *framep)
{
struct thread *td;
struct proc *p;
int flags;
int sig;
td = curthread;
p = td->td_proc;
CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
p->p_comm);
KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
mtx_assert(&Giant, MA_NOTOWNED);
THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
td->td_frame = framep;
td->td_pticks = 0;
/*
* This updates td_flags for the checks below in one
* "atomic" operation together with turning off the astpending flag.
* If another AST is triggered while we are handling the
* AST's saved in flags, the astpending flag will be set and
* ast() will be called again.
*/
thread_lock(td);
flags = td->td_flags;
td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK | TDF_NEEDSUSPCHK |
TDF_NEEDRESCHED | TDF_ALRMPEND | TDF_PROFPEND | TDF_MACPEND);
thread_unlock(td);
PCPU_INC(cnt.v_trap);
if (td->td_ucred != p->p_ucred)
cred_update_thread(td);
if (td->td_pflags & TDP_OWEUPC && p->p_flag & P_PROFIL) {
addupc_task(td, td->td_profil_addr, td->td_profil_ticks);
td->td_profil_ticks = 0;
td->td_pflags &= ~TDP_OWEUPC;
}
if (flags & TDF_ALRMPEND) {
PROC_LOCK(p);
- psignal(p, SIGVTALRM);
+ kern_psignal(p, SIGVTALRM);
PROC_UNLOCK(p);
}
if (flags & TDF_PROFPEND) {
PROC_LOCK(p);
- psignal(p, SIGPROF);
+ kern_psignal(p, SIGPROF);
PROC_UNLOCK(p);
}
#ifdef MAC
if (flags & TDF_MACPEND)
mac_thread_userret(td);
#endif
if (flags & TDF_NEEDRESCHED) {
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(1, 1);
#endif
thread_lock(td);
sched_prio(td, td->td_user_pri);
mi_switch(SW_INVOL | SWT_NEEDRESCHED, NULL);
thread_unlock(td);
#ifdef KTRACE
if (KTRPOINT(td, KTR_CSW))
ktrcsw(0, 1);
#endif
}
/*
* Check for signals. Unlocked reads of p_pendingcnt or
* p_siglist might cause a process-directed signal to be handled
* later.
*/
if (flags & TDF_NEEDSIGCHK || p->p_pendingcnt > 0 ||
!SIGISEMPTY(p->p_siglist)) {
PROC_LOCK(p);
mtx_lock(&p->p_sigacts->ps_mtx);
while ((sig = cursig(td, SIG_STOP_ALLOWED)) != 0)
postsig(sig);
mtx_unlock(&p->p_sigacts->ps_mtx);
PROC_UNLOCK(p);
}
/*
* We need to check to see if we have to exit or wait due to a
* single threading requirement or some other STOP condition.
*/
if (flags & TDF_NEEDSUSPCHK) {
PROC_LOCK(p);
thread_suspend_check(0);
PROC_UNLOCK(p);
}
if (td->td_pflags & TDP_OLDMASK) {
td->td_pflags &= ~TDP_OLDMASK;
kern_sigprocmask(td, SIG_SETMASK, &td->td_oldsigmask, NULL, 0);
}
userret(td, framep);
mtx_assert(&Giant, MA_NOTOWNED);
}
const char *
syscallname(struct proc *p, u_int code)
{
static const char unknown[] = "unknown";
struct sysentvec *sv;
sv = p->p_sysent;
if (sv->sv_syscallnames == NULL || code >= sv->sv_size)
return (unknown);
return (sv->sv_syscallnames[code]);
}
Index: head/sys/kern/sys_capability.c
===================================================================
--- head/sys/kern/sys_capability.c (revision 225616)
+++ head/sys/kern/sys_capability.c (revision 225617)
@@ -1,553 +1,553 @@
/*-
* Copyright (c) 2008-2011 Robert N. M. Watson
* Copyright (c) 2010-2011 Jonathan Anderson
* All rights reserved.
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* FreeBSD kernel capability facility.
*
* Two kernel features are implemented here: capability mode, a sandboxed mode
* of execution for processes, and capabilities, a refinement on file
* descriptors that allows fine-grained control over operations on the file
* descriptor. Collectively, these allow processes to run in the style of a
* historic "capability system" in which they can use only resources
* explicitly delegated to them. This model is enforced by restricting access
* to global namespaces in capability mode.
*
* Capabilities wrap other file descriptor types, binding them to a constant
* rights mask set when the capability is created. New capabilities may be
* derived from existing capabilities, but only if they have the same or a
* strict subset of the rights on the original capability.
*
* System calls permitted in capability mode are defined in capabilities.conf;
* calls must be carefully audited for safety to ensure that they don't allow
* escape from a sandbox. Some calls permit only a subset of operations in
* capability mode -- for example, shm_open(2) is limited to creating
* anonymous, rather than named, POSIX shared memory objects.
*/
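/*
 * Illustrative userland sketch (not part of this change; prototypes follow
 * cap_enter(2) and cap_new(2) as they exist at this point, error handling
 * omitted, path hypothetical): delegate a read-only capability for one
 * file, then enter the sandbox, after which global-namespace lookups fail
 * with ECAPMODE.
 *
 *	int fd, rofd;
 *
 *	fd = open("/var/db/data", O_RDONLY);
 *	rofd = cap_new(fd, CAP_READ | CAP_SEEK);
 *	close(fd);
 *	cap_enter();
 *	read(rofd, buf, sizeof(buf));		(allowed: CAP_READ)
 *	open("/etc/passwd", O_RDONLY);		(fails: ECAPMODE)
 */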
#include "opt_capsicum.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/ucred.h>
#include <security/audit/audit.h>
#include <vm/uma.h>
#include <vm/vm.h>
#ifdef CAPABILITY_MODE
FEATURE(security_capability_mode, "Capsicum Capability Mode");
/*
* System call to enter capability mode for the process.
*/
int
-cap_enter(struct thread *td, struct cap_enter_args *uap)
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
{
struct ucred *newcred, *oldcred;
struct proc *p;
if (IN_CAPABILITY_MODE(td))
return (0);
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = p->p_ucred;
crcopy(newcred, oldcred);
newcred->cr_flags |= CRED_FLAG_CAPMODE;
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
return (0);
}
/*
* System call to query whether the process is in capability mode.
*/
int
-cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
{
u_int i;
i = (IN_CAPABILITY_MODE(td)) ? 1 : 0;
return (copyout(&i, uap->modep, sizeof(i)));
}
#else /* !CAPABILITY_MODE */
int
-cap_enter(struct thread *td, struct cap_enter_args *uap)
+sys_cap_enter(struct thread *td, struct cap_enter_args *uap)
{
return (ENOSYS);
}
int
-cap_getmode(struct thread *td, struct cap_getmode_args *uap)
+sys_cap_getmode(struct thread *td, struct cap_getmode_args *uap)
{
return (ENOSYS);
}
#endif /* CAPABILITY_MODE */
#ifdef CAPABILITIES
FEATURE(security_capabilities, "Capsicum Capabilities");
/*
* struct capability describes a capability, and is hung off of its struct
* file f_data field. cap_file and cap_rights are static once hooked up, as
* neither the object it references nor the rights it encapsulates are
* permitted to change.
*/
struct capability {
struct file *cap_object; /* Underlying object's file. */
struct file *cap_file; /* Back-pointer to cap's file. */
cap_rights_t cap_rights; /* Mask of rights on object. */
};
/*
* Capabilities have a fileops vector, but in practice none should ever be
* called except for fo_close, as the capability will normally not be
* returned during a file descriptor lookup in the system call code.
*/
static fo_rdwr_t capability_read;
static fo_rdwr_t capability_write;
static fo_truncate_t capability_truncate;
static fo_ioctl_t capability_ioctl;
static fo_poll_t capability_poll;
static fo_kqfilter_t capability_kqfilter;
static fo_stat_t capability_stat;
static fo_close_t capability_close;
static fo_chmod_t capability_chmod;
static fo_chown_t capability_chown;
static struct fileops capability_ops = {
.fo_read = capability_read,
.fo_write = capability_write,
.fo_truncate = capability_truncate,
.fo_ioctl = capability_ioctl,
.fo_poll = capability_poll,
.fo_kqfilter = capability_kqfilter,
.fo_stat = capability_stat,
.fo_close = capability_close,
.fo_chmod = capability_chmod,
.fo_chown = capability_chown,
.fo_flags = DFLAG_PASSABLE,
};
static struct fileops capability_ops_unpassable = {
.fo_read = capability_read,
.fo_write = capability_write,
.fo_truncate = capability_truncate,
.fo_ioctl = capability_ioctl,
.fo_poll = capability_poll,
.fo_kqfilter = capability_kqfilter,
.fo_stat = capability_stat,
.fo_close = capability_close,
.fo_chmod = capability_chmod,
.fo_chown = capability_chown,
.fo_flags = 0,
};
static uma_zone_t capability_zone;
static void
capability_init(void *dummy __unused)
{
capability_zone = uma_zcreate("capability", sizeof(struct capability),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
if (capability_zone == NULL)
panic("capability_init: capability_zone not initialized");
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, capability_init, NULL);
/*
* Test whether a capability grants the requested rights.
*/
static int
cap_check(struct capability *c, cap_rights_t rights)
{
if ((c->cap_rights | rights) != c->cap_rights)
return (ENOTCAPABLE);
return (0);
}
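/*
 * Example (illustrative): a capability holding CAP_READ | CAP_SEEK passes
 * cap_check() for rights = CAP_READ, since ORing the request into the held
 * mask changes nothing, but fails for rights = CAP_WRITE, because
 * (CAP_READ | CAP_SEEK | CAP_WRITE) differs from the held mask and
 * ENOTCAPABLE is returned.
 */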
/*
* Extract rights from a capability for monitoring purposes -- not for use in
* any other way, as we want to keep all capability permission evaluation in
* this one file.
*/
cap_rights_t
cap_rights(struct file *fp_cap)
{
struct capability *c;
KASSERT(fp_cap->f_type == DTYPE_CAPABILITY,
("cap_rights: !capability"));
c = fp_cap->f_data;
return (c->cap_rights);
}
/*
* System call to create a new capability reference to either an existing
* file object or an existing capability.
*/
int
-cap_new(struct thread *td, struct cap_new_args *uap)
+sys_cap_new(struct thread *td, struct cap_new_args *uap)
{
int error, capfd;
int fd = uap->fd;
struct file *fp;
cap_rights_t rights = uap->rights;
AUDIT_ARG_FD(fd);
AUDIT_ARG_RIGHTS(rights);
error = fget(td, fd, rights, &fp);
if (error)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
error = kern_capwrap(td, fp, rights, &capfd);
if (error)
return (error);
/*
* Release our reference to the file (kern_capwrap has held a reference
* for the filedesc array).
*/
fdrop(fp, td);
td->td_retval[0] = capfd;
return (0);
}
/*
* System call to query the rights mask associated with a capability.
*/
int
-cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+sys_cap_getrights(struct thread *td, struct cap_getrights_args *uap)
{
struct capability *cp;
struct file *fp;
int error;
AUDIT_ARG_FD(uap->fd);
error = fgetcap(td, uap->fd, &fp);
if (error)
return (error);
cp = fp->f_data;
error = copyout(&cp->cap_rights, uap->rightsp, sizeof(*uap->rightsp));
fdrop(fp, td);
return (error);
}
/*
* Create a capability to wrap around an existing file.
*/
int
kern_capwrap(struct thread *td, struct file *fp, cap_rights_t rights,
int *capfdp)
{
struct capability *cp, *cp_old;
struct file *fp_object, *fcapp;
int error;
if ((rights | CAP_MASK_VALID) != CAP_MASK_VALID)
return (EINVAL);
/*
* If a new capability is being derived from an existing capability,
* then the new capability rights must be a subset of the existing
* rights.
*/
if (fp->f_type == DTYPE_CAPABILITY) {
cp_old = fp->f_data;
if ((cp_old->cap_rights | rights) != cp_old->cap_rights)
return (ENOTCAPABLE);
}
/*
* Allocate a new file descriptor to hang the capability off of.
*/
error = falloc(td, &fcapp, capfdp, fp->f_flag);
if (error)
return (error);
/*
* Rather than nesting capabilities, directly reference the object an
* existing capability references. There's nothing else interesting
* to preserve for future use, as we've incorporated the previous
* rights mask into the new one. This prevents us from having to
* deal with capability chains.
*/
if (fp->f_type == DTYPE_CAPABILITY)
fp_object = ((struct capability *)fp->f_data)->cap_object;
else
fp_object = fp;
fhold(fp_object);
cp = uma_zalloc(capability_zone, M_WAITOK | M_ZERO);
cp->cap_rights = rights;
cp->cap_object = fp_object;
cp->cap_file = fcapp;
if (fp->f_flag & DFLAG_PASSABLE)
finit(fcapp, fp->f_flag, DTYPE_CAPABILITY, cp,
&capability_ops);
else
finit(fcapp, fp->f_flag, DTYPE_CAPABILITY, cp,
&capability_ops_unpassable);
/*
* Release our private reference (the proc filedesc still has one).
*/
fdrop(fcapp, td);
return (0);
}
/*
* Given a file descriptor, test it against a capability rights mask and then
* return the file descriptor on which to actually perform the requested
* operation. As long as the reference to fp_cap remains valid, the returned
* pointer in *fpp will remain valid, so no extra reference management is
* required, and the caller should fdrop() fp_cap as normal when done with
* both.
*/
int
cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
{
struct capability *c;
int error;
if (fp_cap->f_type != DTYPE_CAPABILITY) {
*fpp = fp_cap;
return (0);
}
c = fp_cap->f_data;
error = cap_check(c, rights);
if (error)
return (error);
*fpp = c->cap_object;
return (0);
}
/*
* Slightly different routine for memory mapping file descriptors: unwrap the
* capability and check CAP_MMAP, but also return a bitmask representing the
* maximum mapping rights the capability allows on the object.
*/
int
cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
struct file **fpp)
{
struct capability *c;
u_char maxprot;
int error;
if (fp_cap->f_type != DTYPE_CAPABILITY) {
*fpp = fp_cap;
*maxprotp = VM_PROT_ALL;
return (0);
}
c = fp_cap->f_data;
error = cap_check(c, rights | CAP_MMAP);
if (error)
return (error);
*fpp = c->cap_object;
maxprot = 0;
if (c->cap_rights & CAP_READ)
maxprot |= VM_PROT_READ;
if (c->cap_rights & CAP_WRITE)
maxprot |= VM_PROT_WRITE;
if (c->cap_rights & CAP_MAPEXEC)
maxprot |= VM_PROT_EXECUTE;
*maxprotp = maxprot;
return (0);
}
/*
* When a capability is closed, simply drop the reference on the underlying
* object and free the capability. fdrop() will handle the case where the
* underlying object also needs to close, and the caller will have already
* performed any object-specific lock or mqueue handling.
*/
static int
capability_close(struct file *fp, struct thread *td)
{
struct capability *c;
struct file *fp_object;
KASSERT(fp->f_type == DTYPE_CAPABILITY,
("capability_close: !capability"));
c = fp->f_data;
fp->f_ops = &badfileops;
fp->f_data = NULL;
fp_object = c->cap_object;
uma_zfree(capability_zone, c);
return (fdrop(fp_object, td));
}
/*
* In general, file descriptor operations should never make it to the
* capability, only the underlying file descriptor operation vector, so panic
* if we do turn up here.
*/
static int
capability_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
panic("capability_read");
}
static int
capability_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
panic("capability_write");
}
static int
capability_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
panic("capability_truncate");
}
static int
capability_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
panic("capability_ioctl");
}
static int
capability_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
panic("capability_poll");
}
static int
capability_kqfilter(struct file *fp, struct knote *kn)
{
panic("capability_kqfilter");
}
static int
capability_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
panic("capability_stat");
}
int
capability_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
panic("capability_chmod");
}
int
capability_chown(struct file *fp, uid_t uid, gid_t gid,
struct ucred *active_cred, struct thread *td)
{
panic("capability_chown");
}
#else /* !CAPABILITIES */
/*
* Stub capability functions for when "options CAPABILITIES" is not compiled
* into the kernel.
*/
int
-cap_new(struct thread *td, struct cap_new_args *uap)
+sys_cap_new(struct thread *td, struct cap_new_args *uap)
{
return (ENOSYS);
}
int
-cap_getrights(struct thread *td, struct cap_getrights_args *uap)
+sys_cap_getrights(struct thread *td, struct cap_getrights_args *uap)
{
return (ENOSYS);
}
int
cap_funwrap(struct file *fp_cap, cap_rights_t rights, struct file **fpp)
{
KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
("cap_funwrap: saw capability"));
*fpp = fp_cap;
return (0);
}
int
cap_funwrap_mmap(struct file *fp_cap, cap_rights_t rights, u_char *maxprotp,
struct file **fpp)
{
KASSERT(fp_cap->f_type != DTYPE_CAPABILITY,
("cap_funwrap_mmap: saw capability"));
*fpp = fp_cap;
*maxprotp = VM_PROT_ALL;
return (0);
}
#endif /* CAPABILITIES */
Index: head/sys/kern/sys_generic.c
===================================================================
--- head/sys/kern/sys_generic.c (revision 225616)
+++ head/sys/kern/sys_generic.c (revision 225617)
@@ -1,1700 +1,1700 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/resourcevar.h>
#include <sys/selinfo.h>
#include <sys/sleepqueue.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/condvar.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <security/audit/audit.h>
static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
MALLOC_DEFINE(M_IOV, "iov", "large iov's");
static int pollout(struct thread *, struct pollfd *, struct pollfd *,
u_int);
static int pollscan(struct thread *, struct pollfd *, u_int);
static int pollrescan(struct thread *);
static int selscan(struct thread *, fd_mask **, fd_mask **, int);
static int selrescan(struct thread *, fd_mask **, fd_mask **);
static void selfdalloc(struct thread *, void *);
static void selfdfree(struct seltd *, struct selfd *);
static int dofileread(struct thread *, int, struct file *, struct uio *,
off_t, int);
static int dofilewrite(struct thread *, int, struct file *, struct uio *,
off_t, int);
static void doselwakeup(struct selinfo *, int);
static void seltdinit(struct thread *);
static int seltdwait(struct thread *, int);
static void seltdclear(struct thread *);
/*
* One seltd per-thread allocated on demand as needed.
*
* t - protected by st_mtx
* k - Only accessed by curthread or read-only
*/
struct seltd {
STAILQ_HEAD(, selfd) st_selq; /* (k) List of selfds. */
struct selfd *st_free1; /* (k) free fd for read set. */
struct selfd *st_free2; /* (k) free fd for write set. */
struct mtx st_mtx; /* Protects struct seltd */
struct cv st_wait; /* (t) Wait channel. */
int st_flags; /* (t) SELTD_ flags. */
};
#define SELTD_PENDING 0x0001 /* We have pending events. */
#define SELTD_RESCAN 0x0002 /* Doing a rescan. */
/*
* One selfd allocated per-thread per-file-descriptor.
* f - protected by sf_mtx
*/
struct selfd {
STAILQ_ENTRY(selfd) sf_link; /* (k) fds owned by this td. */
TAILQ_ENTRY(selfd) sf_threads; /* (f) fds on this selinfo. */
struct selinfo *sf_si; /* (f) selinfo when linked. */
struct mtx *sf_mtx; /* Pointer to selinfo mtx. */
struct seltd *sf_td; /* (k) owning seltd. */
void *sf_cookie; /* (k) fd or pollfd. */
};
static uma_zone_t selfd_zone;
static struct mtx_pool *mtxpool_select;
#ifndef _SYS_SYSPROTO_H_
struct read_args {
int fd;
void *buf;
size_t nbyte;
};
#endif
int
-read(td, uap)
+sys_read(td, uap)
struct thread *td;
struct read_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_readv(td, uap->fd, &auio);
return(error);
}
/*
* Positioned read system call
*/
#ifndef _SYS_SYSPROTO_H_
struct pread_args {
int fd;
void *buf;
size_t nbyte;
int pad;
off_t offset;
};
#endif
int
-pread(td, uap)
+sys_pread(td, uap)
struct thread *td;
struct pread_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_preadv(td, uap->fd, &auio, uap->offset);
return(error);
}
int
freebsd6_pread(td, uap)
struct thread *td;
struct freebsd6_pread_args *uap;
{
struct pread_args oargs;
oargs.fd = uap->fd;
oargs.buf = uap->buf;
oargs.nbyte = uap->nbyte;
oargs.offset = uap->offset;
- return (pread(td, &oargs));
+ return (sys_pread(td, &oargs));
}
/*
* Scatter read system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct readv_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
};
#endif
int
-readv(struct thread *td, struct readv_args *uap)
+sys_readv(struct thread *td, struct readv_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_readv(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
kern_readv(struct thread *td, int fd, struct uio *auio)
{
struct file *fp;
int error;
error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
if (error)
return (error);
error = dofileread(td, fd, fp, auio, (off_t)-1, 0);
fdrop(fp, td);
return (error);
}
/*
* Scatter positioned read system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct preadv_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
off_t offset;
};
#endif
int
-preadv(struct thread *td, struct preadv_args *uap)
+sys_preadv(struct thread *td, struct preadv_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_preadv(td, uap->fd, auio, uap->offset);
free(auio, M_IOV);
return (error);
}
int
kern_preadv(td, fd, auio, offset)
struct thread *td;
int fd;
struct uio *auio;
off_t offset;
{
struct file *fp;
int error;
error = fget_read(td, fd, CAP_READ, &fp);
if (error)
return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
error = ESPIPE;
else if (offset < 0 && fp->f_vnode->v_type != VCHR)
error = EINVAL;
else
error = dofileread(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
return (error);
}
/*
* Common code for readv and preadv that reads data in
* from a file using the passed in uio, offset, and flags.
*/
static int
dofileread(td, fd, fp, auio, offset, flags)
struct thread *td;
int fd;
struct file *fp;
struct uio *auio;
off_t offset;
int flags;
{
ssize_t cnt;
int error;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
/* Finish zero length reads right here */
if (auio->uio_resid == 0) {
td->td_retval[0] = 0;
return(0);
}
auio->uio_rw = UIO_READ;
auio->uio_offset = offset;
auio->uio_td = td;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(auio);
#endif
cnt = auio->uio_resid;
if ((error = fo_read(fp, auio, td->td_ucred, flags, td))) {
if (auio->uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
cnt -= auio->uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = cnt;
ktrgenio(fd, UIO_READ, ktruio, error);
}
#endif
td->td_retval[0] = cnt;
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct write_args {
int fd;
const void *buf;
size_t nbyte;
};
#endif
int
-write(td, uap)
+sys_write(td, uap)
struct thread *td;
struct write_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = (void *)(uintptr_t)uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_writev(td, uap->fd, &auio);
return(error);
}
/*
* Positioned write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct pwrite_args {
int fd;
const void *buf;
size_t nbyte;
int pad;
off_t offset;
};
#endif
int
-pwrite(td, uap)
+sys_pwrite(td, uap)
struct thread *td;
struct pwrite_args *uap;
{
struct uio auio;
struct iovec aiov;
int error;
if (uap->nbyte > INT_MAX)
return (EINVAL);
aiov.iov_base = (void *)(uintptr_t)uap->buf;
aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = uap->nbyte;
auio.uio_segflg = UIO_USERSPACE;
error = kern_pwritev(td, uap->fd, &auio, uap->offset);
return(error);
}
int
freebsd6_pwrite(td, uap)
struct thread *td;
struct freebsd6_pwrite_args *uap;
{
struct pwrite_args oargs;
oargs.fd = uap->fd;
oargs.buf = uap->buf;
oargs.nbyte = uap->nbyte;
oargs.offset = uap->offset;
- return (pwrite(td, &oargs));
+ return (sys_pwrite(td, &oargs));
}
/*
* Gather write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct writev_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
};
#endif
int
-writev(struct thread *td, struct writev_args *uap)
+sys_writev(struct thread *td, struct writev_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_writev(td, uap->fd, auio);
free(auio, M_IOV);
return (error);
}
int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
struct file *fp;
int error;
error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
if (error)
return (error);
error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
fdrop(fp, td);
return (error);
}
/*
* Gather positioned write system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct pwritev_args {
int fd;
struct iovec *iovp;
u_int iovcnt;
off_t offset;
};
#endif
int
-pwritev(struct thread *td, struct pwritev_args *uap)
+sys_pwritev(struct thread *td, struct pwritev_args *uap)
{
struct uio *auio;
int error;
error = copyinuio(uap->iovp, uap->iovcnt, &auio);
if (error)
return (error);
error = kern_pwritev(td, uap->fd, auio, uap->offset);
free(auio, M_IOV);
return (error);
}
int
kern_pwritev(td, fd, auio, offset)
struct thread *td;
struct uio *auio;
int fd;
off_t offset;
{
struct file *fp;
int error;
error = fget_write(td, fd, CAP_WRITE, &fp);
if (error)
return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
error = ESPIPE;
else if (offset < 0 && fp->f_vnode->v_type != VCHR)
error = EINVAL;
else
error = dofilewrite(td, fd, fp, auio, offset, FOF_OFFSET);
fdrop(fp, td);
return (error);
}
/*
* Common code for writev and pwritev that writes data to
* a file using the passed in uio, offset, and flags.
*/
static int
dofilewrite(td, fd, fp, auio, offset, flags)
struct thread *td;
int fd;
struct file *fp;
struct uio *auio;
off_t offset;
int flags;
{
ssize_t cnt;
int error;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
auio->uio_rw = UIO_WRITE;
auio->uio_td = td;
auio->uio_offset = offset;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(auio);
#endif
cnt = auio->uio_resid;
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
if (auio->uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Socket layer is responsible for issuing SIGPIPE. */
if (fp->f_type != DTYPE_SOCKET && error == EPIPE) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
cnt -= auio->uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = cnt;
ktrgenio(fd, UIO_WRITE, ktruio, error);
}
#endif
td->td_retval[0] = cnt;
return (error);
}
/*
* Truncate a file given a file descriptor.
*
* Can't use fget_write() here, since we must return EINVAL and not EBADF if the
* descriptor isn't writable.
*/
int
kern_ftruncate(td, fd, length)
struct thread *td;
int fd;
off_t length;
{
struct file *fp;
int error;
AUDIT_ARG_FD(fd);
if (length < 0)
return (EINVAL);
error = fget(td, fd, CAP_FTRUNCATE, &fp);
if (error)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
if (!(fp->f_flag & FWRITE)) {
fdrop(fp, td);
return (EINVAL);
}
error = fo_truncate(fp, length, td->td_ucred, td);
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ftruncate_args {
int fd;
int pad;
off_t length;
};
#endif
int
-ftruncate(td, uap)
+sys_ftruncate(td, uap)
struct thread *td;
struct ftruncate_args *uap;
{
return (kern_ftruncate(td, uap->fd, uap->length));
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct oftruncate_args {
int fd;
long length;
};
#endif
int
oftruncate(td, uap)
struct thread *td;
struct oftruncate_args *uap;
{
return (kern_ftruncate(td, uap->fd, uap->length));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct ioctl_args {
int fd;
u_long com;
caddr_t data;
};
#endif
/* ARGSUSED */
int
-ioctl(struct thread *td, struct ioctl_args *uap)
+sys_ioctl(struct thread *td, struct ioctl_args *uap)
{
u_long com;
int arg, error;
u_int size;
caddr_t data;
if (uap->com > 0xffffffff) {
printf(
"WARNING pid %d (%s): ioctl sign-extension ioctl %lx\n",
td->td_proc->p_pid, td->td_name, uap->com);
uap->com &= 0xffffffff;
}
com = uap->com;
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
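	/*
	 * Example (illustrative): FIONREAD is defined as _IOR('f', 127, int),
	 * so IOCPARM_LEN() below yields sizeof(int), IOC_OUT is set, and an
	 * int-sized buffer is zeroed, passed down to kern_ioctl(), and copied
	 * back out to uap->data on success.
	 */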
size = IOCPARM_LEN(com);
if ((size > IOCPARM_MAX) ||
((com & (IOC_VOID | IOC_IN | IOC_OUT)) == 0) ||
#if defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
((com & IOC_OUT) && size == 0) ||
#else
((com & (IOC_IN | IOC_OUT)) && size == 0) ||
#endif
((com & IOC_VOID) && size > 0 && size != sizeof(int)))
return (ENOTTY);
if (size > 0) {
if (com & IOC_VOID) {
/* Integer argument. */
arg = (intptr_t)uap->data;
data = (void *)&arg;
size = 0;
} else
data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
} else
data = (void *)&uap->data;
if (com & IOC_IN) {
error = copyin(uap->data, data, (u_int)size);
if (error) {
if (size > 0)
free(data, M_IOCTLOPS);
return (error);
}
} else if (com & IOC_OUT) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
bzero(data, size);
}
error = kern_ioctl(td, uap->fd, com, data);
if (error == 0 && (com & IOC_OUT))
error = copyout(data, uap->data, (u_int)size);
if (size > 0)
free(data, M_IOCTLOPS);
return (error);
}
int
kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
{
struct file *fp;
struct filedesc *fdp;
int error;
int tmp;
AUDIT_ARG_FD(fd);
AUDIT_ARG_CMD(com);
if ((error = fget(td, fd, CAP_IOCTL, &fp)) != 0)
return (error);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
fdrop(fp, td);
return (EBADF);
}
fdp = td->td_proc->p_fd;
switch (com) {
case FIONCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
goto out;
case FIOCLEX:
FILEDESC_XLOCK(fdp);
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
goto out;
case FIONBIO:
if ((tmp = *(int *)data))
atomic_set_int(&fp->f_flag, FNONBLOCK);
else
atomic_clear_int(&fp->f_flag, FNONBLOCK);
data = (void *)&tmp;
break;
case FIOASYNC:
if ((tmp = *(int *)data))
atomic_set_int(&fp->f_flag, FASYNC);
else
atomic_clear_int(&fp->f_flag, FASYNC);
data = (void *)&tmp;
break;
}
error = fo_ioctl(fp, com, data, td->td_ucred, td);
out:
fdrop(fp, td);
return (error);
}
int
poll_no_poll(int events)
{
/*
* Return true for read/write. If the user asked for something
* special, return POLLNVAL, so that clients have a way of
* determining reliably whether or not the extended
* functionality is present without hard-coding knowledge
* of specific filesystem implementations.
*/
if (events & ~POLLSTANDARD)
return (POLLNVAL);
return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
int
-pselect(struct thread *td, struct pselect_args *uap)
+sys_pselect(struct thread *td, struct pselect_args *uap)
{
struct timespec ts;
struct timeval tv, *tvp;
sigset_t set, *uset;
int error;
if (uap->ts != NULL) {
error = copyin(uap->ts, &ts, sizeof(ts));
if (error != 0)
return (error);
TIMESPEC_TO_TIMEVAL(&tv, &ts);
tvp = &tv;
} else
tvp = NULL;
if (uap->sm != NULL) {
error = copyin(uap->sm, &set, sizeof(set));
if (error != 0)
return (error);
uset = &set;
} else
uset = NULL;
return (kern_pselect(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
uset, NFDBITS));
}
int
kern_pselect(struct thread *td, int nd, fd_set *in, fd_set *ou, fd_set *ex,
struct timeval *tvp, sigset_t *uset, int abi_nfdbits)
{
int error;
if (uset != NULL) {
error = kern_sigprocmask(td, SIG_SETMASK, uset,
&td->td_oldsigmask, 0);
if (error != 0)
return (error);
td->td_pflags |= TDP_OLDMASK;
/*
* Make sure that ast() is called on return to
* usermode and TDP_OLDMASK is cleared, restoring old
* sigmask.
*/
thread_lock(td);
td->td_flags |= TDF_ASTPENDING;
thread_unlock(td);
}
error = kern_select(td, nd, in, ou, ex, tvp, abi_nfdbits);
return (error);
}
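/*
* A minimal userland sketch of what kern_pselect() adds over plain
* select(); "fd", "rset" and "mask" are hypothetical.  The supplied mask
* is installed atomically for the duration of the wait and restored on
* return to usermode via TDP_OLDMASK, closing the race between unblocking
* a signal and going to sleep:
*
*	sigset_t mask;
*	sigemptyset(&mask);		// accept any signal while waiting
*	pselect(fd + 1, &rset, NULL, NULL, NULL, &mask);
*/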
#ifndef _SYS_SYSPROTO_H_
struct select_args {
int nd;
fd_set *in, *ou, *ex;
struct timeval *tv;
};
#endif
int
-select(struct thread *td, struct select_args *uap)
+sys_select(struct thread *td, struct select_args *uap)
{
struct timeval tv, *tvp;
int error;
if (uap->tv != NULL) {
error = copyin(uap->tv, &tv, sizeof(tv));
if (error)
return (error);
tvp = &tv;
} else
tvp = NULL;
return (kern_select(td, uap->nd, uap->in, uap->ou, uap->ex, tvp,
NFDBITS));
}
int
kern_select(struct thread *td, int nd, fd_set *fd_in, fd_set *fd_ou,
fd_set *fd_ex, struct timeval *tvp, int abi_nfdbits)
{
struct filedesc *fdp;
/*
* The magic 2048 here is chosen to be just enough for FD_SETSIZE
* infds with the new FD_SETSIZE of 1024, and more than enough for
* FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
* of 256.
*/
fd_mask s_selbits[howmany(2048, NFDBITS)];
fd_mask *ibits[3], *obits[3], *selbits, *sbp;
struct timeval atv, rtv, ttv;
int error, timo;
u_int nbufbytes, ncpbytes, ncpubytes, nfdbits;
if (nd < 0)
return (EINVAL);
fdp = td->td_proc->p_fd;
if (nd > fdp->fd_lastfile + 1)
nd = fdp->fd_lastfile + 1;
/*
* Allocate just enough bits for the non-null fd_sets. Use the
* preallocated auto buffer if possible.
*/
nfdbits = roundup(nd, NFDBITS);
ncpbytes = nfdbits / NBBY;
ncpubytes = roundup(nd, abi_nfdbits) / NBBY;
nbufbytes = 0;
if (fd_in != NULL)
nbufbytes += 2 * ncpbytes;
if (fd_ou != NULL)
nbufbytes += 2 * ncpbytes;
if (fd_ex != NULL)
nbufbytes += 2 * ncpbytes;
if (nbufbytes <= sizeof s_selbits)
selbits = &s_selbits[0];
else
selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
/*
* Assign pointers into the bit buffers and fetch the input bits.
* Put the output buffers together so that they can be bzeroed
* together.
*/
sbp = selbits;
#define getbits(name, x) \
do { \
if (name == NULL) { \
ibits[x] = NULL; \
obits[x] = NULL; \
} else { \
ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
obits[x] = sbp; \
sbp += ncpbytes / sizeof *sbp; \
error = copyin(name, ibits[x], ncpubytes); \
if (error != 0) \
goto done; \
bzero((char *)ibits[x] + ncpubytes, \
ncpbytes - ncpubytes); \
} \
} while (0)
getbits(fd_in, 0);
getbits(fd_ou, 1);
getbits(fd_ex, 2);
#undef getbits
#if BYTE_ORDER == BIG_ENDIAN && defined(__LP64__)
/*
* XXX: swizzle_fdset assumes that if abi_nfdbits != NFDBITS,
* we are running under 32-bit emulation. This should be more
* generic.
*/
#define swizzle_fdset(bits) \
if (abi_nfdbits != NFDBITS && bits != NULL) { \
int i; \
for (i = 0; i < ncpbytes / sizeof *sbp; i++) \
bits[i] = (bits[i] >> 32) | (bits[i] << 32); \
}
#else
#define swizzle_fdset(bits)
#endif
/* Make sure the bit order makes it through an ABI transition */
swizzle_fdset(ibits[0]);
swizzle_fdset(ibits[1]);
swizzle_fdset(ibits[2]);
if (nbufbytes != 0)
bzero(selbits, nbufbytes / 2);
if (tvp != NULL) {
atv = *tvp;
if (itimerfix(&atv)) {
error = EINVAL;
goto done;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
seltdinit(td);
/* Iterate until the timeout expires or descriptors become ready. */
for (;;) {
error = selscan(td, ibits, obits, nd);
if (error || td->td_retval[0] != 0)
break;
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
break;
ttv = atv;
timevalsub(&ttv, &rtv);
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
error = seltdwait(td, timo);
if (error)
break;
error = selrescan(td, ibits, obits);
if (error || td->td_retval[0] != 0)
break;
}
seltdclear(td);
done:
/* select is not restarted after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
/* swizzle bit order back, if necessary */
swizzle_fdset(obits[0]);
swizzle_fdset(obits[1]);
swizzle_fdset(obits[2]);
#undef swizzle_fdset
#define putbits(name, x) \
if (name && (error2 = copyout(obits[x], name, ncpubytes))) \
error = error2;
if (error == 0) {
int error2;
putbits(fd_in, 0);
putbits(fd_ou, 1);
putbits(fd_ex, 2);
#undef putbits
}
if (selbits != &s_selbits[0])
free(selbits, M_SELECT);
return (error);
}
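/*
* A minimal userland sketch of the interface implemented by kern_select()
* above; "fd" is hypothetical.  Only the first roundup(nd, NFDBITS)/NBBY
* bytes of each non-NULL set are copied in, so nd must cover the highest
* descriptor of interest plus one:
*
*	fd_set rset;
*	struct timeval tv = { 2, 0 };	// two second timeout
*	FD_ZERO(&rset);
*	FD_SET(fd, &rset);
*	if (select(fd + 1, &rset, NULL, NULL, &tv) > 0 && FD_ISSET(fd, &rset))
*		;			// fd is readable
*/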
/*
* Convert a select bit set to poll flags.
*
* The backend always returns POLLHUP/POLLERR if appropriate and we
* return this as a set bit in any set.
*/
static int select_flags[3] = {
POLLRDNORM | POLLHUP | POLLERR,
POLLWRNORM | POLLHUP | POLLERR,
POLLRDBAND | POLLERR
};
/*
* Compute the fo_poll flags required for a fd given by the index and
* bit position in the fd_mask array.
*/
static __inline int
selflags(fd_mask **ibits, int idx, fd_mask bit)
{
int flags;
int msk;
flags = 0;
for (msk = 0; msk < 3; msk++) {
if (ibits[msk] == NULL)
continue;
if ((ibits[msk][idx] & bit) == 0)
continue;
flags |= select_flags[msk];
}
return (flags);
}
/*
* Set the appropriate output bits given a mask of fired events and the
* input bits originally requested.
*/
static __inline int
selsetbits(fd_mask **ibits, fd_mask **obits, int idx, fd_mask bit, int events)
{
int msk;
int n;
n = 0;
for (msk = 0; msk < 3; msk++) {
if ((events & select_flags[msk]) == 0)
continue;
if (ibits[msk] == NULL)
continue;
if ((ibits[msk][idx] & bit) == 0)
continue;
/*
* XXX Check for a duplicate set. This can occur because a
* socket calls selrecord() twice for each poll() call
* resulting in two selfds per real fd. selrescan() will
* call selsetbits twice as a result.
*/
if ((obits[msk][idx] & bit) != 0)
continue;
obits[msk][idx] |= bit;
n++;
}
return (n);
}
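/*
* Worked example of the two helpers above: a descriptor present only in
* the read set yields selflags() == POLLRDNORM | POLLHUP | POLLERR, so a
* descriptor whose peer has gone away (fo_poll() reporting POLLHUP) still
* gets its bit set in the read output set by selsetbits(), matching the
* select() notion of "readable" that includes end-of-file.
*/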
static __inline int
getselfd_cap(struct filedesc *fdp, int fd, struct file **fpp)
{
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
int error;
#endif
if ((fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
#ifdef CAPABILITIES
/*
* If the file descriptor is for a capability, test rights and use
* the file descriptor references by the capability.
*/
error = cap_funwrap(fp, CAP_POLL_EVENT, &fp_fromcap);
if (error) {
fdrop(fp, curthread);
return (error);
}
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, curthread);
fp = fp_fromcap;
}
#endif /* CAPABILITIES */
*fpp = fp;
return (0);
}
/*
* Traverse the list of fds attached to this thread's seltd and check for
* completion.
*/
static int
selrescan(struct thread *td, fd_mask **ibits, fd_mask **obits)
{
struct filedesc *fdp;
struct selinfo *si;
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
struct file *fp;
fd_mask bit;
int fd, ev, n, idx;
int error;
fdp = td->td_proc->p_fd;
stp = td->td_sel;
n = 0;
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
fd = (int)(uintptr_t)sfp->sf_cookie;
si = sfp->sf_si;
selfdfree(stp, sfp);
/* If the selinfo wasn't cleared the event didn't fire. */
if (si != NULL)
continue;
error = getselfd_cap(fdp, fd, &fp);
if (error)
return (error);
idx = fd / NFDBITS;
bit = (fd_mask)1 << (fd % NFDBITS);
ev = fo_poll(fp, selflags(ibits, idx, bit), td->td_ucred, td);
fdrop(fp, td);
if (ev != 0)
n += selsetbits(ibits, obits, idx, bit, ev);
}
stp->st_flags = 0;
td->td_retval[0] = n;
return (0);
}
/*
* Perform the initial filedescriptor scan and register ourselves with
* each selinfo.
*/
static int
selscan(td, ibits, obits, nfd)
struct thread *td;
fd_mask **ibits, **obits;
int nfd;
{
struct filedesc *fdp;
struct file *fp;
fd_mask bit;
int ev, flags, end, fd;
int n, idx;
int error;
fdp = td->td_proc->p_fd;
n = 0;
for (idx = 0, fd = 0; fd < nfd; idx++) {
end = imin(fd + NFDBITS, nfd);
for (bit = 1; fd < end; bit <<= 1, fd++) {
/* Compute the list of events we're interested in. */
flags = selflags(ibits, idx, bit);
if (flags == 0)
continue;
error = getselfd_cap(fdp, fd, &fp);
if (error)
return (error);
selfdalloc(td, (void *)(uintptr_t)fd);
ev = fo_poll(fp, flags, td->td_ucred, td);
fdrop(fp, td);
if (ev != 0)
n += selsetbits(ibits, obits, idx, bit, ev);
}
}
td->td_retval[0] = n;
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct poll_args {
struct pollfd *fds;
u_int nfds;
int timeout;
};
#endif
int
-poll(td, uap)
+sys_poll(td, uap)
struct thread *td;
struct poll_args *uap;
{
struct pollfd *bits;
struct pollfd smallbits[32];
struct timeval atv, rtv, ttv;
int error = 0, timo;
u_int nfds;
size_t ni;
nfds = uap->nfds;
if (nfds > maxfilesperproc && nfds > FD_SETSIZE)
return (EINVAL);
ni = nfds * sizeof(struct pollfd);
if (ni > sizeof(smallbits))
bits = malloc(ni, M_TEMP, M_WAITOK);
else
bits = smallbits;
error = copyin(uap->fds, bits, ni);
if (error)
goto done;
if (uap->timeout != INFTIM) {
atv.tv_sec = uap->timeout / 1000;
atv.tv_usec = (uap->timeout % 1000) * 1000;
if (itimerfix(&atv)) {
error = EINVAL;
goto done;
}
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
seltdinit(td);
/* Iterate until the timeout expires or descriptors become ready. */
for (;;) {
error = pollscan(td, bits, nfds);
if (error || td->td_retval[0] != 0)
break;
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=))
break;
ttv = atv;
timevalsub(&ttv, &rtv);
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
error = seltdwait(td, timo);
if (error)
break;
error = pollrescan(td);
if (error || td->td_retval[0] != 0)
break;
}
seltdclear(td);
done:
/* poll is not restarted after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
if (error == 0) {
error = pollout(td, bits, uap->fds, nfds);
if (error)
goto out;
}
out:
if (ni > sizeof(smallbits))
free(bits, M_TEMP);
return (error);
}
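/*
* A minimal userland sketch of the interface implemented by sys_poll()
* above; "fd" is hypothetical.  pollout() copies revents back for every
* entry, even ones for which no event fired:
*
*	struct pollfd pfd = { .fd = fd, .events = POLLIN };
*	int n = poll(&pfd, 1, 2000);	// 2000 ms timeout; INFTIM blocks
*	if (n > 0 && (pfd.revents & (POLLIN | POLLHUP)))
*		;			// readable, or the writer is gone
*/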
static int
pollrescan(struct thread *td)
{
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
struct selinfo *si;
struct filedesc *fdp;
struct file *fp;
struct pollfd *fd;
int n;
n = 0;
fdp = td->td_proc->p_fd;
stp = td->td_sel;
FILEDESC_SLOCK(fdp);
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn) {
fd = (struct pollfd *)sfp->sf_cookie;
si = sfp->sf_si;
selfdfree(stp, sfp);
/* If the selinfo wasn't cleared the event didn't fire. */
if (si != NULL)
continue;
fp = fdp->fd_ofiles[fd->fd];
#ifdef CAPABILITIES
if ((fp == NULL)
|| (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
if (fp == NULL) {
#endif
fd->revents = POLLNVAL;
n++;
continue;
}
/*
* Note: backend also returns POLLHUP and
* POLLERR if appropriate.
*/
fd->revents = fo_poll(fp, fd->events, td->td_ucred, td);
if (fd->revents != 0)
n++;
}
FILEDESC_SUNLOCK(fdp);
stp->st_flags = 0;
td->td_retval[0] = n;
return (0);
}
static int
pollout(td, fds, ufds, nfd)
struct thread *td;
struct pollfd *fds;
struct pollfd *ufds;
u_int nfd;
{
int error = 0;
u_int i = 0;
u_int n = 0;
for (i = 0; i < nfd; i++) {
error = copyout(&fds->revents, &ufds->revents,
sizeof(ufds->revents));
if (error)
return (error);
if (fds->revents != 0)
n++;
fds++;
ufds++;
}
td->td_retval[0] = n;
return (0);
}
static int
pollscan(td, fds, nfd)
struct thread *td;
struct pollfd *fds;
u_int nfd;
{
struct filedesc *fdp = td->td_proc->p_fd;
int i;
struct file *fp;
int n = 0;
FILEDESC_SLOCK(fdp);
for (i = 0; i < nfd; i++, fds++) {
if (fds->fd >= fdp->fd_nfiles) {
fds->revents = POLLNVAL;
n++;
} else if (fds->fd < 0) {
fds->revents = 0;
} else {
fp = fdp->fd_ofiles[fds->fd];
#ifdef CAPABILITIES
if ((fp == NULL)
|| (cap_funwrap(fp, CAP_POLL_EVENT, &fp) != 0)) {
#else
if (fp == NULL) {
#endif
fds->revents = POLLNVAL;
n++;
} else {
/*
* Note: backend also returns POLLHUP and
* POLLERR if appropriate.
*/
selfdalloc(td, fds);
fds->revents = fo_poll(fp, fds->events,
td->td_ucred, td);
/*
* POSIX requires that POLLOUT never be
* set simultaneously with POLLHUP.
*/
if ((fds->revents & POLLHUP) != 0)
fds->revents &= ~POLLOUT;
if (fds->revents != 0)
n++;
}
}
}
FILEDESC_SUNLOCK(fdp);
td->td_retval[0] = n;
return (0);
}
/*
* OpenBSD poll system call.
*
* XXX this isn't quite a true representation.. OpenBSD uses select ops.
*/
#ifndef _SYS_SYSPROTO_H_
struct openbsd_poll_args {
struct pollfd *fds;
u_int nfds;
int timeout;
};
#endif
int
-openbsd_poll(td, uap)
+sys_openbsd_poll(td, uap)
register struct thread *td;
register struct openbsd_poll_args *uap;
{
- return (poll(td, (struct poll_args *)uap));
+ return (sys_poll(td, (struct poll_args *)uap));
}
/*
* XXX This was created specifically to support netncp and netsmb. This
* allows the caller to specify a socket to wait for events on. It returns
* 0 if any events matched and an error otherwise. There is no way to
* determine which events fired.
*/
int
selsocket(struct socket *so, int events, struct timeval *tvp, struct thread *td)
{
struct timeval atv, rtv, ttv;
int error, timo;
if (tvp != NULL) {
atv = *tvp;
if (itimerfix(&atv))
return (EINVAL);
getmicrouptime(&rtv);
timevaladd(&atv, &rtv);
} else {
atv.tv_sec = 0;
atv.tv_usec = 0;
}
timo = 0;
seltdinit(td);
/*
* Iterate until the timeout expires or the socket becomes ready.
*/
for (;;) {
selfdalloc(td, NULL);
error = sopoll(so, events, NULL, td);
/* error here is actually the ready events. */
if (error)
return (0);
if (atv.tv_sec || atv.tv_usec) {
getmicrouptime(&rtv);
if (timevalcmp(&rtv, &atv, >=)) {
seltdclear(td);
return (EWOULDBLOCK);
}
ttv = atv;
timevalsub(&ttv, &rtv);
timo = ttv.tv_sec > 24 * 60 * 60 ?
24 * 60 * 60 * hz : tvtohz(&ttv);
}
error = seltdwait(td, timo);
seltdclear(td);
if (error)
break;
}
/* XXX Duplicates ncp/smb behavior. */
if (error == ERESTART)
error = 0;
return (error);
}
/*
* Preallocate two selfds associated with 'cookie'. Some fo_poll routines
* have two select sets, one for read and another for write.
*/
static void
selfdalloc(struct thread *td, void *cookie)
{
struct seltd *stp;
stp = td->td_sel;
if (stp->st_free1 == NULL)
stp->st_free1 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
stp->st_free1->sf_td = stp;
stp->st_free1->sf_cookie = cookie;
if (stp->st_free2 == NULL)
stp->st_free2 = uma_zalloc(selfd_zone, M_WAITOK|M_ZERO);
stp->st_free2->sf_td = stp;
stp->st_free2->sf_cookie = cookie;
}
static void
selfdfree(struct seltd *stp, struct selfd *sfp)
{
STAILQ_REMOVE(&stp->st_selq, sfp, selfd, sf_link);
mtx_lock(sfp->sf_mtx);
if (sfp->sf_si)
TAILQ_REMOVE(&sfp->sf_si->si_tdlist, sfp, sf_threads);
mtx_unlock(sfp->sf_mtx);
uma_zfree(selfd_zone, sfp);
}
/* Drain the waiters tied to all the selfd belonging the specified selinfo. */
void
seldrain(sip)
struct selinfo *sip;
{
/*
* This feature is already provided by doselwakeup(), thus it is
* enough to call it here.
* Eventually, the calling context should take care to avoid races
* between a thread calling select()/poll() and file descriptor
* detaching, but, again, the races are just the same as
* selwakeup().
*/
doselwakeup(sip, -1);
}
/*
* Record a select request.
*/
void
selrecord(selector, sip)
struct thread *selector;
struct selinfo *sip;
{
struct selfd *sfp;
struct seltd *stp;
struct mtx *mtxp;
stp = selector->td_sel;
/*
* Don't record when doing a rescan.
*/
if (stp->st_flags & SELTD_RESCAN)
return;
/*
* Grab one of the preallocated descriptors.
*/
sfp = NULL;
if ((sfp = stp->st_free1) != NULL)
stp->st_free1 = NULL;
else if ((sfp = stp->st_free2) != NULL)
stp->st_free2 = NULL;
else
panic("selrecord: No free selfd on selq");
mtxp = sip->si_mtx;
if (mtxp == NULL)
mtxp = mtx_pool_find(mtxpool_select, sip);
/*
* Initialize the sfp and queue it in the thread.
*/
sfp->sf_si = sip;
sfp->sf_mtx = mtxp;
STAILQ_INSERT_TAIL(&stp->st_selq, sfp, sf_link);
/*
* Now that we've locked the sip, check for initialization.
*/
mtx_lock(mtxp);
if (sip->si_mtx == NULL) {
sip->si_mtx = mtxp;
TAILQ_INIT(&sip->si_tdlist);
}
/*
* Add this thread to the list of selfds listening on this selinfo.
*/
TAILQ_INSERT_TAIL(&sip->si_tdlist, sfp, sf_threads);
mtx_unlock(sip->si_mtx);
}
/* Wake up a selecting thread. */
void
selwakeup(sip)
struct selinfo *sip;
{
doselwakeup(sip, -1);
}
/* Wake up a selecting thread, and set its priority. */
void
selwakeuppri(sip, pri)
struct selinfo *sip;
int pri;
{
doselwakeup(sip, pri);
}
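/*
* A condensed sketch of how a driver's poll method typically pairs with
* selrecord()/selwakeup(); the softc layout and names are hypothetical:
*
*	static int
*	foo_poll(struct cdev *dev, int events, struct thread *td)
*	{
*		struct foo_softc *sc = dev->si_drv1;
*		int revents = 0;
*
*		mtx_lock(&sc->sc_mtx);
*		if (sc->sc_ready)
*			revents = events & (POLLIN | POLLRDNORM);
*		else
*			selrecord(td, &sc->sc_rsel);	// park this thread
*		mtx_unlock(&sc->sc_mtx);
*		return (revents);
*	}
*
* When data later arrives (e.g. from the interrupt handler), the driver
* calls selwakeup(&sc->sc_rsel), which ends up in doselwakeup() below and
* marks every waiting seltd as SELTD_PENDING.
*/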
/*
* Do a wakeup when a selectable event occurs.
*/
static void
doselwakeup(sip, pri)
struct selinfo *sip;
int pri;
{
struct selfd *sfp;
struct selfd *sfn;
struct seltd *stp;
/* If it's not initialized there can't be any waiters. */
if (sip->si_mtx == NULL)
return;
/*
* Locking the selinfo locks all selfds associated with it.
*/
mtx_lock(sip->si_mtx);
TAILQ_FOREACH_SAFE(sfp, &sip->si_tdlist, sf_threads, sfn) {
/*
* Once we remove this sfp from the list and clear the
* sf_si, seltdclear will know to ignore this si.
*/
TAILQ_REMOVE(&sip->si_tdlist, sfp, sf_threads);
sfp->sf_si = NULL;
stp = sfp->sf_td;
mtx_lock(&stp->st_mtx);
stp->st_flags |= SELTD_PENDING;
cv_broadcastpri(&stp->st_wait, pri);
mtx_unlock(&stp->st_mtx);
}
mtx_unlock(sip->si_mtx);
}
static void
seltdinit(struct thread *td)
{
struct seltd *stp;
if ((stp = td->td_sel) != NULL)
goto out;
td->td_sel = stp = malloc(sizeof(*stp), M_SELECT, M_WAITOK|M_ZERO);
mtx_init(&stp->st_mtx, "sellck", NULL, MTX_DEF);
cv_init(&stp->st_wait, "select");
out:
stp->st_flags = 0;
STAILQ_INIT(&stp->st_selq);
}
static int
seltdwait(struct thread *td, int timo)
{
struct seltd *stp;
int error;
stp = td->td_sel;
/*
* An event of interest may occur while we do not hold the seltd
* lock, so check the pending flag before we sleep.
*/
mtx_lock(&stp->st_mtx);
/*
* Any further calls to selrecord will be a rescan.
*/
stp->st_flags |= SELTD_RESCAN;
if (stp->st_flags & SELTD_PENDING) {
mtx_unlock(&stp->st_mtx);
return (0);
}
if (timo > 0)
error = cv_timedwait_sig(&stp->st_wait, &stp->st_mtx, timo);
else
error = cv_wait_sig(&stp->st_wait, &stp->st_mtx);
mtx_unlock(&stp->st_mtx);
return (error);
}
void
seltdfini(struct thread *td)
{
struct seltd *stp;
stp = td->td_sel;
if (stp == NULL)
return;
if (stp->st_free1)
uma_zfree(selfd_zone, stp->st_free1);
if (stp->st_free2)
uma_zfree(selfd_zone, stp->st_free2);
td->td_sel = NULL;
free(stp, M_SELECT);
}
/*
* Remove the references to the thread from all of the objects we were
* polling.
*/
static void
seltdclear(struct thread *td)
{
struct seltd *stp;
struct selfd *sfp;
struct selfd *sfn;
stp = td->td_sel;
STAILQ_FOREACH_SAFE(sfp, &stp->st_selq, sf_link, sfn)
selfdfree(stp, sfp);
stp->st_flags = 0;
}
static void selectinit(void *);
SYSINIT(select, SI_SUB_SYSCALLS, SI_ORDER_ANY, selectinit, NULL);
static void
selectinit(void *dummy __unused)
{
selfd_zone = uma_zcreate("selfd", sizeof(struct selfd), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
mtxpool_select = mtx_pool_create("select mtxpool", 128, MTX_DEF);
}
Index: head/sys/kern/sys_pipe.c
===================================================================
--- head/sys/kern/sys_pipe.c (revision 225616)
+++ head/sys/kern/sys_pipe.c (revision 225617)
@@ -1,1626 +1,1626 @@
/*-
* Copyright (c) 1996 John S. Dyson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
* 4. Modifications may be freely made to this file if the above conditions
* are met.
*/
/*
* This file contains a high-performance replacement for the socket-based
* pipes scheme originally used in FreeBSD/4.4Lite. It does not support
* all features of sockets, but does do everything that pipes normally
* do.
*/
/*
* This code has two modes of operation, a small write mode and a large
* write mode. The small write mode acts like conventional pipes with
* a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
* "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
* and PIPE_SIZE in size, the sending process pins the underlying pages in
* memory, and the receiving process copies directly from these pinned pages
* in the sending process.
*
* If the sending process receives a signal, it is possible that it will
* go away, and certainly its address space can change, because control
* is returned back to the user-mode side. In that case, the pipe code
* arranges to copy the buffer supplied by the user process, to a pageable
* kernel buffer, and the receiving process will grab the data from the
* pageable kernel buffer. Since signals don't happen all that often,
* the copy operation is normally eliminated.
*
* The constant PIPE_MINDIRECT is chosen to make sure that buffering will
* happen for small transfers so that the system will not spend all of
* its time context switching.
*
* In order to limit the resource use of pipes, two sysctls exist:
*
* kern.ipc.maxpipekva - This is a hard limit on the amount of pageable
* address space available to us in pipe_map. This value is normally
* autotuned, but may also be loader tuned.
*
* kern.ipc.pipekva - This read-only sysctl tracks the current amount of
* memory in use by pipes.
*
* Based on how large pipekva is relative to maxpipekva, the following
* will happen:
*
* 0% - 50%:
* New pipes are given 16K of memory backing, pipes may dynamically
* grow to as large as 64K where needed.
* 50% - 75%:
* New pipes are given 4K (or PAGE_SIZE) of memory backing,
* existing pipes may NOT grow.
* 75% - 100%:
* New pipes are given 4K (or PAGE_SIZE) of memory backing,
* existing pipes will be shrunk down to 4K whenever possible.
*
* Resizing may be disabled by setting kern.ipc.piperesizeallowed=0. If
* that is set, the only resize that will occur is the 0 -> SMALL_PIPE_SIZE
* resize which MUST occur for reverse-direction pipes when they are
* first used.
*
* Additional information about the current state of pipes may be obtained
* from kern.ipc.pipes, kern.ipc.pipefragretry, kern.ipc.pipeallocfail,
* and kern.ipc.piperesizefail.
*
* Locking rules: There are two locks present here: A mutex, used via
* PIPE_LOCK, and a flag, used via pipelock(). All locking is done via
* the flag, as mutexes can not persist over uiomove. The mutex
* exists only to guard access to the flag, and is not in itself a
* locking mechanism. Also note that there is only a single mutex for
* both directions of a pipe.
*
* As pipelock() may have to sleep before it can acquire the flag, it
* is important to reread all data after a call to pipelock(); everything
* in the structure may have changed.
*/
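/*
* A rough userland sketch of the two modes described above, assuming the
* usual PIPE_MINDIRECT value of 8192 bytes and a pipe pair in "pfd"
* (hypothetical):
*
*	char small[512], big[64 * 1024];
*	write(pfd[1], small, sizeof(small));	// copied into the pipe's
*						// kernel buffer
*	write(pfd[1], big, sizeof(big));	// writer's pages are wired
*						// and the reader copies
*						// straight from them
*
* The second write only takes the direct path when the descriptor is
* blocking, the source buffer is in user space, and the pipe's buffer is
* at least PIPE_MINDIRECT bytes; see pipe_direct_write() below.
*/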
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/selinfo.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/pipe.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/event.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
/*
* Use this define if you want to disable *fancy* VM things. Expect an
* approx 30% decrease in transfer rate. This could be useful for
* NetBSD or OpenBSD.
*/
/* #define PIPE_NODIRECT */
/*
* interfaces to the outside world
*/
static fo_rdwr_t pipe_read;
static fo_rdwr_t pipe_write;
static fo_truncate_t pipe_truncate;
static fo_ioctl_t pipe_ioctl;
static fo_poll_t pipe_poll;
static fo_kqfilter_t pipe_kqfilter;
static fo_stat_t pipe_stat;
static fo_close_t pipe_close;
static struct fileops pipeops = {
.fo_read = pipe_read,
.fo_write = pipe_write,
.fo_truncate = pipe_truncate,
.fo_ioctl = pipe_ioctl,
.fo_poll = pipe_poll,
.fo_kqfilter = pipe_kqfilter,
.fo_stat = pipe_stat,
.fo_close = pipe_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
.fo_flags = DFLAG_PASSABLE
};
static void filt_pipedetach(struct knote *kn);
static int filt_piperead(struct knote *kn, long hint);
static int filt_pipewrite(struct knote *kn, long hint);
static struct filterops pipe_rfiltops = {
.f_isfd = 1,
.f_detach = filt_pipedetach,
.f_event = filt_piperead
};
static struct filterops pipe_wfiltops = {
.f_isfd = 1,
.f_detach = filt_pipedetach,
.f_event = filt_pipewrite
};
/*
* Default pipe buffer size(s); this can be kind-of large now because pipe
* space is pageable. The pipe code will try to maintain locality of
* reference for performance reasons, so small amounts of outstanding I/O
* will not wipe the cache.
*/
#define MINPIPESIZE (PIPE_SIZE/3)
#define MAXPIPESIZE (2*PIPE_SIZE/3)
static long amountpipekva;
static int pipefragretry;
static int pipeallocfail;
static int piperesizefail;
static int piperesizeallowed = 1;
SYSCTL_LONG(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
&maxpipekva, 0, "Pipe KVA limit");
SYSCTL_LONG(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
&amountpipekva, 0, "Pipe KVA usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
&pipefragretry, 0, "Pipe allocation retries due to fragmentation");
SYSCTL_INT(_kern_ipc, OID_AUTO, pipeallocfail, CTLFLAG_RD,
&pipeallocfail, 0, "Pipe allocation failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizefail, CTLFLAG_RD,
&piperesizefail, 0, "Pipe resize failures");
SYSCTL_INT(_kern_ipc, OID_AUTO, piperesizeallowed, CTLFLAG_RW,
&piperesizeallowed, 0, "Pipe resizing allowed");
static void pipeinit(void *dummy __unused);
static void pipeclose(struct pipe *cpipe);
static void pipe_free_kmem(struct pipe *cpipe);
static int pipe_create(struct pipe *pipe, int backing);
static __inline int pipelock(struct pipe *cpipe, int catch);
static __inline void pipeunlock(struct pipe *cpipe);
static __inline void pipeselwakeup(struct pipe *cpipe);
#ifndef PIPE_NODIRECT
static int pipe_build_write_buffer(struct pipe *wpipe, struct uio *uio);
static void pipe_destroy_write_buffer(struct pipe *wpipe);
static int pipe_direct_write(struct pipe *wpipe, struct uio *uio);
static void pipe_clone_write_buffer(struct pipe *wpipe);
#endif
static int pipespace(struct pipe *cpipe, int size);
static int pipespace_new(struct pipe *cpipe, int size);
static int pipe_zone_ctor(void *mem, int size, void *arg, int flags);
static int pipe_zone_init(void *mem, int size, int flags);
static void pipe_zone_fini(void *mem, int size);
static uma_zone_t pipe_zone;
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, pipeinit, NULL);
static void
pipeinit(void *dummy __unused)
{
pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
UMA_ALIGN_PTR, 0);
KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
}
static int
pipe_zone_ctor(void *mem, int size, void *arg, int flags)
{
struct pipepair *pp;
struct pipe *rpipe, *wpipe;
KASSERT(size == sizeof(*pp), ("pipe_zone_ctor: wrong size"));
pp = (struct pipepair *)mem;
/*
* We zero both pipe endpoints to make sure all the kmem pointers
* are NULL, flag fields are zero'd, etc. We timestamp both
* endpoints with the same time.
*/
rpipe = &pp->pp_rpipe;
bzero(rpipe, sizeof(*rpipe));
vfs_timestamp(&rpipe->pipe_ctime);
rpipe->pipe_atime = rpipe->pipe_mtime = rpipe->pipe_ctime;
wpipe = &pp->pp_wpipe;
bzero(wpipe, sizeof(*wpipe));
wpipe->pipe_ctime = rpipe->pipe_ctime;
wpipe->pipe_atime = wpipe->pipe_mtime = rpipe->pipe_ctime;
rpipe->pipe_peer = wpipe;
rpipe->pipe_pair = pp;
wpipe->pipe_peer = rpipe;
wpipe->pipe_pair = pp;
/*
* Mark both endpoints as present; they will later get free'd
* one at a time. When both are free'd, then the whole pair
* is released.
*/
rpipe->pipe_present = PIPE_ACTIVE;
wpipe->pipe_present = PIPE_ACTIVE;
/*
* Eventually, the MAC Framework may initialize the label
* in ctor or init, but for now we do it elsewhere to avoid
* blocking in ctor or init.
*/
pp->pp_label = NULL;
return (0);
}
static int
pipe_zone_init(void *mem, int size, int flags)
{
struct pipepair *pp;
KASSERT(size == sizeof(*pp), ("pipe_zone_init: wrong size"));
pp = (struct pipepair *)mem;
mtx_init(&pp->pp_mtx, "pipe mutex", NULL, MTX_DEF | MTX_RECURSE);
return (0);
}
static void
pipe_zone_fini(void *mem, int size)
{
struct pipepair *pp;
KASSERT(size == sizeof(*pp), ("pipe_zone_fini: wrong size"));
pp = (struct pipepair *)mem;
mtx_destroy(&pp->pp_mtx);
}
/*
* The pipe system call for the DTYPE_PIPE type of pipes. If we fail, let
* the zone pick up the pieces via pipeclose().
*/
int
kern_pipe(struct thread *td, int fildes[2])
{
struct filedesc *fdp = td->td_proc->p_fd;
struct file *rf, *wf;
struct pipepair *pp;
struct pipe *rpipe, *wpipe;
int fd, error;
pp = uma_zalloc(pipe_zone, M_WAITOK);
#ifdef MAC
/*
* The MAC label is shared between the connected endpoints. As a
* result mac_pipe_init() and mac_pipe_create() are called once
* for the pair, and not on the endpoints.
*/
mac_pipe_init(pp);
mac_pipe_create(td->td_ucred, pp);
#endif
rpipe = &pp->pp_rpipe;
wpipe = &pp->pp_wpipe;
knlist_init_mtx(&rpipe->pipe_sel.si_note, PIPE_MTX(rpipe));
knlist_init_mtx(&wpipe->pipe_sel.si_note, PIPE_MTX(wpipe));
/* Only the forward direction pipe is backed by default */
if ((error = pipe_create(rpipe, 1)) != 0 ||
(error = pipe_create(wpipe, 0)) != 0) {
pipeclose(rpipe);
pipeclose(wpipe);
return (error);
}
rpipe->pipe_state |= PIPE_DIRECTOK;
wpipe->pipe_state |= PIPE_DIRECTOK;
error = falloc(td, &rf, &fd, 0);
if (error) {
pipeclose(rpipe);
pipeclose(wpipe);
return (error);
}
/* An extra reference on `rf' has been held for us by falloc(). */
fildes[0] = fd;
/*
* Warning: once we've gotten past allocation of the fd for the
* read-side, we can only drop the read side via fdrop() in order
* to avoid races against processes which manage to dup() the read
* side while we are blocked trying to allocate the write side.
*/
finit(rf, FREAD | FWRITE, DTYPE_PIPE, rpipe, &pipeops);
error = falloc(td, &wf, &fd, 0);
if (error) {
fdclose(fdp, rf, fildes[0], td);
fdrop(rf, td);
/* rpipe has been closed by fdrop(). */
pipeclose(wpipe);
return (error);
}
/* An extra reference on `wf' has been held for us by falloc(). */
finit(wf, FREAD | FWRITE, DTYPE_PIPE, wpipe, &pipeops);
fdrop(wf, td);
fildes[1] = fd;
fdrop(rf, td);
return (0);
}
/* ARGSUSED */
int
-pipe(struct thread *td, struct pipe_args *uap)
+sys_pipe(struct thread *td, struct pipe_args *uap)
{
int error;
int fildes[2];
error = kern_pipe(td, fildes);
if (error)
return (error);
td->td_retval[0] = fildes[0];
td->td_retval[1] = fildes[1];
return (0);
}
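/*
* Userland view of the wrapper above (a minimal sketch): both new
* descriptors come back through td_retval, which the pipe(2) libc stub
* stores into the caller's array:
*
*	int pfd[2];
*	if (pipe(pfd) == 0)
*		;	// pfd[0] is the read end, pfd[1] the write end
*/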
/*
* Allocate kva for the pipe circular buffer; the space is pageable.
* This routine will 'realloc' the size of a pipe safely; if it
* fails, it will retain the old buffer and return ENOMEM.
*/
static int
pipespace_new(cpipe, size)
struct pipe *cpipe;
int size;
{
caddr_t buffer;
int error, cnt, firstseg;
static int curfail = 0;
static struct timeval lastfail;
KASSERT(!mtx_owned(PIPE_MTX(cpipe)), ("pipespace: pipe mutex locked"));
KASSERT(!(cpipe->pipe_state & PIPE_DIRECTW),
("pipespace: resize of direct writes not allowed"));
retry:
cnt = cpipe->pipe_buffer.cnt;
if (cnt > size)
size = cnt;
size = round_page(size);
buffer = (caddr_t) vm_map_min(pipe_map);
error = vm_map_find(pipe_map, NULL, 0,
(vm_offset_t *) &buffer, size, 1,
VM_PROT_ALL, VM_PROT_ALL, 0);
if (error != KERN_SUCCESS) {
if ((cpipe->pipe_buffer.buffer == NULL) &&
(size > SMALL_PIPE_SIZE)) {
size = SMALL_PIPE_SIZE;
pipefragretry++;
goto retry;
}
if (cpipe->pipe_buffer.buffer == NULL) {
pipeallocfail++;
if (ppsratecheck(&lastfail, &curfail, 1))
printf("kern.ipc.maxpipekva exceeded; see tuning(7)\n");
} else {
piperesizefail++;
}
return (ENOMEM);
}
/* copy data, then free old resources if we're resizing */
if (cnt > 0) {
if (cpipe->pipe_buffer.in <= cpipe->pipe_buffer.out) {
firstseg = cpipe->pipe_buffer.size - cpipe->pipe_buffer.out;
bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
buffer, firstseg);
if ((cnt - firstseg) > 0)
bcopy(cpipe->pipe_buffer.buffer, &buffer[firstseg],
cpipe->pipe_buffer.in);
} else {
bcopy(&cpipe->pipe_buffer.buffer[cpipe->pipe_buffer.out],
buffer, cnt);
}
}
pipe_free_kmem(cpipe);
cpipe->pipe_buffer.buffer = buffer;
cpipe->pipe_buffer.size = size;
cpipe->pipe_buffer.in = cnt;
cpipe->pipe_buffer.out = 0;
cpipe->pipe_buffer.cnt = cnt;
atomic_add_long(&amountpipekva, cpipe->pipe_buffer.size);
return (0);
}
/*
* Wrapper for pipespace_new() that performs locking assertions.
*/
static int
pipespace(cpipe, size)
struct pipe *cpipe;
int size;
{
KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
("Unlocked pipe passed to pipespace"));
return (pipespace_new(cpipe, size));
}
/*
* lock a pipe for I/O, blocking other access
*/
static __inline int
pipelock(cpipe, catch)
struct pipe *cpipe;
int catch;
{
int error;
PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
while (cpipe->pipe_state & PIPE_LOCKFL) {
cpipe->pipe_state |= PIPE_LWANT;
error = msleep(cpipe, PIPE_MTX(cpipe),
catch ? (PRIBIO | PCATCH) : PRIBIO,
"pipelk", 0);
if (error != 0)
return (error);
}
cpipe->pipe_state |= PIPE_LOCKFL;
return (0);
}
/*
* unlock a pipe I/O lock
*/
static __inline void
pipeunlock(cpipe)
struct pipe *cpipe;
{
PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
KASSERT(cpipe->pipe_state & PIPE_LOCKFL,
("Unlocked pipe passed to pipeunlock"));
cpipe->pipe_state &= ~PIPE_LOCKFL;
if (cpipe->pipe_state & PIPE_LWANT) {
cpipe->pipe_state &= ~PIPE_LWANT;
wakeup(cpipe);
}
}
static __inline void
pipeselwakeup(cpipe)
struct pipe *cpipe;
{
PIPE_LOCK_ASSERT(cpipe, MA_OWNED);
if (cpipe->pipe_state & PIPE_SEL) {
selwakeuppri(&cpipe->pipe_sel, PSOCK);
if (!SEL_WAITING(&cpipe->pipe_sel))
cpipe->pipe_state &= ~PIPE_SEL;
}
if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio)
pgsigio(&cpipe->pipe_sigio, SIGIO, 0);
KNOTE_LOCKED(&cpipe->pipe_sel.si_note, 0);
}
/*
* Initialize and allocate VM and memory for pipe. The structure
* will start out zero'd from the ctor, so we just manage the kmem.
*/
static int
pipe_create(pipe, backing)
struct pipe *pipe;
int backing;
{
int error;
if (backing) {
if (amountpipekva > maxpipekva / 2)
error = pipespace_new(pipe, SMALL_PIPE_SIZE);
else
error = pipespace_new(pipe, PIPE_SIZE);
} else {
/* If we're not backing this pipe, no need to do anything. */
error = 0;
}
return (error);
}
/* ARGSUSED */
static int
pipe_read(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
struct thread *td;
int flags;
{
struct pipe *rpipe = fp->f_data;
int error;
int nread = 0;
u_int size;
PIPE_LOCK(rpipe);
++rpipe->pipe_busy;
error = pipelock(rpipe, 1);
if (error)
goto unlocked_error;
#ifdef MAC
error = mac_pipe_check_read(active_cred, rpipe->pipe_pair);
if (error)
goto locked_error;
#endif
if (amountpipekva > (3 * maxpipekva) / 4) {
if (!(rpipe->pipe_state & PIPE_DIRECTW) &&
(rpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
(rpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
(piperesizeallowed == 1)) {
PIPE_UNLOCK(rpipe);
pipespace(rpipe, SMALL_PIPE_SIZE);
PIPE_LOCK(rpipe);
}
}
while (uio->uio_resid) {
/*
* normal pipe buffer receive
*/
if (rpipe->pipe_buffer.cnt > 0) {
size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
if (size > rpipe->pipe_buffer.cnt)
size = rpipe->pipe_buffer.cnt;
if (size > (u_int) uio->uio_resid)
size = (u_int) uio->uio_resid;
PIPE_UNLOCK(rpipe);
error = uiomove(
&rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
size, uio);
PIPE_LOCK(rpipe);
if (error)
break;
rpipe->pipe_buffer.out += size;
if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
rpipe->pipe_buffer.out = 0;
rpipe->pipe_buffer.cnt -= size;
/*
* If there is no more to read in the pipe, reset
* its pointers to the beginning. This improves
* cache hit stats.
*/
if (rpipe->pipe_buffer.cnt == 0) {
rpipe->pipe_buffer.in = 0;
rpipe->pipe_buffer.out = 0;
}
nread += size;
#ifndef PIPE_NODIRECT
/*
* Direct copy, bypassing a kernel buffer.
*/
} else if ((size = rpipe->pipe_map.cnt) &&
(rpipe->pipe_state & PIPE_DIRECTW)) {
if (size > (u_int) uio->uio_resid)
size = (u_int) uio->uio_resid;
PIPE_UNLOCK(rpipe);
error = uiomove_fromphys(rpipe->pipe_map.ms,
rpipe->pipe_map.pos, size, uio);
PIPE_LOCK(rpipe);
if (error)
break;
nread += size;
rpipe->pipe_map.pos += size;
rpipe->pipe_map.cnt -= size;
if (rpipe->pipe_map.cnt == 0) {
rpipe->pipe_state &= ~PIPE_DIRECTW;
wakeup(rpipe);
}
#endif
} else {
/*
* detect EOF condition
* read returns 0 on EOF, no need to set error
*/
if (rpipe->pipe_state & PIPE_EOF)
break;
/*
* If the "write-side" has been blocked, wake it up now.
*/
if (rpipe->pipe_state & PIPE_WANTW) {
rpipe->pipe_state &= ~PIPE_WANTW;
wakeup(rpipe);
}
/*
* Break if some data was read.
*/
if (nread > 0)
break;
/*
* Unlock the pipe buffer for our remaining processing.
* We will either break out with an error or we will
* sleep and relock to loop.
*/
pipeunlock(rpipe);
/*
* Handle non-blocking mode operation or
* wait for more data.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
} else {
rpipe->pipe_state |= PIPE_WANTR;
if ((error = msleep(rpipe, PIPE_MTX(rpipe),
PRIBIO | PCATCH,
"piperd", 0)) == 0)
error = pipelock(rpipe, 1);
}
if (error)
goto unlocked_error;
}
}
#ifdef MAC
locked_error:
#endif
pipeunlock(rpipe);
/* XXX: should probably do this before getting any locks. */
if (error == 0)
vfs_timestamp(&rpipe->pipe_atime);
unlocked_error:
--rpipe->pipe_busy;
/*
* PIPE_WANT processing only makes sense if pipe_busy is 0.
*/
if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
wakeup(rpipe);
} else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
/*
* Handle write blocking hysteresis.
*/
if (rpipe->pipe_state & PIPE_WANTW) {
rpipe->pipe_state &= ~PIPE_WANTW;
wakeup(rpipe);
}
}
if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
pipeselwakeup(rpipe);
PIPE_UNLOCK(rpipe);
return (error);
}
#ifndef PIPE_NODIRECT
/*
* Map the sending processes' buffer into kernel space and wire it.
* This is similar to a physical write operation.
*/
static int
pipe_build_write_buffer(wpipe, uio)
struct pipe *wpipe;
struct uio *uio;
{
u_int size;
int i;
PIPE_LOCK_ASSERT(wpipe, MA_NOTOWNED);
KASSERT(wpipe->pipe_state & PIPE_DIRECTW,
("Clone attempt on non-direct write pipe!"));
size = (u_int) uio->uio_iov->iov_len;
if (size > wpipe->pipe_buffer.size)
size = wpipe->pipe_buffer.size;
if ((i = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
(vm_offset_t)uio->uio_iov->iov_base, size, VM_PROT_READ,
wpipe->pipe_map.ms, PIPENPAGES)) < 0)
return (EFAULT);
/*
* set up the control block
*/
wpipe->pipe_map.npages = i;
wpipe->pipe_map.pos =
((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
wpipe->pipe_map.cnt = size;
/*
* and update the uio data
*/
uio->uio_iov->iov_len -= size;
uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + size;
if (uio->uio_iov->iov_len == 0)
uio->uio_iov++;
uio->uio_resid -= size;
uio->uio_offset += size;
return (0);
}
/*
* unmap and unwire the process buffer
*/
static void
pipe_destroy_write_buffer(wpipe)
struct pipe *wpipe;
{
PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
vm_page_unhold_pages(wpipe->pipe_map.ms, wpipe->pipe_map.npages);
wpipe->pipe_map.npages = 0;
}
/*
* In the case of a signal, the writing process might go away. This
* code copies the data into the circular buffer so that the source
* pages can be freed without loss of data.
*/
static void
pipe_clone_write_buffer(wpipe)
struct pipe *wpipe;
{
struct uio uio;
struct iovec iov;
int size;
int pos;
PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
size = wpipe->pipe_map.cnt;
pos = wpipe->pipe_map.pos;
wpipe->pipe_buffer.in = size;
wpipe->pipe_buffer.out = 0;
wpipe->pipe_buffer.cnt = size;
wpipe->pipe_state &= ~PIPE_DIRECTW;
PIPE_UNLOCK(wpipe);
iov.iov_base = wpipe->pipe_buffer.buffer;
iov.iov_len = size;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_resid = size;
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = curthread;
uiomove_fromphys(wpipe->pipe_map.ms, pos, size, &uio);
PIPE_LOCK(wpipe);
pipe_destroy_write_buffer(wpipe);
}
/*
* This implements the pipe buffer write mechanism. Note that only
* a direct write OR a normal pipe write can be pending at any given time.
* If there are any characters in the pipe buffer, the direct write will
* be deferred until the receiving process grabs all of the bytes from
* the pipe buffer. Then the direct mapping write is set-up.
*/
static int
pipe_direct_write(wpipe, uio)
struct pipe *wpipe;
struct uio *uio;
{
int error;
retry:
PIPE_LOCK_ASSERT(wpipe, MA_OWNED);
error = pipelock(wpipe, 1);
if (wpipe->pipe_state & PIPE_EOF)
error = EPIPE;
if (error) {
pipeunlock(wpipe);
goto error1;
}
while (wpipe->pipe_state & PIPE_DIRECTW) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe),
PRIBIO | PCATCH, "pipdww", 0);
if (error)
goto error1;
else
goto retry;
}
wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
if (wpipe->pipe_buffer.cnt > 0) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe),
PRIBIO | PCATCH, "pipdwc", 0);
if (error)
goto error1;
else
goto retry;
}
wpipe->pipe_state |= PIPE_DIRECTW;
PIPE_UNLOCK(wpipe);
error = pipe_build_write_buffer(wpipe, uio);
PIPE_LOCK(wpipe);
if (error) {
wpipe->pipe_state &= ~PIPE_DIRECTW;
pipeunlock(wpipe);
goto error1;
}
error = 0;
while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
if (wpipe->pipe_state & PIPE_EOF) {
pipe_destroy_write_buffer(wpipe);
pipeselwakeup(wpipe);
pipeunlock(wpipe);
error = EPIPE;
goto error1;
}
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(wpipe), PRIBIO | PCATCH,
"pipdwt", 0);
pipelock(wpipe, 0);
}
if (wpipe->pipe_state & PIPE_EOF)
error = EPIPE;
if (wpipe->pipe_state & PIPE_DIRECTW) {
/*
* this bit of trickery substitutes a kernel buffer for
* the process that might be going away.
*/
pipe_clone_write_buffer(wpipe);
} else {
pipe_destroy_write_buffer(wpipe);
}
pipeunlock(wpipe);
return (error);
error1:
wakeup(wpipe);
return (error);
}
#endif
static int
pipe_write(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
struct thread *td;
int flags;
{
int error = 0;
int desiredsize, orig_resid;
struct pipe *wpipe, *rpipe;
rpipe = fp->f_data;
wpipe = rpipe->pipe_peer;
PIPE_LOCK(rpipe);
error = pipelock(wpipe, 1);
if (error) {
PIPE_UNLOCK(rpipe);
return (error);
}
/*
* detect loss of pipe read side, issue SIGPIPE if lost.
*/
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF)) {
pipeunlock(wpipe);
PIPE_UNLOCK(rpipe);
return (EPIPE);
}
#ifdef MAC
error = mac_pipe_check_write(active_cred, wpipe->pipe_pair);
if (error) {
pipeunlock(wpipe);
PIPE_UNLOCK(rpipe);
return (error);
}
#endif
++wpipe->pipe_busy;
/* Choose a larger size if it's advantageous */
desiredsize = max(SMALL_PIPE_SIZE, wpipe->pipe_buffer.size);
while (desiredsize < wpipe->pipe_buffer.cnt + uio->uio_resid) {
if (piperesizeallowed != 1)
break;
if (amountpipekva > maxpipekva / 2)
break;
if (desiredsize == BIG_PIPE_SIZE)
break;
desiredsize = desiredsize * 2;
}
/* Choose a smaller size if we're in an OOM situation */
if ((amountpipekva > (3 * maxpipekva) / 4) &&
(wpipe->pipe_buffer.size > SMALL_PIPE_SIZE) &&
(wpipe->pipe_buffer.cnt <= SMALL_PIPE_SIZE) &&
(piperesizeallowed == 1))
desiredsize = SMALL_PIPE_SIZE;
/* Resize if the above determined that a new size was necessary */
if ((desiredsize != wpipe->pipe_buffer.size) &&
((wpipe->pipe_state & PIPE_DIRECTW) == 0)) {
PIPE_UNLOCK(wpipe);
pipespace(wpipe, desiredsize);
PIPE_LOCK(wpipe);
}
if (wpipe->pipe_buffer.size == 0) {
/*
* This can only happen for reverse direction use of pipes
* in a complete OOM situation.
*/
error = ENOMEM;
--wpipe->pipe_busy;
pipeunlock(wpipe);
PIPE_UNLOCK(wpipe);
return (error);
}
pipeunlock(wpipe);
orig_resid = uio->uio_resid;
while (uio->uio_resid) {
int space;
pipelock(wpipe, 0);
if (wpipe->pipe_state & PIPE_EOF) {
pipeunlock(wpipe);
error = EPIPE;
break;
}
#ifndef PIPE_NODIRECT
/*
* If the transfer is large, we can gain performance if
* we do process-to-process copies directly.
* If the write is non-blocking, we don't use the
* direct write mechanism.
*
* The direct write mechanism will detect the reader going
* away on us.
*/
if (uio->uio_segflg == UIO_USERSPACE &&
uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
(fp->f_flag & FNONBLOCK) == 0) {
pipeunlock(wpipe);
error = pipe_direct_write(wpipe, uio);
if (error)
break;
continue;
}
#endif
/*
* Pipe buffered writes cannot be coincident with
* direct writes. We wait until the currently executing
* direct write is completed before we start filling the
* pipe buffer. We break out if a signal occurs or the
* reader goes away.
*/
if (wpipe->pipe_state & PIPE_DIRECTW) {
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
"pipbww", 0);
if (error)
break;
else
continue;
}
space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
/* Writes of size <= PIPE_BUF must be atomic. */
if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
space = 0;
if (space > 0) {
int size; /* Transfer size */
int segsize; /* first segment to transfer */
/*
* Transfer size is minimum of uio transfer
* and free space in pipe buffer.
*/
if (space > uio->uio_resid)
size = uio->uio_resid;
else
size = space;
/*
* First segment to transfer is minimum of
* transfer size and contiguous space in
* pipe buffer. If first segment to transfer
* is less than the transfer size, we've got
* a wraparound in the buffer.
*/
segsize = wpipe->pipe_buffer.size -
wpipe->pipe_buffer.in;
if (segsize > size)
segsize = size;
/* Transfer first segment */
PIPE_UNLOCK(rpipe);
error = uiomove(&wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
segsize, uio);
PIPE_LOCK(rpipe);
if (error == 0 && segsize < size) {
KASSERT(wpipe->pipe_buffer.in + segsize ==
wpipe->pipe_buffer.size,
("Pipe buffer wraparound disappeared"));
/*
* Transfer remaining part now, to
* support atomic writes. Wraparound
* happened.
*/
PIPE_UNLOCK(rpipe);
error = uiomove(
&wpipe->pipe_buffer.buffer[0],
size - segsize, uio);
PIPE_LOCK(rpipe);
}
if (error == 0) {
wpipe->pipe_buffer.in += size;
if (wpipe->pipe_buffer.in >=
wpipe->pipe_buffer.size) {
KASSERT(wpipe->pipe_buffer.in ==
size - segsize +
wpipe->pipe_buffer.size,
("Expected wraparound bad"));
wpipe->pipe_buffer.in = size - segsize;
}
wpipe->pipe_buffer.cnt += size;
KASSERT(wpipe->pipe_buffer.cnt <=
wpipe->pipe_buffer.size,
("Pipe buffer overflow"));
}
pipeunlock(wpipe);
if (error != 0)
break;
} else {
/*
* If the "read-side" has been blocked, wake it up now.
*/
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
/*
* don't block on non-blocking I/O
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
pipeunlock(wpipe);
break;
}
/*
* We have no more space and have something to offer,
* wake up select/poll.
*/
pipeselwakeup(wpipe);
wpipe->pipe_state |= PIPE_WANTW;
pipeunlock(wpipe);
error = msleep(wpipe, PIPE_MTX(rpipe),
PRIBIO | PCATCH, "pipewr", 0);
if (error != 0)
break;
}
}
pipelock(wpipe, 0);
--wpipe->pipe_busy;
if ((wpipe->pipe_busy == 0) && (wpipe->pipe_state & PIPE_WANT)) {
wpipe->pipe_state &= ~(PIPE_WANT | PIPE_WANTR);
wakeup(wpipe);
} else if (wpipe->pipe_buffer.cnt > 0) {
/*
* If we have put any characters in the buffer, we wake up
* the reader.
*/
if (wpipe->pipe_state & PIPE_WANTR) {
wpipe->pipe_state &= ~PIPE_WANTR;
wakeup(wpipe);
}
}
/*
* Don't return EPIPE if I/O was successful
*/
if ((wpipe->pipe_buffer.cnt == 0) &&
(uio->uio_resid == 0) &&
(error == EPIPE)) {
error = 0;
}
if (error == 0)
vfs_timestamp(&wpipe->pipe_mtime);
/*
* We have something to offer,
* wake up select/poll.
*/
if (wpipe->pipe_buffer.cnt)
pipeselwakeup(wpipe);
pipeunlock(wpipe);
PIPE_UNLOCK(rpipe);
return (error);
}
/* ARGSUSED */
static int
pipe_truncate(fp, length, active_cred, td)
struct file *fp;
off_t length;
struct ucred *active_cred;
struct thread *td;
{
return (EINVAL);
}
/*
* we implement a very minimal set of ioctls for compatibility with sockets.
*/
static int
pipe_ioctl(fp, cmd, data, active_cred, td)
struct file *fp;
u_long cmd;
void *data;
struct ucred *active_cred;
struct thread *td;
{
struct pipe *mpipe = fp->f_data;
int error;
PIPE_LOCK(mpipe);
#ifdef MAC
error = mac_pipe_check_ioctl(active_cred, mpipe->pipe_pair, cmd, data);
if (error) {
PIPE_UNLOCK(mpipe);
return (error);
}
#endif
error = 0;
switch (cmd) {
case FIONBIO:
break;
case FIOASYNC:
if (*(int *)data) {
mpipe->pipe_state |= PIPE_ASYNC;
} else {
mpipe->pipe_state &= ~PIPE_ASYNC;
}
break;
case FIONREAD:
if (mpipe->pipe_state & PIPE_DIRECTW)
*(int *)data = mpipe->pipe_map.cnt;
else
*(int *)data = mpipe->pipe_buffer.cnt;
break;
case FIOSETOWN:
PIPE_UNLOCK(mpipe);
error = fsetown(*(int *)data, &mpipe->pipe_sigio);
goto out_unlocked;
case FIOGETOWN:
*(int *)data = fgetown(&mpipe->pipe_sigio);
break;
/* This is deprecated, FIOSETOWN should be used instead. */
case TIOCSPGRP:
PIPE_UNLOCK(mpipe);
error = fsetown(-(*(int *)data), &mpipe->pipe_sigio);
goto out_unlocked;
/* This is deprecated, FIOGETOWN should be used instead. */
case TIOCGPGRP:
*(int *)data = -fgetown(&mpipe->pipe_sigio);
break;
default:
error = ENOTTY;
break;
}
PIPE_UNLOCK(mpipe);
out_unlocked:
return (error);
}
static int
pipe_poll(fp, events, active_cred, td)
struct file *fp;
int events;
struct ucred *active_cred;
struct thread *td;
{
struct pipe *rpipe = fp->f_data;
struct pipe *wpipe;
int revents = 0;
#ifdef MAC
int error;
#endif
wpipe = rpipe->pipe_peer;
PIPE_LOCK(rpipe);
#ifdef MAC
error = mac_pipe_check_poll(active_cred, rpipe->pipe_pair);
if (error)
goto locked_error;
#endif
if (events & (POLLIN | POLLRDNORM))
if ((rpipe->pipe_state & PIPE_DIRECTW) ||
(rpipe->pipe_buffer.cnt > 0))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF) ||
(((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
(wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF))
revents |= events & (POLLOUT | POLLWRNORM);
if ((events & POLLINIGNEOF) == 0) {
if (rpipe->pipe_state & PIPE_EOF) {
revents |= (events & (POLLIN | POLLRDNORM));
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF))
revents |= POLLHUP;
}
}
if (revents == 0) {
if (events & (POLLIN | POLLRDNORM)) {
selrecord(td, &rpipe->pipe_sel);
if (SEL_WAITING(&rpipe->pipe_sel))
rpipe->pipe_state |= PIPE_SEL;
}
if (events & (POLLOUT | POLLWRNORM)) {
selrecord(td, &wpipe->pipe_sel);
if (SEL_WAITING(&wpipe->pipe_sel))
wpipe->pipe_state |= PIPE_SEL;
}
}
#ifdef MAC
locked_error:
#endif
PIPE_UNLOCK(rpipe);
return (revents);
}
/*
* We shouldn't need locks here as we're doing a read and this should
* be a natural race.
*/
static int
pipe_stat(fp, ub, active_cred, td)
struct file *fp;
struct stat *ub;
struct ucred *active_cred;
struct thread *td;
{
struct pipe *pipe = fp->f_data;
#ifdef MAC
int error;
PIPE_LOCK(pipe);
error = mac_pipe_check_stat(active_cred, pipe->pipe_pair);
PIPE_UNLOCK(pipe);
if (error)
return (error);
#endif
bzero(ub, sizeof(*ub));
ub->st_mode = S_IFIFO;
ub->st_blksize = PAGE_SIZE;
if (pipe->pipe_state & PIPE_DIRECTW)
ub->st_size = pipe->pipe_map.cnt;
else
ub->st_size = pipe->pipe_buffer.cnt;
ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
ub->st_atim = pipe->pipe_atime;
ub->st_mtim = pipe->pipe_mtime;
ub->st_ctim = pipe->pipe_ctime;
ub->st_uid = fp->f_cred->cr_uid;
ub->st_gid = fp->f_cred->cr_gid;
/*
* Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
* XXX (st_dev, st_ino) should be unique.
*/
return (0);
}
/* ARGSUSED */
static int
pipe_close(fp, td)
struct file *fp;
struct thread *td;
{
struct pipe *cpipe = fp->f_data;
fp->f_ops = &badfileops;
fp->f_data = NULL;
funsetown(&cpipe->pipe_sigio);
pipeclose(cpipe);
return (0);
}
static void
pipe_free_kmem(cpipe)
struct pipe *cpipe;
{
KASSERT(!mtx_owned(PIPE_MTX(cpipe)),
("pipe_free_kmem: pipe mutex locked"));
if (cpipe->pipe_buffer.buffer != NULL) {
atomic_subtract_long(&amountpipekva, cpipe->pipe_buffer.size);
vm_map_remove(pipe_map,
(vm_offset_t)cpipe->pipe_buffer.buffer,
(vm_offset_t)cpipe->pipe_buffer.buffer + cpipe->pipe_buffer.size);
cpipe->pipe_buffer.buffer = NULL;
}
#ifndef PIPE_NODIRECT
{
cpipe->pipe_map.cnt = 0;
cpipe->pipe_map.pos = 0;
cpipe->pipe_map.npages = 0;
}
#endif
}
/*
* shutdown the pipe
*/
static void
pipeclose(cpipe)
struct pipe *cpipe;
{
struct pipepair *pp;
struct pipe *ppipe;
KASSERT(cpipe != NULL, ("pipeclose: cpipe == NULL"));
PIPE_LOCK(cpipe);
pipelock(cpipe, 0);
pp = cpipe->pipe_pair;
pipeselwakeup(cpipe);
/*
* If the other side is blocked, wake it up saying that
* we want to close it down.
*/
cpipe->pipe_state |= PIPE_EOF;
while (cpipe->pipe_busy) {
wakeup(cpipe);
cpipe->pipe_state |= PIPE_WANT;
pipeunlock(cpipe);
msleep(cpipe, PIPE_MTX(cpipe), PRIBIO, "pipecl", 0);
pipelock(cpipe, 0);
}
/*
* Disconnect from peer, if any.
*/
ppipe = cpipe->pipe_peer;
if (ppipe->pipe_present == PIPE_ACTIVE) {
pipeselwakeup(ppipe);
ppipe->pipe_state |= PIPE_EOF;
wakeup(ppipe);
KNOTE_LOCKED(&ppipe->pipe_sel.si_note, 0);
}
/*
* Mark this endpoint as free. Release kmem resources. We
* don't mark this endpoint as unused until we've finished
* doing that, or the pipe might disappear out from under
* us.
*/
PIPE_UNLOCK(cpipe);
pipe_free_kmem(cpipe);
PIPE_LOCK(cpipe);
cpipe->pipe_present = PIPE_CLOSING;
pipeunlock(cpipe);
/*
* knlist_clear() may sleep, dropping the PIPE_MTX. Set
* PIPE_FINALIZED, which allows the other end to free the
* pipe_pair, only after the knotes are completely dismantled.
*/
knlist_clear(&cpipe->pipe_sel.si_note, 1);
cpipe->pipe_present = PIPE_FINALIZED;
seldrain(&cpipe->pipe_sel);
knlist_destroy(&cpipe->pipe_sel.si_note);
/*
* If both endpoints are now closed, release the memory for the
* pipe pair. If not, unlock.
*/
if (ppipe->pipe_present == PIPE_FINALIZED) {
PIPE_UNLOCK(cpipe);
#ifdef MAC
mac_pipe_destroy(pp);
#endif
uma_zfree(pipe_zone, cpipe->pipe_pair);
} else
PIPE_UNLOCK(cpipe);
}
/*ARGSUSED*/
static int
pipe_kqfilter(struct file *fp, struct knote *kn)
{
struct pipe *cpipe;
cpipe = kn->kn_fp->f_data;
PIPE_LOCK(cpipe);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &pipe_rfiltops;
break;
case EVFILT_WRITE:
kn->kn_fop = &pipe_wfiltops;
if (cpipe->pipe_peer->pipe_present != PIPE_ACTIVE) {
/* other end of pipe has been closed */
PIPE_UNLOCK(cpipe);
return (EPIPE);
}
cpipe = cpipe->pipe_peer;
break;
default:
PIPE_UNLOCK(cpipe);
return (EINVAL);
}
knlist_add(&cpipe->pipe_sel.si_note, kn, 1);
PIPE_UNLOCK(cpipe);
return (0);
}
static void
filt_pipedetach(struct knote *kn)
{
struct pipe *cpipe = (struct pipe *)kn->kn_fp->f_data;
PIPE_LOCK(cpipe);
if (kn->kn_filter == EVFILT_WRITE)
cpipe = cpipe->pipe_peer;
knlist_remove(&cpipe->pipe_sel.si_note, kn, 1);
PIPE_UNLOCK(cpipe);
}
/*ARGSUSED*/
static int
filt_piperead(struct knote *kn, long hint)
{
struct pipe *rpipe = kn->kn_fp->f_data;
struct pipe *wpipe = rpipe->pipe_peer;
int ret;
PIPE_LOCK(rpipe);
kn->kn_data = rpipe->pipe_buffer.cnt;
if ((kn->kn_data == 0) && (rpipe->pipe_state & PIPE_DIRECTW))
kn->kn_data = rpipe->pipe_map.cnt;
if ((rpipe->pipe_state & PIPE_EOF) ||
wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF)) {
kn->kn_flags |= EV_EOF;
PIPE_UNLOCK(rpipe);
return (1);
}
ret = kn->kn_data > 0;
PIPE_UNLOCK(rpipe);
return (ret);
}
/*ARGSUSED*/
static int
filt_pipewrite(struct knote *kn, long hint)
{
struct pipe *rpipe = kn->kn_fp->f_data;
struct pipe *wpipe = rpipe->pipe_peer;
PIPE_LOCK(rpipe);
if (wpipe->pipe_present != PIPE_ACTIVE ||
(wpipe->pipe_state & PIPE_EOF)) {
kn->kn_data = 0;
kn->kn_flags |= EV_EOF;
PIPE_UNLOCK(rpipe);
return (1);
}
kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
if (wpipe->pipe_state & PIPE_DIRECTW)
kn->kn_data = 0;
PIPE_UNLOCK(rpipe);
return (kn->kn_data >= PIPE_BUF);
}
Index: head/sys/kern/sys_procdesc.c
===================================================================
--- head/sys/kern/sys_procdesc.c (revision 225616)
+++ head/sys/kern/sys_procdesc.c (revision 225617)
@@ -1,524 +1,524 @@
/*-
* Copyright (c) 2009 Robert N. M. Watson
* All rights reserved.
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* FreeBSD process descriptor facility.
*
* Some processes are represented by a file descriptor, which will be used in
* preference to signaling and pids for the purposes of process management,
* and is, in effect, a form of capability. When a process descriptor is
* used with a process, it ceases to be visible to certain traditional UNIX
* process facilities, such as waitpid(2).
*
* Some semantics:
*
* - At most one process descriptor will exist for any process, although
* references to that descriptor may be held from many processes (or even
* be in flight between processes over a local domain socket).
* - Last close on the process descriptor will terminate the process using
* SIGKILL and reparent it to init so that there's a process to reap it
* when it's done exiting.
* - If the process exits before the descriptor is closed, it will not
* generate SIGCHLD on termination, or be picked up by waitpid().
* - The pdkill(2) system call may be used to deliver a signal to the process
* using its process descriptor.
* - The pdwait4(2) system call may be used to block (or not) on a process
* descriptor to collect termination information.
*
* Open questions:
*
* - How to handle ptrace(2)?
* - Will we want to add a pidtoprocdesc(2) system call to allow process
* descriptors to be created for processes without pdfork(2)?
*/
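/*
 * Illustrative userland sketch (not part of the original source, and
 * guarded out): exercising the facility described above via pdfork(2),
 * pdgetpid(2) and pdkill(2) from <sys/procdesc.h>.  The child's work
 * (a sleep) is an arbitrary placeholder.
 */
#if 0
#include <sys/procdesc.h>
#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	pid_t child, pid;
	int fd;

	child = pdfork(&fd, 0);		/* like fork(2), but also returns a descriptor */
	if (child == -1)
		err(1, "pdfork");
	if (child == 0) {
		sleep(60);		/* child: stand-in for real work */
		_exit(0);
	}
	if (pdgetpid(fd, &pid) == 0)	/* recover the pid from the descriptor */
		printf("child pid: %d\n", (int)pid);
	pdkill(fd, SIGTERM);		/* signal the child via its descriptor */
	close(fd);			/* last close kills/reparents any survivor */
	return (0);
}
#endif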
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_procdesc.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/sysproto.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/ucred.h>
#include <security/audit/audit.h>
#include <vm/uma.h>
#ifdef PROCDESC
FEATURE(process_descriptors, "Process Descriptors");
static uma_zone_t procdesc_zone;
static fo_rdwr_t procdesc_read;
static fo_rdwr_t procdesc_write;
static fo_truncate_t procdesc_truncate;
static fo_ioctl_t procdesc_ioctl;
static fo_poll_t procdesc_poll;
static fo_kqfilter_t procdesc_kqfilter;
static fo_stat_t procdesc_stat;
static fo_close_t procdesc_close;
static fo_chmod_t procdesc_chmod;
static fo_chown_t procdesc_chown;
static struct fileops procdesc_ops = {
.fo_read = procdesc_read,
.fo_write = procdesc_write,
.fo_truncate = procdesc_truncate,
.fo_ioctl = procdesc_ioctl,
.fo_poll = procdesc_poll,
.fo_kqfilter = procdesc_kqfilter,
.fo_stat = procdesc_stat,
.fo_close = procdesc_close,
.fo_chmod = procdesc_chmod,
.fo_chown = procdesc_chown,
.fo_flags = DFLAG_PASSABLE,
};
/*
* Initialize with VFS so that process descriptors are available along with
* other file descriptor types. As long as it runs before init(8) starts,
* there shouldn't be a problem.
*/
static void
procdesc_init(void *dummy __unused)
{
procdesc_zone = uma_zcreate("procdesc", sizeof(struct procdesc),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
if (procdesc_zone == NULL)
panic("procdesc_init: procdesc_zone not initialized");
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, procdesc_init, NULL);
/*
* Return a locked process given a process descriptor, or ESRCH if it has
* died.
*/
int
procdesc_find(struct thread *td, int fd, cap_rights_t rights,
struct proc **p)
{
struct procdesc *pd;
struct file *fp;
int error;
error = fget(td, fd, rights, &fp);
if (error)
return (error);
if (fp->f_type != DTYPE_PROCDESC) {
error = EBADF;
goto out;
}
pd = fp->f_data;
sx_slock(&proctree_lock);
if (pd->pd_proc != NULL) {
*p = pd->pd_proc;
PROC_LOCK(*p);
} else
error = ESRCH;
sx_sunlock(&proctree_lock);
out:
fdrop(fp, td);
return (error);
}
/*
* Function to be used by procstat(1) sysctls when returning procdesc
* information.
*/
pid_t
procdesc_pid(struct file *fp_procdesc)
{
struct procdesc *pd;
KASSERT(fp_procdesc->f_type == DTYPE_PROCDESC,
("procdesc_pid: !procdesc"));
pd = fp_procdesc->f_data;
return (pd->pd_pid);
}
/*
* Retrieve the PID associated with a process descriptor.
*/
int
kern_pdgetpid(struct thread *td, int fd, cap_rights_t rights, pid_t *pidp)
{
struct file *fp;
int error;
error = fget(td, fd, rights, &fp);
if (error)
return (error);
if (fp->f_type != DTYPE_PROCDESC) {
error = EBADF;
goto out;
}
*pidp = procdesc_pid(fp);
out:
fdrop(fp, td);
return (error);
}
/*
* System call to return the pid of a process given its process descriptor.
*/
int
-pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
{
pid_t pid;
int error;
AUDIT_ARG_FD(uap->fd);
error = kern_pdgetpid(td, uap->fd, CAP_PDGETPID, &pid);
if (error == 0)
error = copyout(&pid, uap->pidp, sizeof(pid));
return (error);
}
/*
* When a new process is forked by pdfork(), a file descriptor is allocated
* by the fork code first, then the process is forked, and then we get a
* chance to set up the process descriptor. Failure is not permitted at this
* point, so procdesc_new() must succeed.
*/
void
procdesc_new(struct proc *p, int flags)
{
struct procdesc *pd;
pd = uma_zalloc(procdesc_zone, M_WAITOK | M_ZERO);
pd->pd_proc = p;
pd->pd_pid = p->p_pid;
p->p_procdesc = pd;
pd->pd_flags = 0;
if (flags & PD_DAEMON)
pd->pd_flags |= PDF_DAEMON;
PROCDESC_LOCK_INIT(pd);
/*
* Process descriptors start out with two references: one from their
* struct file, and the other from their struct proc.
*/
refcount_init(&pd->pd_refcount, 2);
}
/*
* Initialize a file with a process descriptor.
*/
void
procdesc_finit(struct procdesc *pdp, struct file *fp)
{
finit(fp, FREAD | FWRITE, DTYPE_PROCDESC, pdp, &procdesc_ops);
}
static void
procdesc_free(struct procdesc *pd)
{
/*
* When the last reference is released, we assert that the descriptor
* has been closed, but not that the process has exited, as we will
* detach the descriptor before the process dies if the descriptor is
* closed, since we can't wait synchronously.
*/
if (refcount_release(&pd->pd_refcount)) {
KASSERT(pd->pd_proc == NULL,
("procdesc_free: pd_proc != NULL"));
KASSERT((pd->pd_flags & PDF_CLOSED),
("procdesc_free: !PDF_CLOSED"));
PROCDESC_LOCK_DESTROY(pd);
uma_zfree(procdesc_zone, pd);
}
}
/*
* procdesc_exit() - notify a process descriptor that its process is exiting.
* We use the proctree_lock to ensure that process exit either happens
* strictly before or strictly after a concurrent call to procdesc_close().
*/
int
procdesc_exit(struct proc *p)
{
struct procdesc *pd;
sx_assert(&proctree_lock, SA_XLOCKED);
PROC_LOCK_ASSERT(p, MA_OWNED);
KASSERT(p->p_procdesc != NULL, ("procdesc_exit: p_procdesc NULL"));
pd = p->p_procdesc;
PROCDESC_LOCK(pd);
KASSERT((pd->pd_flags & PDF_CLOSED) == 0 || p->p_pptr == initproc,
("procdesc_exit: closed && parent not init"));
pd->pd_flags |= PDF_EXITED;
/*
* If the process descriptor has been closed, then we have nothing
* to do; return 1 so that init will get SIGCHLD and do the reaping.
* Clean up the procdesc now rather than letting it happen during
* that reap.
*/
if (pd->pd_flags & PDF_CLOSED) {
PROCDESC_UNLOCK(pd);
pd->pd_proc = NULL;
p->p_procdesc = NULL;
procdesc_free(pd);
return (1);
}
if (pd->pd_flags & PDF_SELECTED) {
pd->pd_flags &= ~PDF_SELECTED;
selwakeup(&pd->pd_selinfo);
}
PROCDESC_UNLOCK(pd);
return (0);
}
/*
* When a process descriptor is reaped, perhaps as a result of close() or
* pdwait4(), release the process's reference on the process descriptor.
*/
void
procdesc_reap(struct proc *p)
{
struct procdesc *pd;
sx_assert(&proctree_lock, SA_XLOCKED);
KASSERT(p->p_procdesc != NULL, ("procdesc_reap: p_procdesc == NULL"));
pd = p->p_procdesc;
pd->pd_proc = NULL;
procdesc_free(pd);
}
/*
* procdesc_close() - last close on a process descriptor. If the process is
* still running, terminate with SIGKILL (unless PD_DAEMON is set) and let
* init(8) clean up the mess; if not, we have to clean up the zombie ourselves.
*/
static int
procdesc_close(struct file *fp, struct thread *td)
{
struct procdesc *pd;
struct proc *p;
KASSERT(fp->f_type == DTYPE_PROCDESC, ("procdesc_close: !procdesc"));
pd = fp->f_data;
fp->f_ops = &badfileops;
fp->f_data = NULL;
sx_xlock(&proctree_lock);
PROCDESC_LOCK(pd);
pd->pd_flags |= PDF_CLOSED;
PROCDESC_UNLOCK(pd);
p = pd->pd_proc;
PROC_LOCK(p);
if (p->p_state == PRS_ZOMBIE) {
/*
* If the process is already dead and just awaiting reaping,
* do that now. This will release the process's reference to
* the process descriptor when it calls back into
* procdesc_reap().
*/
PROC_SLOCK(p);
proc_reap(curthread, p, NULL, 0, NULL);
} else {
/*
* If the process is not yet dead, we need to kill it, but we
* can't wait around synchronously for it to go away, as that
* path leads to madness (and deadlocks). First, detach the
* process from its descriptor so that its exit status will
* be reported normally.
*/
pd->pd_proc = NULL;
p->p_procdesc = NULL;
procdesc_free(pd);
/*
* Next, reparent it to init(8) so that there's someone to
* pick up the pieces; finally, terminate with prejudice.
*/
p->p_sigparent = SIGCHLD;
proc_reparent(p, initproc);
if ((pd->pd_flags & PD_DAEMON) == 0)
- psignal(p, SIGKILL);
+ kern_psignal(p, SIGKILL);
PROC_UNLOCK(p);
sx_xunlock(&proctree_lock);
}
/*
* Release the file descriptor's reference on the process descriptor.
*/
procdesc_free(pd);
return (0);
}
static int
procdesc_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct procdesc *pd;
int revents;
revents = 0;
pd = fp->f_data;
PROCDESC_LOCK(pd);
if (pd->pd_flags & PDF_EXITED)
revents |= POLLHUP;
if (revents == 0) {
selrecord(td, &pd->pd_selinfo);
pd->pd_flags |= PDF_SELECTED;
}
PROCDESC_UNLOCK(pd);
return (revents);
}
static int
procdesc_kqfilter(struct file *fp, struct knote *kn)
{
return (EOPNOTSUPP);
}
static int
procdesc_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct procdesc *pd;
struct timeval pstart;
/*
* XXXRW: Perhaps we should cache some more information from the
* process so that we can return it reliably here even after it has
* died. For example, caching its credential data.
*/
bzero(sb, sizeof(*sb));
pd = fp->f_data;
sx_slock(&proctree_lock);
if (pd->pd_proc != NULL) {
PROC_LOCK(pd->pd_proc);
/* Set birth and [acm] times to process start time. */
pstart = pd->pd_proc->p_stats->p_start;
timevaladd(&pstart, &boottime);
TIMEVAL_TO_TIMESPEC(&pstart, &sb->st_birthtim);
sb->st_atim = sb->st_birthtim;
sb->st_ctim = sb->st_birthtim;
sb->st_mtim = sb->st_birthtim;
if (pd->pd_proc->p_state != PRS_ZOMBIE)
sb->st_mode = S_IFREG | S_IRWXU;
else
sb->st_mode = S_IFREG;
sb->st_uid = pd->pd_proc->p_ucred->cr_ruid;
sb->st_gid = pd->pd_proc->p_ucred->cr_rgid;
PROC_UNLOCK(pd->pd_proc);
} else
sb->st_mode = S_IFREG;
sx_sunlock(&proctree_lock);
return (0);
}
static int
procdesc_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
procdesc_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
#else /* !PROCDESC */
int
-pdgetpid(struct thread *td, struct pdgetpid_args *uap)
+sys_pdgetpid(struct thread *td, struct pdgetpid_args *uap)
{
return (ENOSYS);
}
#endif /* PROCDESC */
Index: head/sys/kern/sys_process.c
===================================================================
--- head/sys/kern/sys_process.c (revision 225616)
+++ head/sys/kern/sys_process.c (revision 225617)
@@ -1,1242 +1,1242 @@
/*-
* Copyright (c) 1994, Sean Eric Fagan
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Sean Eric Fagan.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/ptrace.h>
#include <sys/sx.h>
#include <sys/malloc.h>
#include <sys/signalvar.h>
#include <machine/reg.h>
#include <security/audit/audit.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef COMPAT_FREEBSD32
#include <sys/procfs.h>
#include <compat/freebsd32/freebsd32_signal.h>
struct ptrace_io_desc32 {
int piod_op;
uint32_t piod_offs;
uint32_t piod_addr;
uint32_t piod_len;
};
struct ptrace_vm_entry32 {
int pve_entry;
int pve_timestamp;
uint32_t pve_start;
uint32_t pve_end;
uint32_t pve_offset;
u_int pve_prot;
u_int pve_pathlen;
int32_t pve_fileid;
u_int pve_fsid;
uint32_t pve_path;
};
struct ptrace_lwpinfo32 {
lwpid_t pl_lwpid; /* LWP described. */
int pl_event; /* Event that stopped the LWP. */
int pl_flags; /* LWP flags. */
sigset_t pl_sigmask; /* LWP signal mask */
sigset_t pl_siglist; /* LWP pending signal */
struct siginfo32 pl_siginfo; /* siginfo for signal */
char pl_tdname[MAXCOMLEN + 1]; /* LWP name. */
int pl_child_pid; /* New child pid */
};
#endif
/*
* Functions implemented using PROC_ACTION():
*
* proc_read_regs(proc, regs)
* Get the current user-visible register set from the process
* and copy it into the regs structure (<machine/reg.h>).
* The process is stopped at the time read_regs is called.
*
* proc_write_regs(proc, regs)
* Update the current register set from the passed in regs
* structure. Take care to avoid clobbering special CPU
* registers or privileged bits in the PSL.
* Depending on the architecture this may have fix-up work to do,
* especially if the IAR or PCW are modified.
* The process is stopped at the time write_regs is called.
*
* proc_read_fpregs, proc_write_fpregs
* deal with the floating point register set, otherwise as above.
*
* proc_read_dbregs, proc_write_dbregs
* deal with the processor debug register set, otherwise as above.
*
* proc_sstep(proc)
* Arrange for the process to trap after executing a single instruction.
*/
#define PROC_ACTION(action) do { \
int error; \
\
PROC_LOCK_ASSERT(td->td_proc, MA_OWNED); \
if ((td->td_proc->p_flag & P_INMEM) == 0) \
error = EIO; \
else \
error = (action); \
return (error); \
} while(0)
int
proc_read_regs(struct thread *td, struct reg *regs)
{
PROC_ACTION(fill_regs(td, regs));
}
int
proc_write_regs(struct thread *td, struct reg *regs)
{
PROC_ACTION(set_regs(td, regs));
}
int
proc_read_dbregs(struct thread *td, struct dbreg *dbregs)
{
PROC_ACTION(fill_dbregs(td, dbregs));
}
int
proc_write_dbregs(struct thread *td, struct dbreg *dbregs)
{
PROC_ACTION(set_dbregs(td, dbregs));
}
/*
* Ptrace doesn't support fpregs at all, and there are no security holes
* or translations for fpregs, so we can just copy them.
*/
int
proc_read_fpregs(struct thread *td, struct fpreg *fpregs)
{
PROC_ACTION(fill_fpregs(td, fpregs));
}
int
proc_write_fpregs(struct thread *td, struct fpreg *fpregs)
{
PROC_ACTION(set_fpregs(td, fpregs));
}
#ifdef COMPAT_FREEBSD32
/* For 32 bit binaries, we need to expose the 32 bit regs layouts. */
int
proc_read_regs32(struct thread *td, struct reg32 *regs32)
{
PROC_ACTION(fill_regs32(td, regs32));
}
int
proc_write_regs32(struct thread *td, struct reg32 *regs32)
{
PROC_ACTION(set_regs32(td, regs32));
}
int
proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
{
PROC_ACTION(fill_dbregs32(td, dbregs32));
}
int
proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
{
PROC_ACTION(set_dbregs32(td, dbregs32));
}
int
proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
{
PROC_ACTION(fill_fpregs32(td, fpregs32));
}
int
proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
{
PROC_ACTION(set_fpregs32(td, fpregs32));
}
#endif
int
proc_sstep(struct thread *td)
{
PROC_ACTION(ptrace_single_step(td));
}
int
proc_rwmem(struct proc *p, struct uio *uio)
{
vm_map_t map;
vm_offset_t pageno; /* page number */
vm_prot_t reqprot;
int error, fault_flags, page_offset, writing;
/*
* Assert that someone has locked this vmspace. (Should be
* curthread but we can't assert that.) This keeps the process
* from exiting out from under us until this operation completes.
*/
KASSERT(p->p_lock >= 1, ("%s: process %p (pid %d) not held", __func__,
p, p->p_pid));
/*
* The map we want...
*/
map = &p->p_vmspace->vm_map;
/*
* If we are writing, then we request vm_fault() to create a private
* copy of each page. Since these copies will not be writeable by the
* process, we must explicitly request that they be dirtied.
*/
writing = uio->uio_rw == UIO_WRITE;
reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
/*
* Only map in one page at a time. We don't have to, but it
* makes things easier. This way is trivial - right?
*/
do {
vm_offset_t uva;
u_int len;
vm_page_t m;
uva = (vm_offset_t)uio->uio_offset;
/*
* Get the page number of this segment.
*/
pageno = trunc_page(uva);
page_offset = uva - pageno;
/*
* How many bytes to copy
*/
len = min(PAGE_SIZE - page_offset, uio->uio_resid);
/*
* Fault and hold the page on behalf of the process.
*/
error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
if (error != KERN_SUCCESS) {
if (error == KERN_RESOURCE_SHORTAGE)
error = ENOMEM;
else
error = EFAULT;
break;
}
/*
* Now do the i/o move.
*/
error = uiomove_fromphys(&m, page_offset, len, uio);
/* Make the I-cache coherent for breakpoints. */
if (writing && error == 0) {
vm_map_lock_read(map);
if (vm_map_check_protection(map, pageno, pageno +
PAGE_SIZE, VM_PROT_EXECUTE))
vm_sync_icache(map, uva, len);
vm_map_unlock_read(map);
}
/*
* Release the page.
*/
vm_page_lock(m);
vm_page_unhold(m);
vm_page_unlock(m);
} while (error == 0 && uio->uio_resid > 0);
return (error);
}
static int
ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve)
{
struct vattr vattr;
vm_map_t map;
vm_map_entry_t entry;
vm_object_t obj, tobj, lobj;
struct vmspace *vm;
struct vnode *vp;
char *freepath, *fullpath;
u_int pathlen;
int error, index, vfslocked;
error = 0;
obj = NULL;
vm = vmspace_acquire_ref(p);
map = &vm->vm_map;
vm_map_lock_read(map);
do {
entry = map->header.next;
index = 0;
while (index < pve->pve_entry && entry != &map->header) {
entry = entry->next;
index++;
}
if (index != pve->pve_entry) {
error = EINVAL;
break;
}
while (entry != &map->header &&
(entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
entry = entry->next;
index++;
}
if (entry == &map->header) {
error = ENOENT;
break;
}
/* We got an entry. */
pve->pve_entry = index + 1;
pve->pve_timestamp = map->timestamp;
pve->pve_start = entry->start;
pve->pve_end = entry->end - 1;
pve->pve_offset = entry->offset;
pve->pve_prot = entry->protection;
/* Backing object's path needed? */
if (pve->pve_pathlen == 0)
break;
pathlen = pve->pve_pathlen;
pve->pve_pathlen = 0;
obj = entry->object.vm_object;
if (obj != NULL)
VM_OBJECT_LOCK(obj);
} while (0);
vm_map_unlock_read(map);
vmspace_free(vm);
pve->pve_fsid = VNOVAL;
pve->pve_fileid = VNOVAL;
if (error == 0 && obj != NULL) {
lobj = obj;
for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
if (tobj != obj)
VM_OBJECT_LOCK(tobj);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
lobj = tobj;
pve->pve_offset += tobj->backing_object_offset;
}
vp = (lobj->type == OBJT_VNODE) ? lobj->handle : NULL;
if (vp != NULL)
vref(vp);
if (lobj != obj)
VM_OBJECT_UNLOCK(lobj);
VM_OBJECT_UNLOCK(obj);
if (vp != NULL) {
freepath = NULL;
fullpath = NULL;
vn_fullpath(td, vp, &fullpath, &freepath);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) {
pve->pve_fileid = vattr.va_fileid;
pve->pve_fsid = vattr.va_fsid;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (fullpath != NULL) {
pve->pve_pathlen = strlen(fullpath) + 1;
if (pve->pve_pathlen <= pathlen) {
error = copyout(fullpath, pve->pve_path,
pve->pve_pathlen);
} else
error = ENAMETOOLONG;
}
if (freepath != NULL)
free(freepath, M_TEMP);
}
}
return (error);
}
#ifdef COMPAT_FREEBSD32
static int
ptrace_vm_entry32(struct thread *td, struct proc *p,
struct ptrace_vm_entry32 *pve32)
{
struct ptrace_vm_entry pve;
int error;
pve.pve_entry = pve32->pve_entry;
pve.pve_pathlen = pve32->pve_pathlen;
pve.pve_path = (void *)(uintptr_t)pve32->pve_path;
error = ptrace_vm_entry(td, p, &pve);
if (error == 0) {
pve32->pve_entry = pve.pve_entry;
pve32->pve_timestamp = pve.pve_timestamp;
pve32->pve_start = pve.pve_start;
pve32->pve_end = pve.pve_end;
pve32->pve_offset = pve.pve_offset;
pve32->pve_prot = pve.pve_prot;
pve32->pve_fileid = pve.pve_fileid;
pve32->pve_fsid = pve.pve_fsid;
}
pve32->pve_pathlen = pve.pve_pathlen;
return (error);
}
static void
ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl,
struct ptrace_lwpinfo32 *pl32)
{
pl32->pl_lwpid = pl->pl_lwpid;
pl32->pl_event = pl->pl_event;
pl32->pl_flags = pl->pl_flags;
pl32->pl_sigmask = pl->pl_sigmask;
pl32->pl_siglist = pl->pl_siglist;
siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
strcpy(pl32->pl_tdname, pl->pl_tdname);
pl32->pl_child_pid = pl->pl_child_pid;
}
#endif /* COMPAT_FREEBSD32 */
/*
* Process debugging system call.
*/
#ifndef _SYS_SYSPROTO_H_
struct ptrace_args {
int req;
pid_t pid;
caddr_t addr;
int data;
};
#endif
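/*
 * Illustrative userland sketch (not part of the original source, and
 * guarded out): a minimal attach/continue/detach cycle against an
 * assumed target pid, using the ptrace(2) interface implemented below.
 * PT_CONTINUE with addr == (caddr_t)1 resumes at the current PC and
 * data == 0 delivers no signal; error handling is mostly elided.
 */
#if 0
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <err.h>

static void
trace_once(pid_t target)
{
	int status;

	if (ptrace(PT_ATTACH, target, NULL, 0) == -1)
		err(1, "PT_ATTACH");
	if (waitpid(target, &status, 0) == -1)	/* wait for the SIGSTOP */
		err(1, "waitpid");
	/* PT_GETREGS, PT_IO, etc. requests would go here. */
	if (ptrace(PT_CONTINUE, target, (caddr_t)1, 0) == -1)
		err(1, "PT_CONTINUE");
	waitpid(target, &status, 0);		/* next stop or exit */
	(void)ptrace(PT_DETACH, target, (caddr_t)1, 0);
}
#endif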
#ifdef COMPAT_FREEBSD32
/*
* This CPP subterfuge is to try and reduce the number of ifdefs in
* the body of the code.
* COPYIN(uap->addr, &r.reg, sizeof r.reg);
* becomes either:
* copyin(uap->addr, &r.reg, sizeof r.reg);
* or
* copyin(uap->addr, &r.reg32, sizeof r.reg32);
* .. except this is done at runtime.
*/
#define COPYIN(u, k, s) wrap32 ? \
copyin(u, k ## 32, s ## 32) : \
copyin(u, k, s)
#define COPYOUT(k, u, s) wrap32 ? \
copyout(k ## 32, u, s ## 32) : \
copyout(k, u, s)
#else
#define COPYIN(u, k, s) copyin(u, k, s)
#define COPYOUT(k, u, s) copyout(k, u, s)
#endif
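/*
 * Illustrative expansion (not part of the original source): with
 * COMPAT_FREEBSD32 defined, a call such as
 *	COPYIN(uap->addr, &r.reg, sizeof r.reg);
 * becomes
 *	wrap32 ? copyin(uap->addr, &r.reg32, sizeof r.reg32) :
 *	    copyin(uap->addr, &r.reg, sizeof r.reg);
 * since the ## paste attaches "32" to the trailing member name and to
 * the operand of sizeof.
 */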
int
-ptrace(struct thread *td, struct ptrace_args *uap)
+sys_ptrace(struct thread *td, struct ptrace_args *uap)
{
/*
* XXX this obfuscation is to reduce stack usage, but the register
* structs may be too large to put on the stack anyway.
*/
union {
struct ptrace_io_desc piod;
struct ptrace_lwpinfo pl;
struct ptrace_vm_entry pve;
struct dbreg dbreg;
struct fpreg fpreg;
struct reg reg;
#ifdef COMPAT_FREEBSD32
struct dbreg32 dbreg32;
struct fpreg32 fpreg32;
struct reg32 reg32;
struct ptrace_io_desc32 piod32;
struct ptrace_lwpinfo32 pl32;
struct ptrace_vm_entry32 pve32;
#endif
} r;
void *addr;
int error = 0;
#ifdef COMPAT_FREEBSD32
int wrap32 = 0;
if (SV_CURPROC_FLAG(SV_ILP32))
wrap32 = 1;
#endif
AUDIT_ARG_PID(uap->pid);
AUDIT_ARG_CMD(uap->req);
AUDIT_ARG_VALUE(uap->data);
addr = &r;
switch (uap->req) {
case PT_GETREGS:
case PT_GETFPREGS:
case PT_GETDBREGS:
case PT_LWPINFO:
break;
case PT_SETREGS:
error = COPYIN(uap->addr, &r.reg, sizeof r.reg);
break;
case PT_SETFPREGS:
error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg);
break;
case PT_SETDBREGS:
error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg);
break;
case PT_IO:
error = COPYIN(uap->addr, &r.piod, sizeof r.piod);
break;
case PT_VM_ENTRY:
error = COPYIN(uap->addr, &r.pve, sizeof r.pve);
break;
default:
addr = uap->addr;
break;
}
if (error)
return (error);
error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data);
if (error)
return (error);
switch (uap->req) {
case PT_VM_ENTRY:
error = COPYOUT(&r.pve, uap->addr, sizeof r.pve);
break;
case PT_IO:
error = COPYOUT(&r.piod, uap->addr, sizeof r.piod);
break;
case PT_GETREGS:
error = COPYOUT(&r.reg, uap->addr, sizeof r.reg);
break;
case PT_GETFPREGS:
error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg);
break;
case PT_GETDBREGS:
error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg);
break;
case PT_LWPINFO:
error = copyout(&r.pl, uap->addr, uap->data);
break;
}
return (error);
}
#undef COPYIN
#undef COPYOUT
#ifdef COMPAT_FREEBSD32
/*
* PROC_READ(regs, td2, addr);
* becomes either:
* proc_read_regs(td2, addr);
* or
* proc_read_regs32(td2, addr);
* .. except this is done at runtime. There is an additional
* complication in that PROC_WRITE disallows 32 bit consumers
* from writing to 64 bit address space targets.
*/
#define PROC_READ(w, t, a) wrap32 ? \
proc_read_ ## w ## 32(t, a) : \
proc_read_ ## w (t, a)
#define PROC_WRITE(w, t, a) wrap32 ? \
(safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \
proc_write_ ## w (t, a)
#else
#define PROC_READ(w, t, a) proc_read_ ## w (t, a)
#define PROC_WRITE(w, t, a) proc_write_ ## w (t, a)
#endif
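/*
 * Illustrative expansion (not part of the original source): with
 * COMPAT_FREEBSD32 defined,
 *	error = PROC_WRITE(regs, td2, addr);
 * becomes
 *	error = wrap32 ? (safe ? proc_write_regs32(td2, addr) : EINVAL) :
 *	    proc_write_regs(td2, addr);
 * so a 32 bit debugger attempting to write the registers of a 64 bit
 * target (safe == 0) gets EINVAL.
 */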
int
kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
{
struct iovec iov;
struct uio uio;
struct proc *curp, *p, *pp;
struct thread *td2 = NULL;
struct ptrace_io_desc *piod = NULL;
struct ptrace_lwpinfo *pl;
int error, write, tmp, num;
int proctree_locked = 0;
lwpid_t tid = 0, *buf;
#ifdef COMPAT_FREEBSD32
int wrap32 = 0, safe = 0;
struct ptrace_io_desc32 *piod32 = NULL;
struct ptrace_lwpinfo32 *pl32 = NULL;
struct ptrace_lwpinfo plr;
#endif
curp = td->td_proc;
/* Lock proctree before locking the process. */
switch (req) {
case PT_TRACE_ME:
case PT_ATTACH:
case PT_STEP:
case PT_CONTINUE:
case PT_TO_SCE:
case PT_TO_SCX:
case PT_SYSCALL:
case PT_FOLLOW_FORK:
case PT_DETACH:
sx_xlock(&proctree_lock);
proctree_locked = 1;
break;
default:
break;
}
write = 0;
if (req == PT_TRACE_ME) {
p = td->td_proc;
PROC_LOCK(p);
} else {
if (pid <= PID_MAX) {
if ((p = pfind(pid)) == NULL) {
if (proctree_locked)
sx_xunlock(&proctree_lock);
return (ESRCH);
}
} else {
td2 = tdfind(pid, -1);
if (td2 == NULL) {
if (proctree_locked)
sx_xunlock(&proctree_lock);
return (ESRCH);
}
p = td2->td_proc;
tid = pid;
pid = p->p_pid;
}
}
AUDIT_ARG_PROCESS(p);
if ((p->p_flag & P_WEXIT) != 0) {
error = ESRCH;
goto fail;
}
if ((error = p_cansee(td, p)) != 0)
goto fail;
if ((error = p_candebug(td, p)) != 0)
goto fail;
/*
* System processes can't be debugged.
*/
if ((p->p_flag & P_SYSTEM) != 0) {
error = EINVAL;
goto fail;
}
if (tid == 0) {
if ((p->p_flag & P_STOPPED_TRACE) != 0) {
KASSERT(p->p_xthread != NULL, ("NULL p_xthread"));
td2 = p->p_xthread;
} else {
td2 = FIRST_THREAD_IN_PROC(p);
}
tid = td2->td_tid;
}
#ifdef COMPAT_FREEBSD32
/*
* Test if we're a 32 bit client and what the target is.
* Set the wrap controls accordingly.
*/
if (SV_CURPROC_FLAG(SV_ILP32)) {
if (SV_PROC_FLAG(td2->td_proc, SV_ILP32))
safe = 1;
wrap32 = 1;
}
#endif
/*
* Permissions check
*/
switch (req) {
case PT_TRACE_ME:
/* Always legal. */
break;
case PT_ATTACH:
/* Self */
if (p->p_pid == td->td_proc->p_pid) {
error = EINVAL;
goto fail;
}
/* Already traced */
if (p->p_flag & P_TRACED) {
error = EBUSY;
goto fail;
}
/* Can't trace an ancestor if you're being traced. */
if (curp->p_flag & P_TRACED) {
for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) {
if (pp == p) {
error = EINVAL;
goto fail;
}
}
}
/* OK */
break;
case PT_CLEARSTEP:
/* Allow thread to clear single step for itself */
if (td->td_tid == tid)
break;
/* FALLTHROUGH */
default:
/* not being traced... */
if ((p->p_flag & P_TRACED) == 0) {
error = EPERM;
goto fail;
}
/* not being traced by YOU */
if (p->p_pptr != td->td_proc) {
error = EBUSY;
goto fail;
}
/* not currently stopped */
if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) == 0 ||
p->p_suspcount != p->p_numthreads ||
(p->p_flag & P_WAITED) == 0) {
error = EBUSY;
goto fail;
}
if ((p->p_flag & P_STOPPED_TRACE) == 0) {
static int count = 0;
if (count++ == 0)
printf("P_STOPPED_TRACE not set.\n");
}
/* OK */
break;
}
/* Keep this process around until we finish this request. */
_PHOLD(p);
#ifdef FIX_SSTEP
/*
* Single step fixup ala procfs
*/
FIX_SSTEP(td2);
#endif
/*
* Actually do the requests
*/
td->td_retval[0] = 0;
switch (req) {
case PT_TRACE_ME:
/* set my trace flag and "owner" so it can read/write me */
p->p_flag |= P_TRACED;
p->p_oppid = p->p_pptr->p_pid;
break;
case PT_ATTACH:
/* security check done above */
/*
* It would be nice if the tracing relationship was separate
* from the parent relationship but that would require
* another set of links in the proc struct or for "wait"
* to scan the entire proc table. To make life easier,
* we just re-parent the process we're trying to trace.
* The old parent is remembered so we can put things back
* on a "detach".
*/
p->p_flag |= P_TRACED;
p->p_oppid = p->p_pptr->p_pid;
if (p->p_pptr != td->td_proc) {
/* Remember that a child is being debugged (traced). */
p->p_pptr->p_dbg_child++;
proc_reparent(p, td->td_proc);
}
data = SIGSTOP;
goto sendsig; /* in PT_CONTINUE below */
case PT_CLEARSTEP:
error = ptrace_clear_single_step(td2);
break;
case PT_SETSTEP:
error = ptrace_single_step(td2);
break;
case PT_SUSPEND:
td2->td_dbgflags |= TDB_SUSPEND;
thread_lock(td2);
td2->td_flags |= TDF_NEEDSUSPCHK;
thread_unlock(td2);
break;
case PT_RESUME:
td2->td_dbgflags &= ~TDB_SUSPEND;
break;
case PT_FOLLOW_FORK:
if (data)
p->p_flag |= P_FOLLOWFORK;
else
p->p_flag &= ~P_FOLLOWFORK;
break;
case PT_STEP:
case PT_CONTINUE:
case PT_TO_SCE:
case PT_TO_SCX:
case PT_SYSCALL:
case PT_DETACH:
/* Zero means do not send any signal */
if (data < 0 || data > _SIG_MAXSIG) {
error = EINVAL;
break;
}
switch (req) {
case PT_STEP:
error = ptrace_single_step(td2);
if (error)
goto out;
break;
case PT_CONTINUE:
case PT_TO_SCE:
case PT_TO_SCX:
case PT_SYSCALL:
if (addr != (void *)1) {
error = ptrace_set_pc(td2,
(u_long)(uintfptr_t)addr);
if (error)
goto out;
}
switch (req) {
case PT_TO_SCE:
p->p_stops |= S_PT_SCE;
break;
case PT_TO_SCX:
p->p_stops |= S_PT_SCX;
break;
case PT_SYSCALL:
p->p_stops |= S_PT_SCE | S_PT_SCX;
break;
}
break;
case PT_DETACH:
/* reset process parent */
if (p->p_oppid != p->p_pptr->p_pid) {
struct proc *pp;
PROC_LOCK(p->p_pptr);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(p->p_pptr);
PROC_UNLOCK(p);
pp = pfind(p->p_oppid);
if (pp == NULL)
pp = initproc;
else
PROC_UNLOCK(pp);
PROC_LOCK(p);
proc_reparent(p, pp);
p->p_pptr->p_dbg_child--;
if (pp == initproc)
p->p_sigparent = SIGCHLD;
}
p->p_oppid = 0;
p->p_flag &= ~(P_TRACED | P_WAITED | P_FOLLOWFORK);
/* should we send SIGCHLD? */
/* childproc_continued(p); */
break;
}
sendsig:
if (proctree_locked) {
sx_xunlock(&proctree_lock);
proctree_locked = 0;
}
p->p_xstat = data;
p->p_xthread = NULL;
if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) {
/* deliver or queue signal */
td2->td_dbgflags &= ~TDB_XSIG;
td2->td_xsig = data;
if (req == PT_DETACH) {
struct thread *td3;
FOREACH_THREAD_IN_PROC(p, td3) {
td3->td_dbgflags &= ~TDB_SUSPEND;
}
}
/*
* Unsuspend all threads.  To keep a thread from running,
* use PT_SUSPEND to suspend it before continuing the
* process.
*/
PROC_SLOCK(p);
p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
thread_unsuspend(p);
PROC_SUNLOCK(p);
} else {
if (data)
- psignal(p, data);
+ kern_psignal(p, data);
}
break;
case PT_WRITE_I:
case PT_WRITE_D:
td2->td_dbgflags |= TDB_USERWR;
write = 1;
/* FALLTHROUGH */
case PT_READ_I:
case PT_READ_D:
PROC_UNLOCK(p);
tmp = 0;
/* write = 0 set above */
iov.iov_base = write ? (caddr_t)&data : (caddr_t)&tmp;
iov.iov_len = sizeof(int);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)addr;
uio.uio_resid = sizeof(int);
uio.uio_segflg = UIO_SYSSPACE; /* i.e.: the uap */
uio.uio_rw = write ? UIO_WRITE : UIO_READ;
uio.uio_td = td;
error = proc_rwmem(p, &uio);
if (uio.uio_resid != 0) {
/*
* XXX proc_rwmem() doesn't currently return ENOSPC,
* so I think write() can bogusly return 0.
* XXX what happens for short writes? We don't want
* to write partial data.
* XXX proc_rwmem() returns EPERM for other invalid
* addresses. Convert this to EINVAL. Does this
* clobber returns of EPERM for other reasons?
*/
if (error == 0 || error == ENOSPC || error == EPERM)
error = EINVAL; /* EOF */
}
if (!write)
td->td_retval[0] = tmp;
PROC_LOCK(p);
break;
case PT_IO:
#ifdef COMPAT_FREEBSD32
if (wrap32) {
piod32 = addr;
iov.iov_base = (void *)(uintptr_t)piod32->piod_addr;
iov.iov_len = piod32->piod_len;
uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs;
uio.uio_resid = piod32->piod_len;
} else
#endif
{
piod = addr;
iov.iov_base = piod->piod_addr;
iov.iov_len = piod->piod_len;
uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs;
uio.uio_resid = piod->piod_len;
}
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_segflg = UIO_USERSPACE;
uio.uio_td = td;
#ifdef COMPAT_FREEBSD32
tmp = wrap32 ? piod32->piod_op : piod->piod_op;
#else
tmp = piod->piod_op;
#endif
switch (tmp) {
case PIOD_READ_D:
case PIOD_READ_I:
uio.uio_rw = UIO_READ;
break;
case PIOD_WRITE_D:
case PIOD_WRITE_I:
td2->td_dbgflags |= TDB_USERWR;
uio.uio_rw = UIO_WRITE;
break;
default:
error = EINVAL;
goto out;
}
PROC_UNLOCK(p);
error = proc_rwmem(p, &uio);
#ifdef COMPAT_FREEBSD32
if (wrap32)
piod32->piod_len -= uio.uio_resid;
else
#endif
piod->piod_len -= uio.uio_resid;
PROC_LOCK(p);
break;
case PT_KILL:
data = SIGKILL;
goto sendsig; /* in PT_CONTINUE above */
case PT_SETREGS:
td2->td_dbgflags |= TDB_USERWR;
error = PROC_WRITE(regs, td2, addr);
break;
case PT_GETREGS:
error = PROC_READ(regs, td2, addr);
break;
case PT_SETFPREGS:
td2->td_dbgflags |= TDB_USERWR;
error = PROC_WRITE(fpregs, td2, addr);
break;
case PT_GETFPREGS:
error = PROC_READ(fpregs, td2, addr);
break;
case PT_SETDBREGS:
td2->td_dbgflags |= TDB_USERWR;
error = PROC_WRITE(dbregs, td2, addr);
break;
case PT_GETDBREGS:
error = PROC_READ(dbregs, td2, addr);
break;
case PT_LWPINFO:
if (data <= 0 ||
#ifdef COMPAT_FREEBSD32
(!wrap32 && data > sizeof(*pl)) ||
(wrap32 && data > sizeof(*pl32))) {
#else
data > sizeof(*pl)) {
#endif
error = EINVAL;
break;
}
#ifdef COMPAT_FREEBSD32
if (wrap32) {
pl = &plr;
pl32 = addr;
} else
#endif
pl = addr;
pl->pl_lwpid = td2->td_tid;
pl->pl_flags = 0;
if (td2->td_dbgflags & TDB_XSIG) {
pl->pl_event = PL_EVENT_SIGNAL;
if (td2->td_dbgksi.ksi_signo != 0 &&
#ifdef COMPAT_FREEBSD32
((!wrap32 && data >= offsetof(struct ptrace_lwpinfo,
pl_siginfo) + sizeof(pl->pl_siginfo)) ||
(wrap32 && data >= offsetof(struct ptrace_lwpinfo32,
pl_siginfo) + sizeof(struct siginfo32)))
#else
data >= offsetof(struct ptrace_lwpinfo, pl_siginfo)
+ sizeof(pl->pl_siginfo)
#endif
){
pl->pl_flags |= PL_FLAG_SI;
pl->pl_siginfo = td2->td_dbgksi.ksi_info;
}
}
if ((pl->pl_flags & PL_FLAG_SI) == 0)
bzero(&pl->pl_siginfo, sizeof(pl->pl_siginfo));
if (td2->td_dbgflags & TDB_SCE)
pl->pl_flags |= PL_FLAG_SCE;
else if (td2->td_dbgflags & TDB_SCX)
pl->pl_flags |= PL_FLAG_SCX;
if (td2->td_dbgflags & TDB_EXEC)
pl->pl_flags |= PL_FLAG_EXEC;
if (td2->td_dbgflags & TDB_FORK) {
pl->pl_flags |= PL_FLAG_FORKED;
pl->pl_child_pid = td2->td_dbg_forked;
}
pl->pl_sigmask = td2->td_sigmask;
pl->pl_siglist = td2->td_siglist;
strcpy(pl->pl_tdname, td2->td_name);
#ifdef COMPAT_FREEBSD32
if (wrap32)
ptrace_lwpinfo_to32(pl, pl32);
#endif
break;
case PT_GETNUMLWPS:
td->td_retval[0] = p->p_numthreads;
break;
case PT_GETLWPLIST:
if (data <= 0) {
error = EINVAL;
break;
}
num = imin(p->p_numthreads, data);
PROC_UNLOCK(p);
buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK);
tmp = 0;
PROC_LOCK(p);
FOREACH_THREAD_IN_PROC(p, td2) {
if (tmp >= num)
break;
buf[tmp++] = td2->td_tid;
}
PROC_UNLOCK(p);
error = copyout(buf, addr, tmp * sizeof(lwpid_t));
free(buf, M_TEMP);
if (!error)
td->td_retval[0] = tmp;
PROC_LOCK(p);
break;
case PT_VM_TIMESTAMP:
td->td_retval[0] = p->p_vmspace->vm_map.timestamp;
break;
case PT_VM_ENTRY:
PROC_UNLOCK(p);
#ifdef COMPAT_FREEBSD32
if (wrap32)
error = ptrace_vm_entry32(td, p, addr);
else
#endif
error = ptrace_vm_entry(td, p, addr);
PROC_LOCK(p);
break;
default:
#ifdef __HAVE_PTRACE_MACHDEP
if (req >= PT_FIRSTMACH) {
PROC_UNLOCK(p);
error = cpu_ptrace(td2, req, addr, data);
PROC_LOCK(p);
} else
#endif
/* Unknown request. */
error = EINVAL;
break;
}
out:
/* Drop our hold on this process now that the request has completed. */
_PRELE(p);
fail:
PROC_UNLOCK(p);
if (proctree_locked)
sx_xunlock(&proctree_lock);
return (error);
}
#undef PROC_READ
#undef PROC_WRITE
/*
* Stop a process because of a debugging event;
* stay stopped until p->p_step is cleared
* (cleared by PIOCCONT in procfs).
*/
void
stopevent(struct proc *p, unsigned int event, unsigned int val)
{
PROC_LOCK_ASSERT(p, MA_OWNED);
p->p_step = 1;
do {
p->p_xstat = val;
p->p_xthread = NULL;
p->p_stype = event; /* Which event caused the stop? */
wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */
msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0);
} while (p->p_step);
}
Index: head/sys/kern/sysv_msg.c
===================================================================
--- head/sys/kern/sysv_msg.c (revision 225616)
+++ head/sys/kern/sysv_msg.c (revision 225617)
@@ -1,1590 +1,1592 @@
/*-
* Implementation of SVID messages
*
* Author: Daniel Boulet
*
* Copyright 1993 Daniel Boulet and RTMX Inc.
*
* This system call was implemented by Daniel Boulet under contract from RTMX.
*
* Redistribution and use in source forms, with and without modification,
* are permitted provided that this entire comment appears intact.
*
* Redistribution in binary form may occur without any restrictions.
* Obviously, it would be nice if you gave credit where credit is due
* but requiring it would be too onerous.
*
* This software is provided ``AS IS'' without any warranties of any kind.
*/
/*-
* Copyright (c) 2003-2005 McAfee, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by McAfee
* Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
* program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/msg.h>
#include <sys/racct.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/jail.h>
#include <security/mac/mac_framework.h>
FEATURE(sysv_msg, "System V message queues support");
static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
static int msginit(void);
static int msgunload(void);
static int sysvmsg_modload(struct module *, int, void *);
+
#ifdef MSG_DEBUG
#define DPRINTF(a) printf a
#else
#define DPRINTF(a) (void)0
#endif
static void msg_freehdr(struct msg *msghdr);
#ifndef MSGSSZ
#define MSGSSZ 8 /* Each segment must be 2^N long */
#endif
#ifndef MSGSEG
#define MSGSEG 2048 /* must be less than 32767 */
#endif
#define MSGMAX (MSGSSZ*MSGSEG)
#ifndef MSGMNB
#define MSGMNB 2048 /* max # of bytes in a queue */
#endif
#ifndef MSGMNI
#define MSGMNI 40
#endif
#ifndef MSGTQL
#define MSGTQL 40
#endif
/*
* Based on the configuration parameters described in an SVR2 (yes, two)
* config(1m) man page.
*
* Each message is broken up and stored in segments that are msgssz bytes
* long. For efficiency reasons, this should be a power of two. Also,
* it doesn't make sense if it is less than 8 or greater than about 256.
* Consequently, msginit in kern/sysv_msg.c checks that msgssz is a power of
* two between 8 and 1024 inclusive (and panics if it isn't).
*/
struct msginfo msginfo = {
MSGMAX, /* max chars in a message */
MSGMNI, /* # of message queue identifiers */
MSGMNB, /* max chars in a queue */
MSGTQL, /* max messages in system */
MSGSSZ, /* size of a message segment */
/* (must be small power of 2 greater than 4) */
MSGSEG /* number of message segments */
};
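/*
 * Worked example (not part of the original source): with the default
 * MSGSSZ of 8 bytes, a 100-byte message occupies
 * (100 + 8 - 1) / 8 = 13 segments, the same segs_needed computation
 * kern_msgsnd performs below.
 */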
/*
* macros to convert between msqid_ds's and msqid's.
* (specific to this implementation)
*/
#define MSQID(ix,ds) ((ix) & 0xffff | (((ds).msg_perm.seq << 16) & 0xffff0000))
#define MSQID_IX(id) ((id) & 0xffff)
#define MSQID_SEQ(id) (((id) >> 16) & 0xffff)
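/*
 * Worked example (not part of the original source): for index 3 and
 * msg_perm.seq == 2, MSQID() yields 0x00020003; MSQID_IX() recovers 3
 * and MSQID_SEQ() recovers 2 from that identifier.
 */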
/*
* The rest of this file is specific to this particular implementation.
*/
struct msgmap {
short next; /* next segment in buffer */
/* -1 -> available */
/* 0..(MSGSEG-1) -> index of next segment */
};
#define MSG_LOCKED 01000 /* Is this msqid_ds locked? */
static int nfree_msgmaps; /* # of free map entries */
static short free_msgmaps; /* head of linked list of free map entries */
static struct msg *free_msghdrs;/* list of free msg headers */
static char *msgpool; /* MSGMAX byte long msg buffer pool */
static struct msgmap *msgmaps; /* MSGSEG msgmap structures */
static struct msg *msghdrs; /* MSGTQL msg headers */
static struct msqid_kernel *msqids; /* MSGMNI msqid_kernel struct's */
static struct mtx msq_mtx; /* global mutex for message queues. */
static struct syscall_helper_data msg_syscalls[] = {
SYSCALL_INIT_HELPER(msgctl),
SYSCALL_INIT_HELPER(msgget),
SYSCALL_INIT_HELPER(msgsnd),
SYSCALL_INIT_HELPER(msgrcv),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL_INIT_HELPER(msgsys),
- SYSCALL_INIT_HELPER(freebsd7_msgctl),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_msgctl),
#endif
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data msg32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_msgctl),
SYSCALL32_INIT_HELPER(freebsd32_msgsnd),
SYSCALL32_INIT_HELPER(freebsd32_msgrcv),
- SYSCALL32_INIT_HELPER(msgget),
+ SYSCALL32_INIT_HELPER_COMPAT(msgget),
SYSCALL32_INIT_HELPER(freebsd32_msgsys),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL32_INIT_HELPER(freebsd7_freebsd32_msgctl),
#endif
SYSCALL_INIT_LAST
};
#endif
static int
msginit()
{
int i, error;
TUNABLE_INT_FETCH("kern.ipc.msgseg", &msginfo.msgseg);
TUNABLE_INT_FETCH("kern.ipc.msgssz", &msginfo.msgssz);
msginfo.msgmax = msginfo.msgseg * msginfo.msgssz;
TUNABLE_INT_FETCH("kern.ipc.msgmni", &msginfo.msgmni);
TUNABLE_INT_FETCH("kern.ipc.msgmnb", &msginfo.msgmnb);
TUNABLE_INT_FETCH("kern.ipc.msgtql", &msginfo.msgtql);
msgpool = malloc(msginfo.msgmax, M_MSG, M_WAITOK);
msgmaps = malloc(sizeof(struct msgmap) * msginfo.msgseg, M_MSG, M_WAITOK);
msghdrs = malloc(sizeof(struct msg) * msginfo.msgtql, M_MSG, M_WAITOK);
msqids = malloc(sizeof(struct msqid_kernel) * msginfo.msgmni, M_MSG,
M_WAITOK);
/*
* msginfo.msgssz should be a power of two for efficiency reasons.
* It is also pretty silly if msginfo.msgssz is less than 8
* or greater than about 256 so ...
*/
i = 8;
while (i < 1024 && i != msginfo.msgssz)
i <<= 1;
if (i != msginfo.msgssz) {
DPRINTF(("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
msginfo.msgssz));
panic("msginfo.msgssz not a small power of 2");
}
if (msginfo.msgseg > 32767) {
DPRINTF(("msginfo.msgseg=%d\n", msginfo.msgseg));
panic("msginfo.msgseg > 32767");
}
for (i = 0; i < msginfo.msgseg; i++) {
if (i > 0)
msgmaps[i-1].next = i;
msgmaps[i].next = -1; /* implies entry is available */
}
free_msgmaps = 0;
nfree_msgmaps = msginfo.msgseg;
for (i = 0; i < msginfo.msgtql; i++) {
msghdrs[i].msg_type = 0;
if (i > 0)
msghdrs[i-1].msg_next = &msghdrs[i];
msghdrs[i].msg_next = NULL;
#ifdef MAC
mac_sysvmsg_init(&msghdrs[i]);
#endif
}
free_msghdrs = &msghdrs[0];
for (i = 0; i < msginfo.msgmni; i++) {
msqids[i].u.msg_qbytes = 0; /* implies entry is available */
msqids[i].u.msg_perm.seq = 0; /* reset to a known value */
msqids[i].u.msg_perm.mode = 0;
#ifdef MAC
mac_sysvmsq_init(&msqids[i]);
#endif
}
mtx_init(&msq_mtx, "msq", NULL, MTX_DEF);
error = syscall_helper_register(msg_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(msg32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
static int
msgunload()
{
struct msqid_kernel *msqkptr;
int msqid;
#ifdef MAC
int i;
#endif
syscall_helper_unregister(msg_syscalls);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(msg32_syscalls);
#endif
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
/*
* Look for an unallocated and unlocked msqid_ds.
* msqid_ds's can be locked by msgsnd or msgrcv while
* they are copying the message in/out. We can't
* re-use the entry until they release it.
*/
msqkptr = &msqids[msqid];
if (msqkptr->u.msg_qbytes != 0 ||
(msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
break;
}
if (msqid != msginfo.msgmni)
return (EBUSY);
#ifdef MAC
for (i = 0; i < msginfo.msgtql; i++)
mac_sysvmsg_destroy(&msghdrs[i]);
for (msqid = 0; msqid < msginfo.msgmni; msqid++)
mac_sysvmsq_destroy(&msqids[msqid]);
#endif
free(msgpool, M_MSG);
free(msgmaps, M_MSG);
free(msghdrs, M_MSG);
free(msqids, M_MSG);
mtx_destroy(&msq_mtx);
return (0);
}
static int
sysvmsg_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = msginit();
if (error != 0)
msgunload();
break;
case MOD_UNLOAD:
error = msgunload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sysvmsg_mod = {
"sysvmsg",
&sysvmsg_modload,
NULL
};
DECLARE_MODULE(sysvmsg, sysvmsg_mod, SI_SUB_SYSV_MSG, SI_ORDER_FIRST);
MODULE_VERSION(sysvmsg, 1);
static void
msg_freehdr(msghdr)
struct msg *msghdr;
{
while (msghdr->msg_ts > 0) {
short next;
if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
panic("msghdr->msg_spot out of range");
next = msgmaps[msghdr->msg_spot].next;
msgmaps[msghdr->msg_spot].next = free_msgmaps;
free_msgmaps = msghdr->msg_spot;
nfree_msgmaps++;
msghdr->msg_spot = next;
if (msghdr->msg_ts >= msginfo.msgssz)
msghdr->msg_ts -= msginfo.msgssz;
else
msghdr->msg_ts = 0;
}
if (msghdr->msg_spot != -1)
panic("msghdr->msg_spot != -1");
msghdr->msg_next = free_msghdrs;
free_msghdrs = msghdr;
#ifdef MAC
mac_sysvmsg_cleanup(msghdr);
#endif
}
#ifndef _SYS_SYSPROTO_H_
struct msgctl_args {
int msqid;
int cmd;
struct msqid_ds *buf;
};
#endif
int
-msgctl(td, uap)
+sys_msgctl(td, uap)
struct thread *td;
register struct msgctl_args *uap;
{
int msqid = uap->msqid;
int cmd = uap->cmd;
struct msqid_ds msqbuf;
int error;
DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
if (cmd == IPC_SET &&
(error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
return (error);
error = kern_msgctl(td, msqid, cmd, &msqbuf);
if (cmd == IPC_STAT && error == 0)
error = copyout(&msqbuf, uap->buf, sizeof(struct msqid_ds));
return (error);
}
int
kern_msgctl(td, msqid, cmd, msqbuf)
struct thread *td;
int msqid;
int cmd;
struct msqid_ds *msqbuf;
{
int rval, error, msqix;
register struct msqid_kernel *msqkptr;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
msqix = IPCID_TO_IX(msqid);
if (msqix < 0 || msqix >= msginfo.msgmni) {
DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
return (EINVAL);
}
msqkptr = &msqids[msqix];
mtx_lock(&msq_mtx);
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such msqid\n"));
error = EINVAL;
goto done2;
}
if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqctl(td->td_ucred, msqkptr, cmd);
if (error != 0)
goto done2;
#endif
error = 0;
rval = 0;
switch (cmd) {
case IPC_RMID:
{
struct msg *msghdr;
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
goto done2;
#ifdef MAC
/*
* Check that the thread has MAC access permissions to
* individual msghdrs. Note: We need to do this in a
* separate loop because the actual loop alters the
* msq/msghdr info as it progresses, and there is no going
* back if, halfway through, we discover that the
* thread cannot free a certain msghdr. The msq will get
* into an inconsistent state.
*/
for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
msghdr = msghdr->msg_next) {
error = mac_sysvmsq_check_msgrmid(td->td_ucred, msghdr);
if (error != 0)
goto done2;
}
#endif
racct_sub_cred(msqkptr->cred, RACCT_NMSGQ, 1);
racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, msqkptr->u.msg_qnum);
racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msqkptr->u.msg_cbytes);
crfree(msqkptr->cred);
msqkptr->cred = NULL;
/* Free the message headers */
msghdr = msqkptr->u.msg_first;
while (msghdr != NULL) {
struct msg *msghdr_tmp;
/* Free the segments of each message */
msqkptr->u.msg_cbytes -= msghdr->msg_ts;
msqkptr->u.msg_qnum--;
msghdr_tmp = msghdr;
msghdr = msghdr->msg_next;
msg_freehdr(msghdr_tmp);
}
if (msqkptr->u.msg_cbytes != 0)
panic("msg_cbytes is screwed up");
if (msqkptr->u.msg_qnum != 0)
panic("msg_qnum is screwed up");
msqkptr->u.msg_qbytes = 0; /* Mark it as free */
#ifdef MAC
mac_sysvmsq_cleanup(msqkptr);
#endif
wakeup(msqkptr);
}
break;
case IPC_SET:
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
goto done2;
if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
error = priv_check(td, PRIV_IPC_MSGSIZE);
if (error)
goto done2;
}
if (msqbuf->msg_qbytes > msginfo.msgmnb) {
DPRINTF(("can't increase msg_qbytes beyond %d"
"(truncating)\n", msginfo.msgmnb));
msqbuf->msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */
}
if (msqbuf->msg_qbytes == 0) {
DPRINTF(("can't reduce msg_qbytes to 0\n"));
error = EINVAL; /* non-standard errno! */
goto done2;
}
msqkptr->u.msg_perm.uid = msqbuf->msg_perm.uid; /* change the owner */
msqkptr->u.msg_perm.gid = msqbuf->msg_perm.gid; /* change the owner */
msqkptr->u.msg_perm.mode = (msqkptr->u.msg_perm.mode & ~0777) |
(msqbuf->msg_perm.mode & 0777);
msqkptr->u.msg_qbytes = msqbuf->msg_qbytes;
msqkptr->u.msg_ctime = time_second;
break;
case IPC_STAT:
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
DPRINTF(("requester doesn't have read access\n"));
goto done2;
}
*msqbuf = msqkptr->u;
break;
default:
DPRINTF(("invalid command %d\n", cmd));
error = EINVAL;
goto done2;
}
if (error == 0)
td->td_retval[0] = rval;
done2:
mtx_unlock(&msq_mtx);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct msgget_args {
key_t key;
int msgflg;
};
#endif
+
int
-msgget(td, uap)
+sys_msgget(td, uap)
struct thread *td;
register struct msgget_args *uap;
{
int msqid, error = 0;
int key = uap->key;
int msgflg = uap->msgflg;
struct ucred *cred = td->td_ucred;
register struct msqid_kernel *msqkptr = NULL;
DPRINTF(("msgget(0x%x, 0%o)\n", key, msgflg));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&msq_mtx);
if (key != IPC_PRIVATE) {
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
msqkptr = &msqids[msqid];
if (msqkptr->u.msg_qbytes != 0 &&
msqkptr->u.msg_perm.key == key)
break;
}
if (msqid < msginfo.msgmni) {
DPRINTF(("found public key\n"));
if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
DPRINTF(("not exclusive\n"));
error = EEXIST;
goto done2;
}
if ((error = ipcperm(td, &msqkptr->u.msg_perm,
msgflg & 0700))) {
DPRINTF(("requester doesn't have 0%o access\n",
msgflg & 0700));
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqget(cred, msqkptr);
if (error != 0)
goto done2;
#endif
goto found;
}
}
DPRINTF(("need to allocate the msqid_ds\n"));
if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
/*
* Look for an unallocated and unlocked msqid_ds.
* msqid_ds's can be locked by msgsnd or msgrcv while
* they are copying the message in/out. We can't
* re-use the entry until they release it.
*/
msqkptr = &msqids[msqid];
if (msqkptr->u.msg_qbytes == 0 &&
(msqkptr->u.msg_perm.mode & MSG_LOCKED) == 0)
break;
}
if (msqid == msginfo.msgmni) {
DPRINTF(("no more msqid_ds's available\n"));
error = ENOSPC;
goto done2;
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(td->td_proc, RACCT_NMSGQ, 1);
PROC_UNLOCK(td->td_proc);
if (error != 0) {
error = ENOSPC;
goto done2;
}
#endif
DPRINTF(("msqid %d is available\n", msqid));
msqkptr->u.msg_perm.key = key;
msqkptr->u.msg_perm.cuid = cred->cr_uid;
msqkptr->u.msg_perm.uid = cred->cr_uid;
msqkptr->u.msg_perm.cgid = cred->cr_gid;
msqkptr->u.msg_perm.gid = cred->cr_gid;
msqkptr->u.msg_perm.mode = (msgflg & 0777);
msqkptr->cred = crhold(cred);
/* Make sure that the returned msqid is unique */
msqkptr->u.msg_perm.seq = (msqkptr->u.msg_perm.seq + 1) & 0x7fff;
msqkptr->u.msg_first = NULL;
msqkptr->u.msg_last = NULL;
msqkptr->u.msg_cbytes = 0;
msqkptr->u.msg_qnum = 0;
msqkptr->u.msg_qbytes = msginfo.msgmnb;
msqkptr->u.msg_lspid = 0;
msqkptr->u.msg_lrpid = 0;
msqkptr->u.msg_stime = 0;
msqkptr->u.msg_rtime = 0;
msqkptr->u.msg_ctime = time_second;
#ifdef MAC
mac_sysvmsq_create(cred, msqkptr);
#endif
} else {
DPRINTF(("didn't find it and wasn't asked to create it\n"));
error = ENOENT;
goto done2;
}
found:
/* Construct the unique msqid */
td->td_retval[0] = IXSEQ_TO_IPCID(msqid, msqkptr->u.msg_perm);
done2:
mtx_unlock(&msq_mtx);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct msgsnd_args {
int msqid;
const void *msgp;
size_t msgsz;
int msgflg;
};
#endif
int
kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
struct thread *td;
int msqid;
const void *msgp; /* XXX msgp is actually mtext. */
size_t msgsz;
int msgflg;
long mtype;
{
int msqix, segs_needed, error = 0;
register struct msqid_kernel *msqkptr;
register struct msg *msghdr;
short next;
#ifdef RACCT
size_t saved_msgsz;
#endif
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&msq_mtx);
msqix = IPCID_TO_IX(msqid);
if (msqix < 0 || msqix >= msginfo.msgmni) {
DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
error = EINVAL;
goto done2;
}
msqkptr = &msqids[msqix];
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such message queue id\n"));
error = EINVAL;
goto done2;
}
if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
}
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_W))) {
DPRINTF(("requester doesn't have write access\n"));
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqsnd(td->td_ucred, msqkptr);
if (error != 0)
goto done2;
#endif
#ifdef RACCT
PROC_LOCK(td->td_proc);
if (racct_add(td->td_proc, RACCT_MSGQQUEUED, 1)) {
PROC_UNLOCK(td->td_proc);
error = EAGAIN;
goto done2;
}
saved_msgsz = msgsz;
if (racct_add(td->td_proc, RACCT_MSGQSIZE, msgsz)) {
racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
PROC_UNLOCK(td->td_proc);
error = EAGAIN;
goto done2;
}
PROC_UNLOCK(td->td_proc);
#endif
segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
msginfo.msgssz, segs_needed));
for (;;) {
int need_more_resources = 0;
/*
* check msgsz
* (inside this loop in case msg_qbytes changes while we sleep)
*/
if (msgsz > msqkptr->u.msg_qbytes) {
DPRINTF(("msgsz > msqkptr->u.msg_qbytes\n"));
error = EINVAL;
goto done3;
}
if (msqkptr->u.msg_perm.mode & MSG_LOCKED) {
DPRINTF(("msqid is locked\n"));
need_more_resources = 1;
}
if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes) {
DPRINTF(("msgsz + msg_cbytes > msg_qbytes\n"));
need_more_resources = 1;
}
if (segs_needed > nfree_msgmaps) {
DPRINTF(("segs_needed > nfree_msgmaps\n"));
need_more_resources = 1;
}
if (free_msghdrs == NULL) {
DPRINTF(("no more msghdrs\n"));
need_more_resources = 1;
}
if (need_more_resources) {
int we_own_it;
if ((msgflg & IPC_NOWAIT) != 0) {
DPRINTF(("need more resources but caller "
"doesn't want to wait\n"));
error = EAGAIN;
goto done3;
}
if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0) {
DPRINTF(("we don't own the msqid_ds\n"));
we_own_it = 0;
} else {
/* Force later arrivals to wait for our
request */
DPRINTF(("we own the msqid_ds\n"));
msqkptr->u.msg_perm.mode |= MSG_LOCKED;
we_own_it = 1;
}
DPRINTF(("msgsnd: goodnight\n"));
error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
"msgsnd", hz);
DPRINTF(("msgsnd: good morning, error=%d\n", error));
if (we_own_it)
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
if (error == EWOULDBLOCK) {
DPRINTF(("msgsnd: timed out\n"));
continue;
}
if (error != 0) {
DPRINTF(("msgsnd: interrupted system call\n"));
error = EINTR;
goto done3;
}
/*
* Make sure that the msq queue still exists
*/
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("msqid deleted\n"));
error = EIDRM;
goto done3;
}
} else {
DPRINTF(("got all the resources that we need\n"));
break;
}
}
/*
* We have the resources that we need.
* Make sure!
*/
if (msqkptr->u.msg_perm.mode & MSG_LOCKED)
panic("msg_perm.mode & MSG_LOCKED");
if (segs_needed > nfree_msgmaps)
panic("segs_needed > nfree_msgmaps");
if (msgsz + msqkptr->u.msg_cbytes > msqkptr->u.msg_qbytes)
panic("msgsz + msg_cbytes > msg_qbytes");
if (free_msghdrs == NULL)
panic("no more msghdrs");
/*
* Re-lock the msqid_ds in case we page-fault when copying in the
* message
*/
if ((msqkptr->u.msg_perm.mode & MSG_LOCKED) != 0)
panic("msqid_ds is already locked");
msqkptr->u.msg_perm.mode |= MSG_LOCKED;
/*
* Allocate a message header
*/
msghdr = free_msghdrs;
free_msghdrs = msghdr->msg_next;
msghdr->msg_spot = -1;
msghdr->msg_ts = msgsz;
msghdr->msg_type = mtype;
#ifdef MAC
/*
* XXXMAC: Should the mac_sysvmsq_check_msgmsq check follow here
* immediately? Or, should it be checked just before the msg is
* enqueued in the msgq (as it is done now)?
*/
mac_sysvmsg_create(td->td_ucred, msqkptr, msghdr);
#endif
/*
* Allocate space for the message
*/
while (segs_needed > 0) {
if (nfree_msgmaps <= 0)
panic("not enough msgmaps");
if (free_msgmaps == -1)
panic("nil free_msgmaps");
next = free_msgmaps;
if (next <= -1)
panic("next too low #1");
if (next >= msginfo.msgseg)
panic("next out of range #1");
DPRINTF(("allocating segment %d to message\n", next));
free_msgmaps = msgmaps[next].next;
nfree_msgmaps--;
msgmaps[next].next = msghdr->msg_spot;
msghdr->msg_spot = next;
segs_needed--;
}
/*
* Validate the message type
*/
if (msghdr->msg_type < 1) {
msg_freehdr(msghdr);
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
wakeup(msqkptr);
DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
error = EINVAL;
goto done3;
}
/*
* Copy in the message body
*/
next = msghdr->msg_spot;
while (msgsz > 0) {
size_t tlen;
if (msgsz > msginfo.msgssz)
tlen = msginfo.msgssz;
else
tlen = msgsz;
if (next <= -1)
panic("next too low #2");
if (next >= msginfo.msgseg)
panic("next out of range #2");
mtx_unlock(&msq_mtx);
if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
tlen)) != 0) {
mtx_lock(&msq_mtx);
DPRINTF(("error %d copying in message segment\n",
error));
msg_freehdr(msghdr);
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
wakeup(msqkptr);
goto done3;
}
mtx_lock(&msq_mtx);
msgsz -= tlen;
msgp = (const char *)msgp + tlen;
next = msgmaps[next].next;
}
if (next != -1)
panic("didn't use all the msg segments");
/*
* We've got the message. Unlock the msqid_ds.
*/
msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
/*
* Make sure that the msqid_ds is still allocated.
*/
if (msqkptr->u.msg_qbytes == 0) {
msg_freehdr(msghdr);
wakeup(msqkptr);
error = EIDRM;
goto done3;
}
#ifdef MAC
/*
* Note: Since the task/thread allocates the msghdr and usually
* primes it with its own MAC label, for a majority of policies, it
* won't be necessary to check whether the msghdr has access
* permissions to the msgq. The mac_sysvmsq_check_msqsnd check would
* suffice in that case. However, this hook may be required where
* individual policies derive a non-identical label for the msghdr
* from the current thread label and may want to check the msghdr
* enqueue permissions, along with read/write permissions to the
* msgq.
*/
error = mac_sysvmsq_check_msgmsq(td->td_ucred, msghdr, msqkptr);
if (error != 0) {
msg_freehdr(msghdr);
wakeup(msqkptr);
goto done3;
}
#endif
/*
* Put the message into the queue
*/
if (msqkptr->u.msg_first == NULL) {
msqkptr->u.msg_first = msghdr;
msqkptr->u.msg_last = msghdr;
} else {
msqkptr->u.msg_last->msg_next = msghdr;
msqkptr->u.msg_last = msghdr;
}
msqkptr->u.msg_last->msg_next = NULL;
msqkptr->u.msg_cbytes += msghdr->msg_ts;
msqkptr->u.msg_qnum++;
msqkptr->u.msg_lspid = td->td_proc->p_pid;
msqkptr->u.msg_stime = time_second;
wakeup(msqkptr);
td->td_retval[0] = 0;
done3:
#ifdef RACCT
if (error != 0) {
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_MSGQQUEUED, 1);
racct_sub(td->td_proc, RACCT_MSGQSIZE, saved_msgsz);
PROC_UNLOCK(td->td_proc);
}
#endif
done2:
mtx_unlock(&msq_mtx);
return (error);
}
int
-msgsnd(td, uap)
+sys_msgsnd(td, uap)
struct thread *td;
register struct msgsnd_args *uap;
{
int error;
long mtype;
DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
uap->msgsz, uap->msgflg));
if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
DPRINTF(("error %d copying the message type\n", error));
return (error);
}
return (kern_msgsnd(td, uap->msqid,
(const char *)uap->msgp + sizeof(mtype),
uap->msgsz, uap->msgflg, mtype));
}
#ifndef _SYS_SYSPROTO_H_
struct msgrcv_args {
int msqid;
void *msgp;
size_t msgsz;
long msgtyp;
int msgflg;
};
#endif
int
kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype)
struct thread *td;
int msqid;
void *msgp; /* XXX msgp is actually mtext. */
size_t msgsz;
long msgtyp;
int msgflg;
long *mtype;
{
size_t len;
register struct msqid_kernel *msqkptr;
register struct msg *msghdr;
int msqix, error = 0;
short next;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
msqix = IPCID_TO_IX(msqid);
if (msqix < 0 || msqix >= msginfo.msgmni) {
DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
msginfo.msgmni));
return (EINVAL);
}
msqkptr = &msqids[msqix];
mtx_lock(&msq_mtx);
if (msqkptr->u.msg_qbytes == 0) {
DPRINTF(("no such message queue id\n"));
error = EINVAL;
goto done2;
}
if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("wrong sequence number\n"));
error = EINVAL;
goto done2;
}
if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_R))) {
DPRINTF(("requester doesn't have read access\n"));
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msqrcv(td->td_ucred, msqkptr);
if (error != 0)
goto done2;
#endif
msghdr = NULL;
while (msghdr == NULL) {
if (msgtyp == 0) {
msghdr = msqkptr->u.msg_first;
if (msghdr != NULL) {
if (msgsz < msghdr->msg_ts &&
(msgflg & MSG_NOERROR) == 0) {
DPRINTF(("first message on the queue "
"is too big (want %zu, got %d)\n",
msgsz, msghdr->msg_ts));
error = E2BIG;
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msgrcv(td->td_ucred,
msghdr);
if (error != 0)
goto done2;
#endif
if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
msqkptr->u.msg_first = NULL;
msqkptr->u.msg_last = NULL;
} else {
msqkptr->u.msg_first = msghdr->msg_next;
if (msqkptr->u.msg_first == NULL)
panic("msg_first/last screwed up #1");
}
}
} else {
struct msg *previous;
struct msg **prev;
previous = NULL;
prev = &(msqkptr->u.msg_first);
while ((msghdr = *prev) != NULL) {
/*
* Is this message's type an exact match or is
* this message's type less than or equal to
* the absolute value of a negative msgtyp?
* Note that the second half of this test can
* NEVER be true if msgtyp is positive since
* msg_type is always positive!
*/
if (msgtyp == msghdr->msg_type ||
msghdr->msg_type <= -msgtyp) {
DPRINTF(("found message type %ld, "
"requested %ld\n",
msghdr->msg_type, msgtyp));
if (msgsz < msghdr->msg_ts &&
(msgflg & MSG_NOERROR) == 0) {
DPRINTF(("requested message "
"on the queue is too big "
"(want %zu, got %hu)\n",
msgsz, msghdr->msg_ts));
error = E2BIG;
goto done2;
}
#ifdef MAC
error = mac_sysvmsq_check_msgrcv(
td->td_ucred, msghdr);
if (error != 0)
goto done2;
#endif
*prev = msghdr->msg_next;
if (msghdr == msqkptr->u.msg_last) {
if (previous == NULL) {
if (prev !=
&msqkptr->u.msg_first)
panic("msg_first/last screwed up #2");
msqkptr->u.msg_first =
NULL;
msqkptr->u.msg_last =
NULL;
} else {
if (prev ==
&msqkptr->u.msg_first)
panic("msg_first/last screwed up #3");
msqkptr->u.msg_last =
previous;
}
}
break;
}
previous = msghdr;
prev = &(msghdr->msg_next);
}
}
/*
* We've either extracted the msghdr for the appropriate
* message or there isn't one.
* If there is one then bail out of this loop.
*/
if (msghdr != NULL)
break;
/*
* Hmph! No message found. Does the user want to wait?
*/
if ((msgflg & IPC_NOWAIT) != 0) {
DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
msgtyp));
/* The SVID says to return ENOMSG. */
error = ENOMSG;
goto done2;
}
/*
* Wait for something to happen
*/
DPRINTF(("msgrcv: goodnight\n"));
error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
"msgrcv", 0);
DPRINTF(("msgrcv: good morning (error=%d)\n", error));
if (error != 0) {
DPRINTF(("msgrcv: interrupted system call\n"));
error = EINTR;
goto done2;
}
/*
* Make sure that the msq queue still exists
*/
if (msqkptr->u.msg_qbytes == 0 ||
msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
DPRINTF(("msqid deleted\n"));
error = EIDRM;
goto done2;
}
}
/*
* Return the message to the user.
*
* First, do the bookkeeping (before we risk being interrupted).
*/
msqkptr->u.msg_cbytes -= msghdr->msg_ts;
msqkptr->u.msg_qnum--;
msqkptr->u.msg_lrpid = td->td_proc->p_pid;
msqkptr->u.msg_rtime = time_second;
racct_sub_cred(msqkptr->cred, RACCT_MSGQQUEUED, 1);
racct_sub_cred(msqkptr->cred, RACCT_MSGQSIZE, msghdr->msg_ts);
/*
* Make msgsz the actual amount that we'll be returning.
* Note that this effectively truncates the message if it is too long
* (since msgsz is never increased).
*/
DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
msghdr->msg_ts));
if (msgsz > msghdr->msg_ts)
msgsz = msghdr->msg_ts;
*mtype = msghdr->msg_type;
/*
* Return the segments to the user
*/
next = msghdr->msg_spot;
for (len = 0; len < msgsz; len += msginfo.msgssz) {
size_t tlen;
if (msgsz - len > msginfo.msgssz)
tlen = msginfo.msgssz;
else
tlen = msgsz - len;
if (next <= -1)
panic("next too low #3");
if (next >= msginfo.msgseg)
panic("next out of range #3");
mtx_unlock(&msq_mtx);
error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
mtx_lock(&msq_mtx);
if (error != 0) {
DPRINTF(("error (%d) copying out message segment\n",
error));
msg_freehdr(msghdr);
wakeup(msqkptr);
goto done2;
}
msgp = (char *)msgp + tlen;
next = msgmaps[next].next;
}
/*
* Done, return the actual number of bytes copied out.
*/
msg_freehdr(msghdr);
wakeup(msqkptr);
td->td_retval[0] = msgsz;
done2:
mtx_unlock(&msq_mtx);
return (error);
}
int
-msgrcv(td, uap)
+sys_msgrcv(td, uap)
struct thread *td;
register struct msgrcv_args *uap;
{
int error;
long mtype;
DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
if ((error = kern_msgrcv(td, uap->msqid,
(char *)uap->msgp + sizeof(mtype), uap->msgsz,
uap->msgtyp, uap->msgflg, &mtype)) != 0)
return (error);
if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
DPRINTF(("error %d copying the message type\n", error));
return (error);
}
static int
sysctl_msqids(SYSCTL_HANDLER_ARGS)
{
return (SYSCTL_OUT(req, msqids,
sizeof(struct msqid_kernel) * msginfo.msgmni));
}
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
"Maximum message size");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
"Number of message queue identifiers");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgmnb, CTLFLAG_RDTUN, &msginfo.msgmnb, 0,
"Maximum number of bytes in a queue");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgtql, CTLFLAG_RDTUN, &msginfo.msgtql, 0,
"Maximum number of messages in the system");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgssz, CTLFLAG_RDTUN, &msginfo.msgssz, 0,
"Size of a message segment");
SYSCTL_INT(_kern_ipc, OID_AUTO, msgseg, CTLFLAG_RDTUN, &msginfo.msgseg, 0,
"Number of message segments");
SYSCTL_PROC(_kern_ipc, OID_AUTO, msqids, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_msqids, "", "Message queue IDs");
#ifdef COMPAT_FREEBSD32
int
freebsd32_msgsys(struct thread *td, struct freebsd32_msgsys_args *uap)
{
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
switch (uap->which) {
case 0:
return (freebsd7_freebsd32_msgctl(td,
(struct freebsd7_freebsd32_msgctl_args *)&uap->a2));
case 2:
return (freebsd32_msgsnd(td,
(struct freebsd32_msgsnd_args *)&uap->a2));
case 3:
return (freebsd32_msgrcv(td,
(struct freebsd32_msgrcv_args *)&uap->a2));
default:
- return (msgsys(td, (struct msgsys_args *)uap));
+ return (sys_msgsys(td, (struct msgsys_args *)uap));
}
#else
return (nosys(td, NULL));
#endif
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
int
freebsd7_freebsd32_msgctl(struct thread *td,
struct freebsd7_freebsd32_msgctl_args *uap)
{
struct msqid_ds msqbuf;
struct msqid_ds32_old msqbuf32;
int error;
if (uap->cmd == IPC_SET) {
error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
if (error)
return (error);
freebsd32_ipcperm_old_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
PTRIN_CP(msqbuf32, msqbuf, msg_first);
PTRIN_CP(msqbuf32, msqbuf, msg_last);
CP(msqbuf32, msqbuf, msg_cbytes);
CP(msqbuf32, msqbuf, msg_qnum);
CP(msqbuf32, msqbuf, msg_qbytes);
CP(msqbuf32, msqbuf, msg_lspid);
CP(msqbuf32, msqbuf, msg_lrpid);
CP(msqbuf32, msqbuf, msg_stime);
CP(msqbuf32, msqbuf, msg_rtime);
CP(msqbuf32, msqbuf, msg_ctime);
}
error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
if (error)
return (error);
if (uap->cmd == IPC_STAT) {
bzero(&msqbuf32, sizeof(msqbuf32));
freebsd32_ipcperm_old_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
PTROUT_CP(msqbuf, msqbuf32, msg_first);
PTROUT_CP(msqbuf, msqbuf32, msg_last);
CP(msqbuf, msqbuf32, msg_cbytes);
CP(msqbuf, msqbuf32, msg_qnum);
CP(msqbuf, msqbuf32, msg_qbytes);
CP(msqbuf, msqbuf32, msg_lspid);
CP(msqbuf, msqbuf32, msg_lrpid);
CP(msqbuf, msqbuf32, msg_stime);
CP(msqbuf, msqbuf32, msg_rtime);
CP(msqbuf, msqbuf32, msg_ctime);
error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
}
return (error);
}
#endif
int
freebsd32_msgctl(struct thread *td, struct freebsd32_msgctl_args *uap)
{
struct msqid_ds msqbuf;
struct msqid_ds32 msqbuf32;
int error;
if (uap->cmd == IPC_SET) {
error = copyin(uap->buf, &msqbuf32, sizeof(msqbuf32));
if (error)
return (error);
freebsd32_ipcperm_in(&msqbuf32.msg_perm, &msqbuf.msg_perm);
PTRIN_CP(msqbuf32, msqbuf, msg_first);
PTRIN_CP(msqbuf32, msqbuf, msg_last);
CP(msqbuf32, msqbuf, msg_cbytes);
CP(msqbuf32, msqbuf, msg_qnum);
CP(msqbuf32, msqbuf, msg_qbytes);
CP(msqbuf32, msqbuf, msg_lspid);
CP(msqbuf32, msqbuf, msg_lrpid);
CP(msqbuf32, msqbuf, msg_stime);
CP(msqbuf32, msqbuf, msg_rtime);
CP(msqbuf32, msqbuf, msg_ctime);
}
error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
if (error)
return (error);
if (uap->cmd == IPC_STAT) {
freebsd32_ipcperm_out(&msqbuf.msg_perm, &msqbuf32.msg_perm);
PTROUT_CP(msqbuf, msqbuf32, msg_first);
PTROUT_CP(msqbuf, msqbuf32, msg_last);
CP(msqbuf, msqbuf32, msg_cbytes);
CP(msqbuf, msqbuf32, msg_qnum);
CP(msqbuf, msqbuf32, msg_qbytes);
CP(msqbuf, msqbuf32, msg_lspid);
CP(msqbuf, msqbuf32, msg_lrpid);
CP(msqbuf, msqbuf32, msg_stime);
CP(msqbuf, msqbuf32, msg_rtime);
CP(msqbuf, msqbuf32, msg_ctime);
error = copyout(&msqbuf32, uap->buf, sizeof(struct msqid_ds32));
}
return (error);
}
int
freebsd32_msgsnd(struct thread *td, struct freebsd32_msgsnd_args *uap)
{
const void *msgp;
long mtype;
int32_t mtype32;
int error;
msgp = PTRIN(uap->msgp);
if ((error = copyin(msgp, &mtype32, sizeof(mtype32))) != 0)
return (error);
mtype = mtype32;
return (kern_msgsnd(td, uap->msqid,
(const char *)msgp + sizeof(mtype32),
uap->msgsz, uap->msgflg, mtype));
}
int
freebsd32_msgrcv(struct thread *td, struct freebsd32_msgrcv_args *uap)
{
void *msgp;
long mtype;
int32_t mtype32;
int error;
msgp = PTRIN(uap->msgp);
if ((error = kern_msgrcv(td, uap->msqid,
(char *)msgp + sizeof(mtype32), uap->msgsz,
uap->msgtyp, uap->msgflg, &mtype)) != 0)
return (error);
mtype32 = (int32_t)mtype;
return (copyout(&mtype32, msgp, sizeof(mtype32)));
}
#endif
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
/* XXX casting to (sy_call_t *) is bogus, as usual. */
static sy_call_t *msgcalls[] = {
- (sy_call_t *)freebsd7_msgctl, (sy_call_t *)msgget,
- (sy_call_t *)msgsnd, (sy_call_t *)msgrcv
+ (sy_call_t *)freebsd7_msgctl, (sy_call_t *)sys_msgget,
+ (sy_call_t *)sys_msgsnd, (sy_call_t *)sys_msgrcv
};
/*
* Entry point for all MSG calls.
*/
int
-msgsys(td, uap)
+sys_msgsys(td, uap)
struct thread *td;
/* XXX actually varargs. */
struct msgsys_args /* {
int which;
int a2;
int a3;
int a4;
int a5;
int a6;
} */ *uap;
{
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
if (uap->which < 0 ||
uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
return (EINVAL);
error = (*msgcalls[uap->which])(td, &uap->a2);
return (error);
}
#ifndef CP
#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
#endif
#ifndef _SYS_SYSPROTO_H_
struct freebsd7_msgctl_args {
int msqid;
int cmd;
struct msqid_ds_old *buf;
};
#endif
int
freebsd7_msgctl(td, uap)
struct thread *td;
struct freebsd7_msgctl_args *uap;
{
struct msqid_ds_old msqold;
struct msqid_ds msqbuf;
int error;
DPRINTF(("call to freebsd7_msgctl(%d, %d, %p)\n", uap->msqid, uap->cmd,
uap->buf));
if (uap->cmd == IPC_SET) {
error = copyin(uap->buf, &msqold, sizeof(msqold));
if (error)
return (error);
ipcperm_old2new(&msqold.msg_perm, &msqbuf.msg_perm);
CP(msqold, msqbuf, msg_first);
CP(msqold, msqbuf, msg_last);
CP(msqold, msqbuf, msg_cbytes);
CP(msqold, msqbuf, msg_qnum);
CP(msqold, msqbuf, msg_qbytes);
CP(msqold, msqbuf, msg_lspid);
CP(msqold, msqbuf, msg_lrpid);
CP(msqold, msqbuf, msg_stime);
CP(msqold, msqbuf, msg_rtime);
CP(msqold, msqbuf, msg_ctime);
}
error = kern_msgctl(td, uap->msqid, uap->cmd, &msqbuf);
if (error)
return (error);
if (uap->cmd == IPC_STAT) {
bzero(&msqold, sizeof(msqold));
ipcperm_new2old(&msqbuf.msg_perm, &msqold.msg_perm);
CP(msqbuf, msqold, msg_first);
CP(msqbuf, msqold, msg_last);
CP(msqbuf, msqold, msg_cbytes);
CP(msqbuf, msqold, msg_qnum);
CP(msqbuf, msqold, msg_qbytes);
CP(msqbuf, msqold, msg_lspid);
CP(msqbuf, msqold, msg_lrpid);
CP(msqbuf, msqold, msg_stime);
CP(msqbuf, msqold, msg_rtime);
CP(msqbuf, msqold, msg_ctime);
error = copyout(&msqold, uap->buf, sizeof(struct msqid_ds_old));
}
return (error);
}
#undef CP
#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
COMPAT_FREEBSD7 */
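For orientation, a minimal userland sketch (not part of the diff) of how the msgget/msgsnd/msgrcv/msgctl syscalls implemented above are typically exercised; the function name msg_example, the key 0x1234, the permissions, and the message layout are arbitrary illustrative values. Note how the long mtype leads the user buffer, matching the way kern_msgsnd()/kern_msgrcv() split mtype from mtext.

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <err.h>
#include <string.h>

struct mymsg {
	long mtype;		/* must be >= 1; kern_msgsnd() rejects smaller values */
	char mtext[64];		/* copied in/out segment by segment by the kernel */
};

int
msg_example(void)
{
	struct mymsg m = { .mtype = 1 };
	int id;

	strlcpy(m.mtext, "hello", sizeof(m.mtext));
	if ((id = msgget((key_t)0x1234, IPC_CREAT | 0600)) == -1)
		err(1, "msgget");
	if (msgsnd(id, &m, sizeof(m.mtext), 0) == -1)
		err(1, "msgsnd");
	if (msgrcv(id, &m, sizeof(m.mtext), 1, 0) == -1)
		err(1, "msgrcv");
	return (msgctl(id, IPC_RMID, NULL));
}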
Index: head/sys/kern/sysv_sem.c
===================================================================
--- head/sys/kern/sysv_sem.c (revision 225616)
+++ head/sys/kern/sysv_sem.c (revision 225617)
@@ -1,1666 +1,1666 @@
/*-
* Implementation of SVID semaphores
*
* Author: Daniel Boulet
*
* This software is provided ``AS IS'' without any warranties of any kind.
*/
/*-
* Copyright (c) 2003-2005 McAfee, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by McAfee
* Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
* program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/racct.h>
#include <sys/sem.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/jail.h>
#include <security/mac/mac_framework.h>
FEATURE(sysv_sem, "System V semaphores support");
static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
#ifdef SEM_DEBUG
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
static int seminit(void);
static int sysvsem_modload(struct module *, int, void *);
static int semunload(void);
static void semexit_myhook(void *arg, struct proc *p);
static int sysctl_sema(SYSCTL_HANDLER_ARGS);
static int semvalid(int semid, struct semid_kernel *semakptr);
#ifndef _SYS_SYSPROTO_H_
struct __semctl_args;
int __semctl(struct thread *td, struct __semctl_args *uap);
struct semget_args;
int semget(struct thread *td, struct semget_args *uap);
struct semop_args;
int semop(struct thread *td, struct semop_args *uap);
#endif
static struct sem_undo *semu_alloc(struct thread *td);
static int semundo_adjust(struct thread *td, struct sem_undo **supptr,
int semid, int semseq, int semnum, int adjval);
static void semundo_clear(int semid, int semnum);
static struct mtx sem_mtx; /* semaphore global lock */
static struct mtx sem_undo_mtx;
static int semtot = 0;
static struct semid_kernel *sema; /* semaphore id pool */
static struct mtx *sema_mtx; /* semaphore id pool mutexes*/
static struct sem *sem; /* semaphore pool */
LIST_HEAD(, sem_undo) semu_list; /* list of active undo structures */
LIST_HEAD(, sem_undo) semu_free_list; /* list of free undo structures */
static int *semu; /* undo structure pool */
static eventhandler_tag semexit_tag;
#define SEMUNDO_MTX sem_undo_mtx
#define SEMUNDO_LOCK() mtx_lock(&SEMUNDO_MTX);
#define SEMUNDO_UNLOCK() mtx_unlock(&SEMUNDO_MTX);
#define SEMUNDO_LOCKASSERT(how) mtx_assert(&SEMUNDO_MTX, (how));
struct sem {
u_short semval; /* semaphore value */
pid_t sempid; /* pid of last operation */
u_short semncnt; /* # awaiting semval > cval */
u_short semzcnt; /* # awaiting semval = 0 */
};
/*
* Undo structure (one per process)
*/
struct sem_undo {
LIST_ENTRY(sem_undo) un_next; /* ptr to next active undo structure */
struct proc *un_proc; /* owner of this structure */
short un_cnt; /* # of active entries */
struct undo {
short un_adjval; /* adjust on exit values */
short un_num; /* semaphore # */
int un_id; /* semid */
unsigned short un_seq;
} un_ent[1]; /* undo entries */
};
/*
* Configuration parameters
*/
#ifndef SEMMNI
#define SEMMNI 50 /* # of semaphore identifiers */
#endif
#ifndef SEMMNS
#define SEMMNS 340 /* # of semaphores in system */
#endif
#ifndef SEMUME
#define SEMUME 50 /* max # of undo entries per process */
#endif
#ifndef SEMMNU
#define SEMMNU 150 /* # of undo structures in system */
#endif
/* shouldn't need tuning */
#ifndef SEMMSL
#define SEMMSL SEMMNS /* max # of semaphores per id */
#endif
#ifndef SEMOPM
#define SEMOPM 100 /* max # of operations per semop call */
#endif
#define SEMVMX 32767 /* semaphore maximum value */
#define SEMAEM 16384 /* adjust on exit max value */
/*
* Due to the way semaphore memory is allocated, we have to ensure that
* SEMUSZ is properly aligned.
*/
#define SEM_ALIGN(bytes) (((bytes) + (sizeof(long) - 1)) & ~(sizeof(long) - 1))
/* actual size of an undo structure */
#define SEMUSZ SEM_ALIGN(offsetof(struct sem_undo, un_ent[SEMUME]))
/*
* Macro to find a particular sem_undo vector
*/
#define SEMU(ix) \
((struct sem_undo *)(((intptr_t)semu)+ix * seminfo.semusz))
/*
* semaphore info struct
*/
struct seminfo seminfo = {
SEMMNI, /* # of semaphore identifiers */
SEMMNS, /* # of semaphores in system */
SEMMNU, /* # of undo structures in system */
SEMMSL, /* max # of semaphores per id */
SEMOPM, /* max # of operations per semop call */
SEMUME, /* max # of undo entries per process */
SEMUSZ, /* size in bytes of undo structure */
SEMVMX, /* semaphore maximum value */
SEMAEM /* adjust on exit max value */
};
SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
"Number of semaphore identifiers");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmns, CTLFLAG_RDTUN, &seminfo.semmns, 0,
"Maximum number of semaphores in the system");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmnu, CTLFLAG_RDTUN, &seminfo.semmnu, 0,
"Maximum number of undo structures in the system");
SYSCTL_INT(_kern_ipc, OID_AUTO, semmsl, CTLFLAG_RW, &seminfo.semmsl, 0,
"Max semaphores per id");
SYSCTL_INT(_kern_ipc, OID_AUTO, semopm, CTLFLAG_RDTUN, &seminfo.semopm, 0,
"Max operations per semop call");
SYSCTL_INT(_kern_ipc, OID_AUTO, semume, CTLFLAG_RDTUN, &seminfo.semume, 0,
"Max undo entries per process");
SYSCTL_INT(_kern_ipc, OID_AUTO, semusz, CTLFLAG_RDTUN, &seminfo.semusz, 0,
"Size in bytes of undo structure");
SYSCTL_INT(_kern_ipc, OID_AUTO, semvmx, CTLFLAG_RW, &seminfo.semvmx, 0,
"Semaphore maximum value");
SYSCTL_INT(_kern_ipc, OID_AUTO, semaem, CTLFLAG_RW, &seminfo.semaem, 0,
"Adjust on exit max value");
SYSCTL_PROC(_kern_ipc, OID_AUTO, sema, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_sema, "", "Semaphore id pool");
static struct syscall_helper_data sem_syscalls[] = {
SYSCALL_INIT_HELPER(__semctl),
SYSCALL_INIT_HELPER(semget),
SYSCALL_INIT_HELPER(semop),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL_INIT_HELPER(semsys),
- SYSCALL_INIT_HELPER(freebsd7___semctl),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7___semctl),
#endif
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data sem32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_semctl),
- SYSCALL32_INIT_HELPER(semget),
- SYSCALL32_INIT_HELPER(semop),
+ SYSCALL32_INIT_HELPER_COMPAT(semget),
+ SYSCALL32_INIT_HELPER_COMPAT(semop),
SYSCALL32_INIT_HELPER(freebsd32_semsys),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL32_INIT_HELPER(freebsd7_freebsd32_semctl),
#endif
SYSCALL_INIT_LAST
};
#endif
static int
seminit(void)
{
int i, error;
TUNABLE_INT_FETCH("kern.ipc.semmni", &seminfo.semmni);
TUNABLE_INT_FETCH("kern.ipc.semmns", &seminfo.semmns);
TUNABLE_INT_FETCH("kern.ipc.semmnu", &seminfo.semmnu);
TUNABLE_INT_FETCH("kern.ipc.semmsl", &seminfo.semmsl);
TUNABLE_INT_FETCH("kern.ipc.semopm", &seminfo.semopm);
TUNABLE_INT_FETCH("kern.ipc.semume", &seminfo.semume);
TUNABLE_INT_FETCH("kern.ipc.semusz", &seminfo.semusz);
TUNABLE_INT_FETCH("kern.ipc.semvmx", &seminfo.semvmx);
TUNABLE_INT_FETCH("kern.ipc.semaem", &seminfo.semaem);
sem = malloc(sizeof(struct sem) * seminfo.semmns, M_SEM, M_WAITOK);
sema = malloc(sizeof(struct semid_kernel) * seminfo.semmni, M_SEM,
M_WAITOK);
sema_mtx = malloc(sizeof(struct mtx) * seminfo.semmni, M_SEM,
M_WAITOK | M_ZERO);
semu = malloc(seminfo.semmnu * seminfo.semusz, M_SEM, M_WAITOK);
for (i = 0; i < seminfo.semmni; i++) {
sema[i].u.sem_base = 0;
sema[i].u.sem_perm.mode = 0;
sema[i].u.sem_perm.seq = 0;
#ifdef MAC
mac_sysvsem_init(&sema[i]);
#endif
}
for (i = 0; i < seminfo.semmni; i++)
mtx_init(&sema_mtx[i], "semid", NULL, MTX_DEF);
LIST_INIT(&semu_free_list);
for (i = 0; i < seminfo.semmnu; i++) {
struct sem_undo *suptr = SEMU(i);
suptr->un_proc = NULL;
LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
}
LIST_INIT(&semu_list);
mtx_init(&sem_mtx, "sem", NULL, MTX_DEF);
mtx_init(&sem_undo_mtx, "semu", NULL, MTX_DEF);
semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook, NULL,
EVENTHANDLER_PRI_ANY);
error = syscall_helper_register(sem_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(sem32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
static int
semunload(void)
{
int i;
/* XXXKIB */
if (semtot != 0)
return (EBUSY);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(sem32_syscalls);
#endif
syscall_helper_unregister(sem_syscalls);
EVENTHANDLER_DEREGISTER(process_exit, semexit_tag);
#ifdef MAC
for (i = 0; i < seminfo.semmni; i++)
mac_sysvsem_destroy(&sema[i]);
#endif
free(sem, M_SEM);
free(sema, M_SEM);
free(semu, M_SEM);
for (i = 0; i < seminfo.semmni; i++)
mtx_destroy(&sema_mtx[i]);
free(sema_mtx, M_SEM);
mtx_destroy(&sem_mtx);
mtx_destroy(&sem_undo_mtx);
return (0);
}
static int
sysvsem_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = seminit();
if (error != 0)
semunload();
break;
case MOD_UNLOAD:
error = semunload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sysvsem_mod = {
"sysvsem",
&sysvsem_modload,
NULL
};
DECLARE_MODULE(sysvsem, sysvsem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
MODULE_VERSION(sysvsem, 1);
/*
* Allocate a new sem_undo structure for a process
* (returns ptr to structure or NULL if no more room)
*/
static struct sem_undo *
semu_alloc(struct thread *td)
{
struct sem_undo *suptr;
SEMUNDO_LOCKASSERT(MA_OWNED);
if ((suptr = LIST_FIRST(&semu_free_list)) == NULL)
return (NULL);
LIST_REMOVE(suptr, un_next);
LIST_INSERT_HEAD(&semu_list, suptr, un_next);
suptr->un_cnt = 0;
suptr->un_proc = td->td_proc;
return (suptr);
}
static int
semu_try_free(struct sem_undo *suptr)
{
SEMUNDO_LOCKASSERT(MA_OWNED);
if (suptr->un_cnt != 0)
return (0);
LIST_REMOVE(suptr, un_next);
LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
return (1);
}
/*
* Adjust a particular entry for a particular proc
*/
static int
semundo_adjust(struct thread *td, struct sem_undo **supptr, int semid,
int semseq, int semnum, int adjval)
{
struct proc *p = td->td_proc;
struct sem_undo *suptr;
struct undo *sunptr;
int i;
SEMUNDO_LOCKASSERT(MA_OWNED);
/* Look for and remember the sem_undo if the caller doesn't provide
it */
suptr = *supptr;
if (suptr == NULL) {
LIST_FOREACH(suptr, &semu_list, un_next) {
if (suptr->un_proc == p) {
*supptr = suptr;
break;
}
}
if (suptr == NULL) {
if (adjval == 0)
return(0);
suptr = semu_alloc(td);
if (suptr == NULL)
return (ENOSPC);
*supptr = suptr;
}
}
/*
* Look for the requested entry and adjust it (delete if adjval becomes
* 0).
*/
sunptr = &suptr->un_ent[0];
for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
if (sunptr->un_id != semid || sunptr->un_num != semnum)
continue;
if (adjval != 0) {
adjval += sunptr->un_adjval;
if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
return (ERANGE);
}
sunptr->un_adjval = adjval;
if (sunptr->un_adjval == 0) {
suptr->un_cnt--;
if (i < suptr->un_cnt)
suptr->un_ent[i] =
suptr->un_ent[suptr->un_cnt];
if (suptr->un_cnt == 0)
semu_try_free(suptr);
}
return (0);
}
/* Didn't find the right entry - create it */
if (adjval == 0)
return (0);
if (adjval > seminfo.semaem || adjval < -seminfo.semaem)
return (ERANGE);
if (suptr->un_cnt != seminfo.semume) {
sunptr = &suptr->un_ent[suptr->un_cnt];
suptr->un_cnt++;
sunptr->un_adjval = adjval;
sunptr->un_id = semid;
sunptr->un_num = semnum;
sunptr->un_seq = semseq;
} else
return (EINVAL);
return (0);
}
static void
semundo_clear(int semid, int semnum)
{
struct sem_undo *suptr, *suptr1;
struct undo *sunptr;
int i;
SEMUNDO_LOCKASSERT(MA_OWNED);
LIST_FOREACH_SAFE(suptr, &semu_list, un_next, suptr1) {
sunptr = &suptr->un_ent[0];
for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
if (sunptr->un_id != semid)
continue;
if (semnum == -1 || sunptr->un_num == semnum) {
suptr->un_cnt--;
if (i < suptr->un_cnt) {
suptr->un_ent[i] =
suptr->un_ent[suptr->un_cnt];
continue;
}
semu_try_free(suptr);
}
if (semnum != -1)
break;
}
}
}
static int
semvalid(int semid, struct semid_kernel *semakptr)
{
return ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
semakptr->u.sem_perm.seq != IPCID_TO_SEQ(semid) ? EINVAL : 0);
}
/*
* Note that the user-mode half of this passes a union, not a pointer.
*/
#ifndef _SYS_SYSPROTO_H_
struct __semctl_args {
int semid;
int semnum;
int cmd;
union semun *arg;
};
#endif
int
-__semctl(struct thread *td, struct __semctl_args *uap)
+sys___semctl(struct thread *td, struct __semctl_args *uap)
{
struct semid_ds dsbuf;
union semun arg, semun;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
if (error)
return (error);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = arg.array;
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
int
kern_semctl(struct thread *td, int semid, int semnum, int cmd,
union semun *arg, register_t *rval)
{
u_short *array;
struct ucred *cred = td->td_ucred;
int i, error;
struct semid_ds *sbuf;
struct semid_kernel *semakptr;
struct mtx *sema_mtxp;
u_short usval, count;
int semidx;
DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
semid, semnum, cmd, arg));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
array = NULL;
switch(cmd) {
case SEM_STAT:
/*
* For this command we assume semid is an array index
* rather than an IPC id.
*/
if (semid < 0 || semid >= seminfo.semmni)
return (EINVAL);
semakptr = &sema[semid];
sema_mtxp = &sema_mtx[semid];
mtx_lock(sema_mtxp);
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
error = EINVAL;
goto done2;
}
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
#ifdef MAC
error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
if (error != 0)
goto done2;
#endif
bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
*rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
mtx_unlock(sema_mtxp);
return (0);
}
semidx = IPCID_TO_IX(semid);
if (semidx < 0 || semidx >= seminfo.semmni)
return (EINVAL);
semakptr = &sema[semidx];
sema_mtxp = &sema_mtx[semidx];
if (cmd == IPC_RMID)
mtx_lock(&sem_mtx);
mtx_lock(sema_mtxp);
#ifdef MAC
error = mac_sysvsem_check_semctl(cred, semakptr, cmd);
if (error != 0)
goto done2;
#endif
error = 0;
*rval = 0;
switch (cmd) {
case IPC_RMID:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
goto done2;
semakptr->u.sem_perm.cuid = cred->cr_uid;
semakptr->u.sem_perm.uid = cred->cr_uid;
semakptr->u.sem_perm.mode = 0;
racct_sub_cred(semakptr->cred, RACCT_NSEM, semakptr->u.sem_nsems);
crfree(semakptr->cred);
semakptr->cred = NULL;
SEMUNDO_LOCK();
semundo_clear(semidx, -1);
SEMUNDO_UNLOCK();
#ifdef MAC
mac_sysvsem_cleanup(semakptr);
#endif
wakeup(semakptr);
for (i = 0; i < seminfo.semmni; i++) {
if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
sema[i].u.sem_base > semakptr->u.sem_base)
mtx_lock_flags(&sema_mtx[i], LOP_DUPOK);
}
for (i = semakptr->u.sem_base - sem; i < semtot; i++)
sem[i] = sem[i + semakptr->u.sem_nsems];
for (i = 0; i < seminfo.semmni; i++) {
if ((sema[i].u.sem_perm.mode & SEM_ALLOC) &&
sema[i].u.sem_base > semakptr->u.sem_base) {
sema[i].u.sem_base -= semakptr->u.sem_nsems;
mtx_unlock(&sema_mtx[i]);
}
}
semtot -= semakptr->u.sem_nsems;
break;
case IPC_SET:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
goto done2;
sbuf = arg->buf;
semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
~0777) | (sbuf->sem_perm.mode & 0777);
semakptr->u.sem_ctime = time_second;
break;
case IPC_STAT:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
break;
case GETNCNT:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].semncnt;
break;
case GETPID:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].sempid;
break;
case GETVAL:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].semval;
break;
case GETALL:
/*
* Unfortunately, callers of this function don't know
* in advance how many semaphores are in this set.
* While we could just allocate the maximum size array
* and pass the actual size back to the caller, that
* won't work for SETALL since we can't copyin() more
* data than the user specified as we may return a
* spurious EFAULT.
*
* Note that the number of semaphores in a set is
* fixed for the life of that set. The only way that
* the 'count' could change while we are blocked in
* malloc() is if this semaphore set were destroyed
* and a new one created with the same index.
* However, semvalid() will catch that due to the
* sequence number unless exactly 0x8000 (or a
* multiple thereof) semaphore sets for the same index
* are created and destroyed while we are in malloc!
*
*/
count = semakptr->u.sem_nsems;
mtx_unlock(sema_mtxp);
array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
mtx_lock(sema_mtxp);
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
for (i = 0; i < semakptr->u.sem_nsems; i++)
array[i] = semakptr->u.sem_base[i].semval;
mtx_unlock(sema_mtxp);
error = copyout(array, arg->array, count * sizeof(*array));
mtx_lock(sema_mtxp);
break;
case GETZCNT:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
*rval = semakptr->u.sem_base[semnum].semzcnt;
break;
case SETVAL:
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
goto done2;
if (semnum < 0 || semnum >= semakptr->u.sem_nsems) {
error = EINVAL;
goto done2;
}
if (arg->val < 0 || arg->val > seminfo.semvmx) {
error = ERANGE;
goto done2;
}
semakptr->u.sem_base[semnum].semval = arg->val;
SEMUNDO_LOCK();
semundo_clear(semidx, semnum);
SEMUNDO_UNLOCK();
wakeup(semakptr);
break;
case SETALL:
/*
* See comment on GETALL for why 'count' shouldn't change
* and why we require a userland buffer.
*/
count = semakptr->u.sem_nsems;
mtx_unlock(sema_mtxp);
array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
error = copyin(arg->array, array, count * sizeof(*array));
mtx_lock(sema_mtxp);
if (error)
break;
if ((error = semvalid(semid, semakptr)) != 0)
goto done2;
KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
goto done2;
for (i = 0; i < semakptr->u.sem_nsems; i++) {
usval = array[i];
if (usval > seminfo.semvmx) {
error = ERANGE;
break;
}
semakptr->u.sem_base[i].semval = usval;
}
SEMUNDO_LOCK();
semundo_clear(semidx, -1);
SEMUNDO_UNLOCK();
wakeup(semakptr);
break;
default:
error = EINVAL;
break;
}
done2:
mtx_unlock(sema_mtxp);
if (cmd == IPC_RMID)
mtx_unlock(&sem_mtx);
if (array != NULL)
free(array, M_TEMP);
return(error);
}
#ifndef _SYS_SYSPROTO_H_
struct semget_args {
key_t key;
int nsems;
int semflg;
};
#endif
int
-semget(struct thread *td, struct semget_args *uap)
+sys_semget(struct thread *td, struct semget_args *uap)
{
int semid, error = 0;
int key = uap->key;
int nsems = uap->nsems;
int semflg = uap->semflg;
struct ucred *cred = td->td_ucred;
DPRINTF(("semget(0x%x, %d, 0%o)\n", key, nsems, semflg));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&sem_mtx);
if (key != IPC_PRIVATE) {
for (semid = 0; semid < seminfo.semmni; semid++) {
if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) &&
sema[semid].u.sem_perm.key == key)
break;
}
if (semid < seminfo.semmni) {
DPRINTF(("found public key\n"));
if ((error = ipcperm(td, &sema[semid].u.sem_perm,
semflg & 0700))) {
goto done2;
}
if (nsems > 0 && sema[semid].u.sem_nsems < nsems) {
DPRINTF(("too small\n"));
error = EINVAL;
goto done2;
}
if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
DPRINTF(("not exclusive\n"));
error = EEXIST;
goto done2;
}
#ifdef MAC
error = mac_sysvsem_check_semget(cred, &sema[semid]);
if (error != 0)
goto done2;
#endif
goto found;
}
}
DPRINTF(("need to allocate the semid_kernel\n"));
if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
if (nsems <= 0 || nsems > seminfo.semmsl) {
DPRINTF(("nsems out of range (0<%d<=%d)\n", nsems,
seminfo.semmsl));
error = EINVAL;
goto done2;
}
if (nsems > seminfo.semmns - semtot) {
DPRINTF((
"not enough semaphores left (need %d, got %d)\n",
nsems, seminfo.semmns - semtot));
error = ENOSPC;
goto done2;
}
for (semid = 0; semid < seminfo.semmni; semid++) {
if ((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0)
break;
}
if (semid == seminfo.semmni) {
DPRINTF(("no more semid_kernel's available\n"));
error = ENOSPC;
goto done2;
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_add(td->td_proc, RACCT_NSEM, nsems);
PROC_UNLOCK(td->td_proc);
if (error != 0) {
error = ENOSPC;
goto done2;
}
#endif
DPRINTF(("semid %d is available\n", semid));
mtx_lock(&sema_mtx[semid]);
KASSERT((sema[semid].u.sem_perm.mode & SEM_ALLOC) == 0,
("Lost semaphore %d", semid));
sema[semid].u.sem_perm.key = key;
sema[semid].u.sem_perm.cuid = cred->cr_uid;
sema[semid].u.sem_perm.uid = cred->cr_uid;
sema[semid].u.sem_perm.cgid = cred->cr_gid;
sema[semid].u.sem_perm.gid = cred->cr_gid;
sema[semid].u.sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
sema[semid].cred = crhold(cred);
sema[semid].u.sem_perm.seq =
(sema[semid].u.sem_perm.seq + 1) & 0x7fff;
sema[semid].u.sem_nsems = nsems;
sema[semid].u.sem_otime = 0;
sema[semid].u.sem_ctime = time_second;
sema[semid].u.sem_base = &sem[semtot];
semtot += nsems;
bzero(sema[semid].u.sem_base,
sizeof(sema[semid].u.sem_base[0])*nsems);
#ifdef MAC
mac_sysvsem_create(cred, &sema[semid]);
#endif
mtx_unlock(&sema_mtx[semid]);
DPRINTF(("sembase = %p, next = %p\n",
sema[semid].u.sem_base, &sem[semtot]));
} else {
DPRINTF(("didn't find it and wasn't asked to create it\n"));
error = ENOENT;
goto done2;
}
found:
td->td_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].u.sem_perm);
done2:
mtx_unlock(&sem_mtx);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct semop_args {
int semid;
struct sembuf *sops;
size_t nsops;
};
#endif
int
-semop(struct thread *td, struct semop_args *uap)
+sys_semop(struct thread *td, struct semop_args *uap)
{
#define SMALL_SOPS 8
struct sembuf small_sops[SMALL_SOPS];
int semid = uap->semid;
size_t nsops = uap->nsops;
struct sembuf *sops;
struct semid_kernel *semakptr;
struct sembuf *sopptr = 0;
struct sem *semptr = 0;
struct sem_undo *suptr;
struct mtx *sema_mtxp;
size_t i, j, k;
int error;
int do_wakeup, do_undos;
unsigned short seq;
#ifdef SEM_DEBUG
sops = NULL;
#endif
DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops));
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
semid = IPCID_TO_IX(semid); /* Convert back to zero origin */
if (semid < 0 || semid >= seminfo.semmni)
return (EINVAL);
/* Allocate memory for sem_ops */
if (nsops <= SMALL_SOPS)
sops = small_sops;
else if (nsops > seminfo.semopm) {
DPRINTF(("too many sops (max=%d, nsops=%d)\n", seminfo.semopm,
nsops));
return (E2BIG);
} else {
#ifdef RACCT
PROC_LOCK(td->td_proc);
if (nsops > racct_get_available(td->td_proc, RACCT_NSEMOP)) {
PROC_UNLOCK(td->td_proc);
return (E2BIG);
}
PROC_UNLOCK(td->td_proc);
#endif
sops = malloc(nsops * sizeof(*sops), M_TEMP, M_WAITOK);
}
if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error,
uap->sops, sops, nsops * sizeof(sops[0])));
if (sops != small_sops)
free(sops, M_SEM);
return (error);
}
semakptr = &sema[semid];
sema_mtxp = &sema_mtx[semid];
mtx_lock(sema_mtxp);
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0) {
error = EINVAL;
goto done2;
}
seq = semakptr->u.sem_perm.seq;
if (seq != IPCID_TO_SEQ(uap->semid)) {
error = EINVAL;
goto done2;
}
/*
* Initial pass thru sops to see what permissions are needed.
* Also perform any checks that don't need repeating on each
* attempt to satisfy the request vector.
*/
j = 0; /* permission needed */
do_undos = 0;
for (i = 0; i < nsops; i++) {
sopptr = &sops[i];
if (sopptr->sem_num >= semakptr->u.sem_nsems) {
error = EFBIG;
goto done2;
}
if (sopptr->sem_flg & SEM_UNDO && sopptr->sem_op != 0)
do_undos = 1;
j |= (sopptr->sem_op == 0) ? SEM_R : SEM_A;
}
if ((error = ipcperm(td, &semakptr->u.sem_perm, j))) {
DPRINTF(("error = %d from ipaccess\n", error));
goto done2;
}
#ifdef MAC
error = mac_sysvsem_check_semop(td->td_ucred, semakptr, j);
if (error != 0)
goto done2;
#endif
/*
* Loop trying to satisfy the vector of requests.
* If we reach a point where we must wait, any requests already
* performed are rolled back and we go to sleep until some other
* process wakes us up. At this point, we start all over again.
*
* This ensures that from the perspective of other tasks, a set
* of requests is atomic (never partially satisfied).
*/
for (;;) {
do_wakeup = 0;
error = 0; /* error return if necessary */
for (i = 0; i < nsops; i++) {
sopptr = &sops[i];
semptr = &semakptr->u.sem_base[sopptr->sem_num];
DPRINTF((
"semop: semakptr=%p, sem_base=%p, "
"semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
semakptr, semakptr->u.sem_base, semptr,
sopptr->sem_num, semptr->semval, sopptr->sem_op,
(sopptr->sem_flg & IPC_NOWAIT) ?
"nowait" : "wait"));
if (sopptr->sem_op < 0) {
if (semptr->semval + sopptr->sem_op < 0) {
DPRINTF(("semop: can't do it now\n"));
break;
} else {
semptr->semval += sopptr->sem_op;
if (semptr->semval == 0 &&
semptr->semzcnt > 0)
do_wakeup = 1;
}
} else if (sopptr->sem_op == 0) {
if (semptr->semval != 0) {
DPRINTF(("semop: not zero now\n"));
break;
}
} else if (semptr->semval + sopptr->sem_op >
seminfo.semvmx) {
error = ERANGE;
break;
} else {
if (semptr->semncnt > 0)
do_wakeup = 1;
semptr->semval += sopptr->sem_op;
}
}
/*
* Did we get through the entire vector?
*/
if (i >= nsops)
goto done;
/*
* No ... rollback anything that we've already done
*/
DPRINTF(("semop: rollback 0 through %d\n", i-1));
for (j = 0; j < i; j++)
semakptr->u.sem_base[sops[j].sem_num].semval -=
sops[j].sem_op;
/* If we detected an error, return it */
if (error != 0)
goto done2;
/*
* If the request that we couldn't satisfy has the
* NOWAIT flag set then return with EAGAIN.
*/
if (sopptr->sem_flg & IPC_NOWAIT) {
error = EAGAIN;
goto done2;
}
if (sopptr->sem_op == 0)
semptr->semzcnt++;
else
semptr->semncnt++;
DPRINTF(("semop: good night!\n"));
error = msleep(semakptr, sema_mtxp, (PZERO - 4) | PCATCH,
"semwait", 0);
DPRINTF(("semop: good morning (error=%d)!\n", error));
/* return code is checked below, after sem[nz]cnt-- */
/*
* Make sure that the semaphore still exists
*/
seq = semakptr->u.sem_perm.seq;
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
seq != IPCID_TO_SEQ(uap->semid)) {
error = EIDRM;
goto done2;
}
/*
* Renew the semaphore's pointer after wakeup since
* during msleep sem_base may have been modified and semptr
* is not valid any more
*/
semptr = &semakptr->u.sem_base[sopptr->sem_num];
/*
* The semaphore is still alive. Readjust the count of
* waiting processes.
*/
if (sopptr->sem_op == 0)
semptr->semzcnt--;
else
semptr->semncnt--;
/*
* Is it really morning, or was our sleep interrupted?
* (Delayed check of msleep() return code because we
* need to decrement sem[nz]cnt either way.)
*/
if (error != 0) {
error = EINTR;
goto done2;
}
DPRINTF(("semop: good morning!\n"));
}
done:
/*
* Process any SEM_UNDO requests.
*/
if (do_undos) {
SEMUNDO_LOCK();
suptr = NULL;
for (i = 0; i < nsops; i++) {
/*
* We only need to deal with SEM_UNDO's for non-zero
* op's.
*/
int adjval;
if ((sops[i].sem_flg & SEM_UNDO) == 0)
continue;
adjval = sops[i].sem_op;
if (adjval == 0)
continue;
error = semundo_adjust(td, &suptr, semid, seq,
sops[i].sem_num, -adjval);
if (error == 0)
continue;
/*
* Oh-Oh! We ran out of either sem_undo's or undo's.
* Rollback the adjustments to this point and then
* rollback the semaphore ups and down so we can return
* with an error with all structures restored. We
* rollback the undo's in the exact reverse order that
* we applied them. This guarantees that we won't run
* out of space as we roll things back out.
*/
for (j = 0; j < i; j++) {
k = i - j - 1;
if ((sops[k].sem_flg & SEM_UNDO) == 0)
continue;
adjval = sops[k].sem_op;
if (adjval == 0)
continue;
if (semundo_adjust(td, &suptr, semid, seq,
sops[k].sem_num, adjval) != 0)
panic("semop - can't undo undos");
}
for (j = 0; j < nsops; j++)
semakptr->u.sem_base[sops[j].sem_num].semval -=
sops[j].sem_op;
DPRINTF(("error = %d from semundo_adjust\n", error));
SEMUNDO_UNLOCK();
goto done2;
} /* loop through the sops */
SEMUNDO_UNLOCK();
} /* if (do_undos) */
/* We're definitely done - set the sempid's and time */
for (i = 0; i < nsops; i++) {
sopptr = &sops[i];
semptr = &semakptr->u.sem_base[sopptr->sem_num];
semptr->sempid = td->td_proc->p_pid;
}
semakptr->u.sem_otime = time_second;
/*
* Do a wakeup if any semaphore was up'd whilst something was
* sleeping on it.
*/
if (do_wakeup) {
DPRINTF(("semop: doing wakeup\n"));
wakeup(semakptr);
DPRINTF(("semop: back from wakeup\n"));
}
DPRINTF(("semop: done\n"));
td->td_retval[0] = 0;
done2:
mtx_unlock(sema_mtxp);
if (sops != small_sops)
free(sops, M_SEM);
return (error);
}
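A minimal userland sketch (not part of the diff) of the semop() path above, using a SEM_UNDO'd up/down pair that exercises the undo bookkeeping handled by semundo_adjust() and semexit_myhook(); the function name sem_example, the key 0x4242, and the permissions are arbitrary illustrative values. The semaphore is created with semval 0 (sem_base is bzero'd by semget above), so the +1 operation is done first and never blocks.

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <err.h>

int
sem_example(void)
{
	struct sembuf up   = { .sem_num = 0, .sem_op =  1, .sem_flg = SEM_UNDO };
	struct sembuf down = { .sem_num = 0, .sem_op = -1, .sem_flg = SEM_UNDO };
	int id;

	/* One semaphore, created with semval == 0. */
	if ((id = semget((key_t)0x4242, 1, IPC_CREAT | 0600)) == -1)
		err(1, "semget");
	if (semop(id, &up, 1) == -1)	/* +1 with SEM_UNDO: kernel records a -1 undo entry */
		err(1, "semop up");
	if (semop(id, &down, 1) == -1)	/* -1: would sleep in msleep() if semval were still 0 */
		err(1, "semop down");
	return (semctl(id, 0, IPC_RMID));
}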
/*
* Go through the undo structures for this process and apply the adjustments to
* semaphores.
*/
static void
semexit_myhook(void *arg, struct proc *p)
{
struct sem_undo *suptr;
struct semid_kernel *semakptr;
struct mtx *sema_mtxp;
int semid, semnum, adjval, ix;
unsigned short seq;
/*
* Go through the chain of undo vectors looking for one
* associated with this process.
*/
SEMUNDO_LOCK();
LIST_FOREACH(suptr, &semu_list, un_next) {
if (suptr->un_proc == p)
break;
}
if (suptr == NULL) {
SEMUNDO_UNLOCK();
return;
}
LIST_REMOVE(suptr, un_next);
DPRINTF(("proc @%p has undo structure with %d entries\n", p,
suptr->un_cnt));
/*
* If there are any active undo elements then process them.
*/
if (suptr->un_cnt > 0) {
SEMUNDO_UNLOCK();
for (ix = 0; ix < suptr->un_cnt; ix++) {
semid = suptr->un_ent[ix].un_id;
semnum = suptr->un_ent[ix].un_num;
adjval = suptr->un_ent[ix].un_adjval;
seq = suptr->un_ent[ix].un_seq;
semakptr = &sema[semid];
sema_mtxp = &sema_mtx[semid];
mtx_lock(sema_mtxp);
if ((semakptr->u.sem_perm.mode & SEM_ALLOC) == 0 ||
(semakptr->u.sem_perm.seq != seq)) {
mtx_unlock(sema_mtxp);
continue;
}
if (semnum >= semakptr->u.sem_nsems)
panic("semexit - semnum out of range");
DPRINTF((
"semexit: %p id=%d num=%d(adj=%d) ; sem=%d\n",
suptr->un_proc, suptr->un_ent[ix].un_id,
suptr->un_ent[ix].un_num,
suptr->un_ent[ix].un_adjval,
semakptr->u.sem_base[semnum].semval));
if (adjval < 0 && semakptr->u.sem_base[semnum].semval <
-adjval)
semakptr->u.sem_base[semnum].semval = 0;
else
semakptr->u.sem_base[semnum].semval += adjval;
wakeup(semakptr);
DPRINTF(("semexit: back from wakeup\n"));
mtx_unlock(sema_mtxp);
}
SEMUNDO_LOCK();
}
/*
* Deallocate the undo vector.
*/
DPRINTF(("removing vector\n"));
suptr->un_proc = NULL;
suptr->un_cnt = 0;
LIST_INSERT_HEAD(&semu_free_list, suptr, un_next);
SEMUNDO_UNLOCK();
}
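/*
 * Illustrative sketch (assumed names, not part of this hunk): a hook like
 * semexit_myhook() is normally wired to process teardown through the
 * eventhandler machinery at module initialization time, roughly as below.
 */
#if 0
#include <sys/eventhandler.h>

static eventhandler_tag semexit_tag;	/* hypothetical tag variable */

static void
sem_register_exit_hook(void)
{
	/* Run semexit_myhook(NULL, p) whenever a process exits. */
	semexit_tag = EVENTHANDLER_REGISTER(process_exit, semexit_myhook,
	    NULL, EVENTHANDLER_PRI_ANY);
}
#endif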
static int
sysctl_sema(SYSCTL_HANDLER_ARGS)
{
return (SYSCTL_OUT(req, sema,
sizeof(struct semid_kernel) * seminfo.semmni));
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
/* XXX casting to (sy_call_t *) is bogus, as usual. */
static sy_call_t *semcalls[] = {
- (sy_call_t *)freebsd7___semctl, (sy_call_t *)semget,
- (sy_call_t *)semop
+ (sy_call_t *)freebsd7___semctl, (sy_call_t *)sys_semget,
+ (sy_call_t *)sys_semop
};
/*
* Entry point for all SEM calls.
*/
int
-semsys(td, uap)
+sys_semsys(td, uap)
struct thread *td;
/* XXX actually varargs. */
struct semsys_args /* {
int which;
int a2;
int a3;
int a4;
int a5;
} */ *uap;
{
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
if (uap->which < 0 ||
uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
return (EINVAL);
error = (*semcalls[uap->which])(td, &uap->a2);
return (error);
}
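/*
 * Illustrative userland sketch (assumed example, not part of this file):
 * code normally reaches sys_semget()/sys_semop() through the libc
 * wrappers rather than the legacy semsys() multiplexer dispatched above.
 * The SEM_UNDO flag below is what creates the undo entries that
 * semexit_myhook() later rolls back.  Error handling is abbreviated.
 */
#if 0
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>

static int
bump_with_undo(key_t key)
{
	struct sembuf op;
	int semid;

	semid = semget(key, 1, IPC_CREAT | 0600);
	if (semid == -1)
		return (-1);
	op.sem_num = 0;
	op.sem_op = 1;		/* "up" the semaphore by one */
	op.sem_flg = SEM_UNDO;	/* undone automatically at process exit */
	return (semop(semid, &op, 1));
}
#endif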
#ifndef CP
#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
#endif
#ifndef _SYS_SYSPROTO_H_
struct freebsd7___semctl_args {
int semid;
int semnum;
int cmd;
union semun_old *arg;
};
#endif
int
freebsd7___semctl(struct thread *td, struct freebsd7___semctl_args *uap)
{
struct semid_ds_old dsold;
struct semid_ds dsbuf;
union semun_old arg;
union semun semun;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(arg.buf, &dsold, sizeof(dsold));
if (error)
return (error);
ipcperm_old2new(&dsold.sem_perm, &dsbuf.sem_perm);
CP(dsold, dsbuf, sem_base);
CP(dsold, dsbuf, sem_nsems);
CP(dsold, dsbuf, sem_otime);
CP(dsold, dsbuf, sem_ctime);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = arg.array;
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
bzero(&dsold, sizeof(dsold));
ipcperm_new2old(&dsbuf.sem_perm, &dsold.sem_perm);
CP(dsbuf, dsold, sem_base);
CP(dsbuf, dsold, sem_nsems);
CP(dsbuf, dsold, sem_otime);
CP(dsbuf, dsold, sem_ctime);
error = copyout(&dsold, arg.buf, sizeof(dsold));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
#endif /* COMPAT_FREEBSD{4,5,6,7} */
#ifdef COMPAT_FREEBSD32
int
freebsd32_semsys(struct thread *td, struct freebsd32_semsys_args *uap)
{
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
switch (uap->which) {
case 0:
return (freebsd7_freebsd32_semctl(td,
(struct freebsd7_freebsd32_semctl_args *)&uap->a2));
default:
- return (semsys(td, (struct semsys_args *)uap));
+ return (sys_semsys(td, (struct semsys_args *)uap));
}
#else
return (nosys(td, NULL));
#endif
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
int
freebsd7_freebsd32_semctl(struct thread *td,
struct freebsd7_freebsd32_semctl_args *uap)
{
struct semid_ds32_old dsbuf32;
struct semid_ds dsbuf;
union semun semun;
union semun32 arg;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
if (error)
return (error);
freebsd32_ipcperm_old_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
PTRIN_CP(dsbuf32, dsbuf, sem_base);
CP(dsbuf32, dsbuf, sem_nsems);
CP(dsbuf32, dsbuf, sem_otime);
CP(dsbuf32, dsbuf, sem_ctime);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = PTRIN(arg.array);
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
bzero(&dsbuf32, sizeof(dsbuf32));
freebsd32_ipcperm_old_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
PTROUT_CP(dsbuf, dsbuf32, sem_base);
CP(dsbuf, dsbuf32, sem_nsems);
CP(dsbuf, dsbuf32, sem_otime);
CP(dsbuf, dsbuf32, sem_ctime);
error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
#endif
int
freebsd32_semctl(struct thread *td, struct freebsd32_semctl_args *uap)
{
struct semid_ds32 dsbuf32;
struct semid_ds dsbuf;
union semun semun;
union semun32 arg;
register_t rval;
int error;
switch (uap->cmd) {
case SEM_STAT:
case IPC_SET:
case IPC_STAT:
case GETALL:
case SETVAL:
case SETALL:
error = copyin(uap->arg, &arg, sizeof(arg));
if (error)
return (error);
break;
}
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
semun.buf = &dsbuf;
break;
case IPC_SET:
error = copyin(PTRIN(arg.buf), &dsbuf32, sizeof(dsbuf32));
if (error)
return (error);
freebsd32_ipcperm_in(&dsbuf32.sem_perm, &dsbuf.sem_perm);
PTRIN_CP(dsbuf32, dsbuf, sem_base);
CP(dsbuf32, dsbuf, sem_nsems);
CP(dsbuf32, dsbuf, sem_otime);
CP(dsbuf32, dsbuf, sem_ctime);
semun.buf = &dsbuf;
break;
case GETALL:
case SETALL:
semun.array = PTRIN(arg.array);
break;
case SETVAL:
semun.val = arg.val;
break;
}
error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
&rval);
if (error)
return (error);
switch (uap->cmd) {
case SEM_STAT:
case IPC_STAT:
bzero(&dsbuf32, sizeof(dsbuf32));
freebsd32_ipcperm_out(&dsbuf.sem_perm, &dsbuf32.sem_perm);
PTROUT_CP(dsbuf, dsbuf32, sem_base);
CP(dsbuf, dsbuf32, sem_nsems);
CP(dsbuf, dsbuf32, sem_otime);
CP(dsbuf, dsbuf32, sem_ctime);
error = copyout(&dsbuf32, PTRIN(arg.buf), sizeof(dsbuf32));
break;
}
if (error == 0)
td->td_retval[0] = rval;
return (error);
}
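/*
 * Note on the 32-bit thunks above (illustrative): freebsd32_semctl() and
 * freebsd7_freebsd32_semctl() follow the usual compat pattern of copying
 * in the 32-bit layout, widening it into the native struct, calling
 * kern_semctl(), and narrowing the result again for copyout.  The
 * PTRIN_CP()/PTROUT_CP() helpers used for sem_base behave roughly like
 * "(dst).fld = PTRIN((src).fld)", i.e. a plain field copy combined with a
 * 32-bit-to-native pointer conversion.
 */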
#endif /* COMPAT_FREEBSD32 */
Index: head/sys/kern/sysv_shm.c
===================================================================
--- head/sys/kern/sysv_shm.c (revision 225616)
+++ head/sys/kern/sysv_shm.c (revision 225617)
@@ -1,1408 +1,1408 @@
/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
/*-
* Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Adam Glass and Charles
* Hannum.
* 4. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2003-2005 McAfee, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by McAfee
* Research, the Security Research Division of McAfee, Inc under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS research
* program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_sysvipc.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/sysctl.h>
#include <sys/shm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/jail.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
FEATURE(sysv_shm, "System V shared memory segments support");
static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
static int shmget_allocate_segment(struct thread *td,
struct shmget_args *uap, int mode);
static int shmget_existing(struct thread *td, struct shmget_args *uap,
int mode, int segnum);
#define SHMSEG_FREE 0x0200
#define SHMSEG_REMOVED 0x0400
#define SHMSEG_ALLOCATED 0x0800
#define SHMSEG_WANTED 0x1000
static int shm_last_free, shm_nused, shmalloced;
vm_size_t shm_committed;
static struct shmid_kernel *shmsegs;
struct shmmap_state {
vm_offset_t va;
int shmid;
};
static void shm_deallocate_segment(struct shmid_kernel *);
static int shm_find_segment_by_key(key_t);
static struct shmid_kernel *shm_find_segment_by_shmid(int);
static struct shmid_kernel *shm_find_segment_by_shmidx(int);
static int shm_delete_mapping(struct vmspace *vm, struct shmmap_state *);
static void shmrealloc(void);
static int shminit(void);
static int sysvshm_modload(struct module *, int, void *);
static int shmunload(void);
static void shmexit_myhook(struct vmspace *vm);
static void shmfork_myhook(struct proc *p1, struct proc *p2);
static int sysctl_shmsegs(SYSCTL_HANDLER_ARGS);
/*
* Tuneable values.
*/
#ifndef SHMMAXPGS
#define SHMMAXPGS 131072 /* Note: sysv shared memory is swap backed. */
#endif
#ifndef SHMMAX
#define SHMMAX (SHMMAXPGS*PAGE_SIZE)
#endif
#ifndef SHMMIN
#define SHMMIN 1
#endif
#ifndef SHMMNI
#define SHMMNI 192
#endif
#ifndef SHMSEG
#define SHMSEG 128
#endif
#ifndef SHMALL
#define SHMALL (SHMMAXPGS)
#endif
struct shminfo shminfo = {
SHMMAX,
SHMMIN,
SHMMNI,
SHMSEG,
SHMALL
};
static int shm_use_phys;
static int shm_allow_removed;
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
"Maximum shared memory segment size");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
"Minimum shared memory segment size");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
"Number of shared memory identifiers");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
"Number of segments per process");
SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
"Maximum number of pages available for shared memory");
SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
&shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
SYSCTL_INT(_kern_ipc, OID_AUTO, shm_allow_removed, CTLFLAG_RW,
&shm_allow_removed, 0,
"Enable/Disable attachment to attached segments marked for removal");
SYSCTL_PROC(_kern_ipc, OID_AUTO, shmsegs, CTLTYPE_OPAQUE | CTLFLAG_RD,
NULL, 0, sysctl_shmsegs, "",
"Current number of shared memory segments allocated");
static int
shm_find_segment_by_key(key)
key_t key;
{
int i;
for (i = 0; i < shmalloced; i++)
if ((shmsegs[i].u.shm_perm.mode & SHMSEG_ALLOCATED) &&
shmsegs[i].u.shm_perm.key == key)
return (i);
return (-1);
}
static struct shmid_kernel *
shm_find_segment_by_shmid(int shmid)
{
int segnum;
struct shmid_kernel *shmseg;
segnum = IPCID_TO_IX(shmid);
if (segnum < 0 || segnum >= shmalloced)
return (NULL);
shmseg = &shmsegs[segnum];
if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
(!shm_allow_removed &&
(shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0) ||
shmseg->u.shm_perm.seq != IPCID_TO_SEQ(shmid))
return (NULL);
return (shmseg);
}
static struct shmid_kernel *
shm_find_segment_by_shmidx(int segnum)
{
struct shmid_kernel *shmseg;
if (segnum < 0 || segnum >= shmalloced)
return (NULL);
shmseg = &shmsegs[segnum];
if ((shmseg->u.shm_perm.mode & SHMSEG_ALLOCATED) == 0 ||
(!shm_allow_removed &&
(shmseg->u.shm_perm.mode & SHMSEG_REMOVED) != 0))
return (NULL);
return (shmseg);
}
static void
shm_deallocate_segment(shmseg)
struct shmid_kernel *shmseg;
{
vm_size_t size;
GIANT_REQUIRED;
vm_object_deallocate(shmseg->object);
shmseg->object = NULL;
size = round_page(shmseg->u.shm_segsz);
shm_committed -= btoc(size);
shm_nused--;
shmseg->u.shm_perm.mode = SHMSEG_FREE;
#ifdef MAC
mac_sysvshm_cleanup(shmseg);
#endif
racct_sub_cred(shmseg->cred, RACCT_NSHM, 1);
racct_sub_cred(shmseg->cred, RACCT_SHMSIZE, size);
crfree(shmseg->cred);
shmseg->cred = NULL;
}
static int
shm_delete_mapping(struct vmspace *vm, struct shmmap_state *shmmap_s)
{
struct shmid_kernel *shmseg;
int segnum, result;
vm_size_t size;
GIANT_REQUIRED;
segnum = IPCID_TO_IX(shmmap_s->shmid);
shmseg = &shmsegs[segnum];
size = round_page(shmseg->u.shm_segsz);
result = vm_map_remove(&vm->vm_map, shmmap_s->va, shmmap_s->va + size);
if (result != KERN_SUCCESS)
return (EINVAL);
shmmap_s->shmid = -1;
shmseg->u.shm_dtime = time_second;
if ((--shmseg->u.shm_nattch <= 0) &&
(shmseg->u.shm_perm.mode & SHMSEG_REMOVED)) {
shm_deallocate_segment(shmseg);
shm_last_free = segnum;
}
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct shmdt_args {
const void *shmaddr;
};
#endif
int
-shmdt(td, uap)
+sys_shmdt(td, uap)
struct thread *td;
struct shmdt_args *uap;
{
struct proc *p = td->td_proc;
struct shmmap_state *shmmap_s;
#ifdef MAC
struct shmid_kernel *shmsegptr;
#endif
int i;
int error = 0;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
shmmap_s = p->p_vmspace->vm_shm;
if (shmmap_s == NULL) {
error = EINVAL;
goto done2;
}
for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) {
if (shmmap_s->shmid != -1 &&
shmmap_s->va == (vm_offset_t)uap->shmaddr) {
break;
}
}
if (i == shminfo.shmseg) {
error = EINVAL;
goto done2;
}
#ifdef MAC
shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
error = mac_sysvshm_check_shmdt(td->td_ucred, shmsegptr);
if (error != 0)
goto done2;
#endif
error = shm_delete_mapping(p->p_vmspace, shmmap_s);
done2:
mtx_unlock(&Giant);
return (error);
}
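/*
 * Illustrative userland sketch (assumed example, not part of this file):
 * the attach/detach pair handled by sys_shmdt() above and kern_shmat()
 * below.  Error handling is abbreviated.
 */
#if 0
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

static int
touch_segment(key_t key)
{
	void *p;
	int shmid;

	shmid = shmget(key, 4096, IPC_CREAT | 0600);
	if (shmid == -1)
		return (-1);
	p = shmat(shmid, NULL, 0);	/* let the kernel pick the address */
	if (p == (void *)-1)
		return (-1);
	((char *)p)[0] = 1;		/* visible to every other attacher */
	return (shmdt(p));
}
#endif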
#ifndef _SYS_SYSPROTO_H_
struct shmat_args {
int shmid;
const void *shmaddr;
int shmflg;
};
#endif
int
kern_shmat(td, shmid, shmaddr, shmflg)
struct thread *td;
int shmid;
const void *shmaddr;
int shmflg;
{
struct proc *p = td->td_proc;
int i, flags;
struct shmid_kernel *shmseg;
struct shmmap_state *shmmap_s = NULL;
vm_offset_t attach_va;
vm_prot_t prot;
vm_size_t size;
int rv;
int error = 0;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
shmmap_s = p->p_vmspace->vm_shm;
if (shmmap_s == NULL) {
shmmap_s = malloc(shminfo.shmseg * sizeof(struct shmmap_state),
M_SHM, M_WAITOK);
for (i = 0; i < shminfo.shmseg; i++)
shmmap_s[i].shmid = -1;
p->p_vmspace->vm_shm = shmmap_s;
}
shmseg = shm_find_segment_by_shmid(shmid);
if (shmseg == NULL) {
error = EINVAL;
goto done2;
}
error = ipcperm(td, &shmseg->u.shm_perm,
(shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
if (error)
goto done2;
#ifdef MAC
error = mac_sysvshm_check_shmat(td->td_ucred, shmseg, shmflg);
if (error != 0)
goto done2;
#endif
for (i = 0; i < shminfo.shmseg; i++) {
if (shmmap_s->shmid == -1)
break;
shmmap_s++;
}
if (i >= shminfo.shmseg) {
error = EMFILE;
goto done2;
}
size = round_page(shmseg->u.shm_segsz);
prot = VM_PROT_READ;
if ((shmflg & SHM_RDONLY) == 0)
prot |= VM_PROT_WRITE;
flags = MAP_ANON | MAP_SHARED;
if (shmaddr) {
flags |= MAP_FIXED;
if (shmflg & SHM_RND) {
attach_va = (vm_offset_t)shmaddr & ~(SHMLBA-1);
} else if (((vm_offset_t)shmaddr & (SHMLBA-1)) == 0) {
attach_va = (vm_offset_t)shmaddr;
} else {
error = EINVAL;
goto done2;
}
} else {
/*
* This is just a hint to vm_map_find() about where to
* put it.
*/
PROC_LOCK(p);
attach_va = round_page((vm_offset_t)p->p_vmspace->vm_daddr +
lim_max(p, RLIMIT_DATA));
PROC_UNLOCK(p);
}
vm_object_reference(shmseg->object);
rv = vm_map_find(&p->p_vmspace->vm_map, shmseg->object,
0, &attach_va, size, (flags & MAP_FIXED) ? VMFS_NO_SPACE :
VMFS_ANY_SPACE, prot, prot, 0);
if (rv != KERN_SUCCESS) {
vm_object_deallocate(shmseg->object);
error = ENOMEM;
goto done2;
}
vm_map_inherit(&p->p_vmspace->vm_map,
attach_va, attach_va + size, VM_INHERIT_SHARE);
shmmap_s->va = attach_va;
shmmap_s->shmid = shmid;
shmseg->u.shm_lpid = p->p_pid;
shmseg->u.shm_atime = time_second;
shmseg->u.shm_nattch++;
td->td_retval[0] = attach_va;
done2:
mtx_unlock(&Giant);
return (error);
}
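/*
 * Worked example of the shmaddr handling above (illustrative; SHMLBA is
 * the page size on most platforms): with SHM_RND a request for address
 * 0x123456 is rounded down via "addr & ~(SHMLBA - 1)", giving 0x123000
 * for a 4 KB SHMLBA.  Without SHM_RND the same address is rejected with
 * EINVAL because its low bits are not zero; only exactly aligned
 * addresses are mapped with MAP_FIXED.
 */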
int
-shmat(td, uap)
+sys_shmat(td, uap)
struct thread *td;
struct shmat_args *uap;
{
return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
}
int
kern_shmctl(td, shmid, cmd, buf, bufsz)
struct thread *td;
int shmid;
int cmd;
void *buf;
size_t *bufsz;
{
int error = 0;
struct shmid_kernel *shmseg;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
switch (cmd) {
/*
* It is possible that kern_shmctl is being called from the Linux ABI
* layer, in which case, we will need to implement IPC_INFO. It should
* be noted that other shmctl calls will be funneled through here for
 * Linux binaries as well.
*
* NB: The Linux ABI layer will convert this data to structure(s) more
* consistent with the Linux ABI.
*/
case IPC_INFO:
memcpy(buf, &shminfo, sizeof(shminfo));
if (bufsz)
*bufsz = sizeof(shminfo);
td->td_retval[0] = shmalloced;
goto done2;
case SHM_INFO: {
struct shm_info shm_info;
shm_info.used_ids = shm_nused;
shm_info.shm_rss = 0; /*XXX where to get from ? */
shm_info.shm_tot = 0; /*XXX where to get from ? */
shm_info.shm_swp = 0; /*XXX where to get from ? */
shm_info.swap_attempts = 0; /*XXX where to get from ? */
shm_info.swap_successes = 0; /*XXX where to get from ? */
memcpy(buf, &shm_info, sizeof(shm_info));
if (bufsz)
*bufsz = sizeof(shm_info);
td->td_retval[0] = shmalloced;
goto done2;
}
}
if (cmd == SHM_STAT)
shmseg = shm_find_segment_by_shmidx(shmid);
else
shmseg = shm_find_segment_by_shmid(shmid);
if (shmseg == NULL) {
error = EINVAL;
goto done2;
}
#ifdef MAC
error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, cmd);
if (error != 0)
goto done2;
#endif
switch (cmd) {
case SHM_STAT:
case IPC_STAT:
error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
if (error)
goto done2;
memcpy(buf, &shmseg->u, sizeof(struct shmid_ds));
if (bufsz)
*bufsz = sizeof(struct shmid_ds);
if (cmd == SHM_STAT)
td->td_retval[0] = IXSEQ_TO_IPCID(shmid, shmseg->u.shm_perm);
break;
case IPC_SET: {
struct shmid_ds *shmid;
shmid = (struct shmid_ds *)buf;
error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
if (error)
goto done2;
shmseg->u.shm_perm.uid = shmid->shm_perm.uid;
shmseg->u.shm_perm.gid = shmid->shm_perm.gid;
shmseg->u.shm_perm.mode =
(shmseg->u.shm_perm.mode & ~ACCESSPERMS) |
(shmid->shm_perm.mode & ACCESSPERMS);
shmseg->u.shm_ctime = time_second;
break;
}
case IPC_RMID:
error = ipcperm(td, &shmseg->u.shm_perm, IPC_M);
if (error)
goto done2;
shmseg->u.shm_perm.key = IPC_PRIVATE;
shmseg->u.shm_perm.mode |= SHMSEG_REMOVED;
if (shmseg->u.shm_nattch <= 0) {
shm_deallocate_segment(shmseg);
shm_last_free = IPCID_TO_IX(shmid);
}
break;
#if 0
case SHM_LOCK:
case SHM_UNLOCK:
#endif
default:
error = EINVAL;
break;
}
done2:
mtx_unlock(&Giant);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct shmctl_args {
int shmid;
int cmd;
struct shmid_ds *buf;
};
#endif
int
-shmctl(td, uap)
+sys_shmctl(td, uap)
struct thread *td;
struct shmctl_args *uap;
{
int error = 0;
struct shmid_ds buf;
size_t bufsz;
/*
 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support
 * Linux binaries.  If we see the call come through the FreeBSD ABI,
 * return an error back to the user since we do not support this.
*/
if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
uap->cmd == SHM_STAT)
return (EINVAL);
/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &buf, sizeof(struct shmid_ds))))
goto done;
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_STAT:
error = copyout(&buf, uap->buf, bufsz);
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
static int
shmget_existing(td, uap, mode, segnum)
struct thread *td;
struct shmget_args *uap;
int mode;
int segnum;
{
struct shmid_kernel *shmseg;
int error;
shmseg = &shmsegs[segnum];
if (shmseg->u.shm_perm.mode & SHMSEG_REMOVED) {
/*
* This segment is in the process of being allocated. Wait
* until it's done, and look the key up again (in case the
* allocation failed or it was freed).
*/
shmseg->u.shm_perm.mode |= SHMSEG_WANTED;
error = tsleep(shmseg, PLOCK | PCATCH, "shmget", 0);
if (error)
return (error);
return (EAGAIN);
}
if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
return (EEXIST);
#ifdef MAC
error = mac_sysvshm_check_shmget(td->td_ucred, shmseg, uap->shmflg);
if (error != 0)
return (error);
#endif
if (uap->size != 0 && uap->size > shmseg->u.shm_segsz)
return (EINVAL);
td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
return (0);
}
static int
shmget_allocate_segment(td, uap, mode)
struct thread *td;
struct shmget_args *uap;
int mode;
{
int i, segnum, shmid;
size_t size;
struct ucred *cred = td->td_ucred;
struct shmid_kernel *shmseg;
vm_object_t shm_object;
GIANT_REQUIRED;
if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
return (EINVAL);
if (shm_nused >= shminfo.shmmni) /* Any shmids left? */
return (ENOSPC);
size = round_page(uap->size);
if (shm_committed + btoc(size) > shminfo.shmall)
return (ENOMEM);
if (shm_last_free < 0) {
shmrealloc(); /* Maybe expand the shmsegs[] array. */
for (i = 0; i < shmalloced; i++)
if (shmsegs[i].u.shm_perm.mode & SHMSEG_FREE)
break;
if (i == shmalloced)
return (ENOSPC);
segnum = i;
} else {
segnum = shm_last_free;
shm_last_free = -1;
}
shmseg = &shmsegs[segnum];
#ifdef RACCT
PROC_LOCK(td->td_proc);
if (racct_add(td->td_proc, RACCT_NSHM, 1)) {
PROC_UNLOCK(td->td_proc);
return (ENOSPC);
}
if (racct_add(td->td_proc, RACCT_SHMSIZE, size)) {
racct_sub(td->td_proc, RACCT_NSHM, 1);
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
PROC_UNLOCK(td->td_proc);
#endif
/*
* In case we sleep in malloc(), mark the segment present but deleted
 * so that no one else tries to create the same key.
*/
shmseg->u.shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
shmseg->u.shm_perm.key = uap->key;
shmseg->u.shm_perm.seq = (shmseg->u.shm_perm.seq + 1) & 0x7fff;
shmid = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
/*
* We make sure that we have allocated a pager before we need
* to.
*/
shm_object = vm_pager_allocate(shm_use_phys ? OBJT_PHYS : OBJT_SWAP,
0, size, VM_PROT_DEFAULT, 0, cred);
if (shm_object == NULL) {
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_NSHM, 1);
racct_sub(td->td_proc, RACCT_SHMSIZE, size);
PROC_UNLOCK(td->td_proc);
#endif
return (ENOMEM);
}
VM_OBJECT_LOCK(shm_object);
vm_object_clear_flag(shm_object, OBJ_ONEMAPPING);
vm_object_set_flag(shm_object, OBJ_NOSPLIT);
VM_OBJECT_UNLOCK(shm_object);
shmseg->object = shm_object;
shmseg->u.shm_perm.cuid = shmseg->u.shm_perm.uid = cred->cr_uid;
shmseg->u.shm_perm.cgid = shmseg->u.shm_perm.gid = cred->cr_gid;
shmseg->u.shm_perm.mode = (shmseg->u.shm_perm.mode & SHMSEG_WANTED) |
(mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
shmseg->cred = crhold(cred);
shmseg->u.shm_segsz = uap->size;
shmseg->u.shm_cpid = td->td_proc->p_pid;
shmseg->u.shm_lpid = shmseg->u.shm_nattch = 0;
shmseg->u.shm_atime = shmseg->u.shm_dtime = 0;
#ifdef MAC
mac_sysvshm_create(cred, shmseg);
#endif
shmseg->u.shm_ctime = time_second;
shm_committed += btoc(size);
shm_nused++;
if (shmseg->u.shm_perm.mode & SHMSEG_WANTED) {
/*
* Somebody else wanted this key while we were asleep. Wake
* them up now.
*/
shmseg->u.shm_perm.mode &= ~SHMSEG_WANTED;
wakeup(shmseg);
}
td->td_retval[0] = shmid;
return (0);
}
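/*
 * Note on the shmid returned above (illustrative): IXSEQ_TO_IPCID() from
 * sys/ipc.h packs the slot index and the per-slot sequence number into a
 * single integer, roughly "(seq << 16) | index", and IPCID_TO_IX() /
 * IPCID_TO_SEQ() split it back apart.  Bumping the sequence with
 * "(seq + 1) & 0x7fff" every time a slot is reused means a stale id that
 * refers to an earlier segment in the same slot fails the seq comparison
 * in shm_find_segment_by_shmid() instead of silently matching.
 */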
#ifndef _SYS_SYSPROTO_H_
struct shmget_args {
key_t key;
size_t size;
int shmflg;
};
#endif
int
-shmget(td, uap)
+sys_shmget(td, uap)
struct thread *td;
struct shmget_args *uap;
{
int segnum, mode;
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
mode = uap->shmflg & ACCESSPERMS;
if (uap->key != IPC_PRIVATE) {
again:
segnum = shm_find_segment_by_key(uap->key);
if (segnum >= 0) {
error = shmget_existing(td, uap, mode, segnum);
if (error == EAGAIN)
goto again;
goto done2;
}
if ((uap->shmflg & IPC_CREAT) == 0) {
error = ENOENT;
goto done2;
}
}
error = shmget_allocate_segment(td, uap, mode);
done2:
mtx_unlock(&Giant);
return (error);
}
static void
shmfork_myhook(p1, p2)
struct proc *p1, *p2;
{
struct shmmap_state *shmmap_s;
size_t size;
int i;
mtx_lock(&Giant);
size = shminfo.shmseg * sizeof(struct shmmap_state);
shmmap_s = malloc(size, M_SHM, M_WAITOK);
bcopy(p1->p_vmspace->vm_shm, shmmap_s, size);
p2->p_vmspace->vm_shm = shmmap_s;
for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
if (shmmap_s->shmid != -1)
shmsegs[IPCID_TO_IX(shmmap_s->shmid)].u.shm_nattch++;
mtx_unlock(&Giant);
}
static void
shmexit_myhook(struct vmspace *vm)
{
struct shmmap_state *base, *shm;
int i;
if ((base = vm->vm_shm) != NULL) {
vm->vm_shm = NULL;
mtx_lock(&Giant);
for (i = 0, shm = base; i < shminfo.shmseg; i++, shm++) {
if (shm->shmid != -1)
shm_delete_mapping(vm, shm);
}
mtx_unlock(&Giant);
free(base, M_SHM);
}
}
static void
shmrealloc(void)
{
int i;
struct shmid_kernel *newsegs;
if (shmalloced >= shminfo.shmmni)
return;
newsegs = malloc(shminfo.shmmni * sizeof(*newsegs), M_SHM, M_WAITOK);
if (newsegs == NULL)
return;
for (i = 0; i < shmalloced; i++)
bcopy(&shmsegs[i], &newsegs[i], sizeof(newsegs[0]));
for (; i < shminfo.shmmni; i++) {
		newsegs[i].u.shm_perm.mode = SHMSEG_FREE;
		newsegs[i].u.shm_perm.seq = 0;
#ifdef MAC
		mac_sysvshm_init(&newsegs[i]);
#endif
}
free(shmsegs, M_SHM);
shmsegs = newsegs;
shmalloced = shminfo.shmmni;
}
static struct syscall_helper_data shm_syscalls[] = {
SYSCALL_INIT_HELPER(shmat),
SYSCALL_INIT_HELPER(shmctl),
SYSCALL_INIT_HELPER(shmdt),
SYSCALL_INIT_HELPER(shmget),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
- SYSCALL_INIT_HELPER(freebsd7_shmctl),
+ SYSCALL_INIT_HELPER_COMPAT(freebsd7_shmctl),
#endif
#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
SYSCALL_INIT_HELPER(shmsys),
#endif
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_ipc.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data shm32_syscalls[] = {
- SYSCALL32_INIT_HELPER(shmat),
- SYSCALL32_INIT_HELPER(shmdt),
- SYSCALL32_INIT_HELPER(shmget),
+ SYSCALL32_INIT_HELPER_COMPAT(shmat),
+ SYSCALL32_INIT_HELPER_COMPAT(shmdt),
+ SYSCALL32_INIT_HELPER_COMPAT(shmget),
SYSCALL32_INIT_HELPER(freebsd32_shmsys),
SYSCALL32_INIT_HELPER(freebsd32_shmctl),
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
SYSCALL32_INIT_HELPER(freebsd7_freebsd32_shmctl),
#endif
SYSCALL_INIT_LAST
};
#endif
static int
shminit()
{
int i, error;
#ifndef BURN_BRIDGES
if (TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall) != 0)
printf("kern.ipc.shmmaxpgs is now called kern.ipc.shmall!\n");
#endif
TUNABLE_ULONG_FETCH("kern.ipc.shmall", &shminfo.shmall);
/* Initialize shmmax dealing with possible overflow. */
for (i = PAGE_SIZE; i > 0; i--) {
shminfo.shmmax = shminfo.shmall * i;
if (shminfo.shmmax >= shminfo.shmall)
break;
}
TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
shmalloced = shminfo.shmmni;
shmsegs = malloc(shmalloced * sizeof(shmsegs[0]), M_SHM, M_WAITOK);
for (i = 0; i < shmalloced; i++) {
shmsegs[i].u.shm_perm.mode = SHMSEG_FREE;
shmsegs[i].u.shm_perm.seq = 0;
#ifdef MAC
mac_sysvshm_init(&shmsegs[i]);
#endif
}
shm_last_free = 0;
shm_nused = 0;
shm_committed = 0;
shmexit_hook = &shmexit_myhook;
shmfork_hook = &shmfork_myhook;
error = syscall_helper_register(shm_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(shm32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
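/*
 * Worked example of the shmmax initialization above (illustrative): the
 * loop tries shmmax = shmall * i starting with i = PAGE_SIZE.  With the
 * default shmall of 131072 pages and 4 KB pages the first iteration gives
 * 131072 * 4096 = 536870912 (512 MB) with no overflow, so the loop stops
 * immediately; only if the product wrapped around below shmall would a
 * smaller multiplier be tried.
 */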
static int
shmunload()
{
int i;
if (shm_nused > 0)
return (EBUSY);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(shm32_syscalls);
#endif
syscall_helper_unregister(shm_syscalls);
for (i = 0; i < shmalloced; i++) {
#ifdef MAC
mac_sysvshm_destroy(&shmsegs[i]);
#endif
/*
 * Objects might still be mapped into the processes'
 * address spaces.  The actual free happens when the
 * last mapping is destroyed.
*/
if (shmsegs[i].u.shm_perm.mode != SHMSEG_FREE)
vm_object_deallocate(shmsegs[i].object);
}
free(shmsegs, M_SHM);
shmexit_hook = NULL;
shmfork_hook = NULL;
return (0);
}
static int
sysctl_shmsegs(SYSCTL_HANDLER_ARGS)
{
return (SYSCTL_OUT(req, shmsegs, shmalloced * sizeof(shmsegs[0])));
}
#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
struct oshmid_ds {
struct ipc_perm_old shm_perm; /* operation perms */
int shm_segsz; /* size of segment (bytes) */
u_short shm_cpid; /* pid, creator */
u_short shm_lpid; /* pid, last operation */
short shm_nattch; /* no. of current attaches */
time_t shm_atime; /* last attach time */
time_t shm_dtime; /* last detach time */
time_t shm_ctime; /* last change time */
void *shm_handle; /* internal handle for shm segment */
};
struct oshmctl_args {
int shmid;
int cmd;
struct oshmid_ds *ubuf;
};
static int
oshmctl(struct thread *td, struct oshmctl_args *uap)
{
#ifdef COMPAT_43
int error = 0;
struct shmid_kernel *shmseg;
struct oshmid_ds outbuf;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
mtx_lock(&Giant);
shmseg = shm_find_segment_by_shmid(uap->shmid);
if (shmseg == NULL) {
error = EINVAL;
goto done2;
}
switch (uap->cmd) {
case IPC_STAT:
error = ipcperm(td, &shmseg->u.shm_perm, IPC_R);
if (error)
goto done2;
#ifdef MAC
error = mac_sysvshm_check_shmctl(td->td_ucred, shmseg, uap->cmd);
if (error != 0)
goto done2;
#endif
ipcperm_new2old(&shmseg->u.shm_perm, &outbuf.shm_perm);
outbuf.shm_segsz = shmseg->u.shm_segsz;
outbuf.shm_cpid = shmseg->u.shm_cpid;
outbuf.shm_lpid = shmseg->u.shm_lpid;
outbuf.shm_nattch = shmseg->u.shm_nattch;
outbuf.shm_atime = shmseg->u.shm_atime;
outbuf.shm_dtime = shmseg->u.shm_dtime;
outbuf.shm_ctime = shmseg->u.shm_ctime;
outbuf.shm_handle = shmseg->object;
error = copyout(&outbuf, uap->ubuf, sizeof(outbuf));
if (error)
goto done2;
break;
default:
error = freebsd7_shmctl(td, (struct freebsd7_shmctl_args *)uap);
break;
}
done2:
mtx_unlock(&Giant);
return (error);
#else
return (EINVAL);
#endif
}
/* XXX casting to (sy_call_t *) is bogus, as usual. */
static sy_call_t *shmcalls[] = {
- (sy_call_t *)shmat, (sy_call_t *)oshmctl,
- (sy_call_t *)shmdt, (sy_call_t *)shmget,
+ (sy_call_t *)sys_shmat, (sy_call_t *)oshmctl,
+ (sy_call_t *)sys_shmdt, (sy_call_t *)sys_shmget,
(sy_call_t *)freebsd7_shmctl
};
int
-shmsys(td, uap)
+sys_shmsys(td, uap)
struct thread *td;
/* XXX actually varargs. */
struct shmsys_args /* {
int which;
int a2;
int a3;
int a4;
} */ *uap;
{
int error;
if (!prison_allow(td->td_ucred, PR_ALLOW_SYSVIPC))
return (ENOSYS);
if (uap->which < 0 ||
uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
return (EINVAL);
mtx_lock(&Giant);
error = (*shmcalls[uap->which])(td, &uap->a2);
mtx_unlock(&Giant);
return (error);
}
#endif /* i386 && (COMPAT_FREEBSD4 || COMPAT_43) */
#ifdef COMPAT_FREEBSD32
int
freebsd32_shmsys(struct thread *td, struct freebsd32_shmsys_args *uap)
{
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
switch (uap->which) {
case 0: { /* shmat */
struct shmat_args ap;
ap.shmid = uap->a2;
ap.shmaddr = PTRIN(uap->a3);
ap.shmflg = uap->a4;
return (sysent[SYS_shmat].sy_call(td, &ap));
}
case 2: { /* shmdt */
struct shmdt_args ap;
ap.shmaddr = PTRIN(uap->a2);
return (sysent[SYS_shmdt].sy_call(td, &ap));
}
case 3: { /* shmget */
struct shmget_args ap;
ap.key = uap->a2;
ap.size = uap->a3;
ap.shmflg = uap->a4;
return (sysent[SYS_shmget].sy_call(td, &ap));
}
case 4: { /* shmctl */
struct freebsd7_freebsd32_shmctl_args ap;
ap.shmid = uap->a2;
ap.cmd = uap->a3;
ap.buf = PTRIN(uap->a4);
return (freebsd7_freebsd32_shmctl(td, &ap));
}
case 1: /* oshmctl */
default:
return (EINVAL);
}
#else
return (nosys(td, NULL));
#endif
}
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
int
freebsd7_freebsd32_shmctl(struct thread *td,
struct freebsd7_freebsd32_shmctl_args *uap)
{
int error = 0;
union {
struct shmid_ds shmid_ds;
struct shm_info shm_info;
struct shminfo shminfo;
} u;
union {
struct shmid_ds32_old shmid_ds32;
struct shm_info32 shm_info32;
struct shminfo32 shminfo32;
} u32;
size_t sz;
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &u32.shmid_ds32,
sizeof(u32.shmid_ds32))))
goto done;
freebsd32_ipcperm_old_in(&u32.shmid_ds32.shm_perm,
&u.shmid_ds.shm_perm);
CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_INFO:
CP(u.shminfo, u32.shminfo32, shmmax);
CP(u.shminfo, u32.shminfo32, shmmin);
CP(u.shminfo, u32.shminfo32, shmmni);
CP(u.shminfo, u32.shminfo32, shmseg);
CP(u.shminfo, u32.shminfo32, shmall);
error = copyout(&u32.shminfo32, uap->buf,
sizeof(u32.shminfo32));
break;
case SHM_INFO:
CP(u.shm_info, u32.shm_info32, used_ids);
CP(u.shm_info, u32.shm_info32, shm_rss);
CP(u.shm_info, u32.shm_info32, shm_tot);
CP(u.shm_info, u32.shm_info32, shm_swp);
CP(u.shm_info, u32.shm_info32, swap_attempts);
CP(u.shm_info, u32.shm_info32, swap_successes);
error = copyout(&u32.shm_info32, uap->buf,
sizeof(u32.shm_info32));
break;
case SHM_STAT:
case IPC_STAT:
freebsd32_ipcperm_old_out(&u.shmid_ds.shm_perm,
&u32.shmid_ds32.shm_perm);
if (u.shmid_ds.shm_segsz > INT32_MAX)
u32.shmid_ds32.shm_segsz = INT32_MAX;
else
CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
u32.shmid_ds32.shm_internal = 0;
error = copyout(&u32.shmid_ds32, uap->buf,
sizeof(u32.shmid_ds32));
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
#endif
int
freebsd32_shmctl(struct thread *td, struct freebsd32_shmctl_args *uap)
{
int error = 0;
union {
struct shmid_ds shmid_ds;
struct shm_info shm_info;
struct shminfo shminfo;
} u;
union {
struct shmid_ds32 shmid_ds32;
struct shm_info32 shm_info32;
struct shminfo32 shminfo32;
} u32;
size_t sz;
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &u32.shmid_ds32,
sizeof(u32.shmid_ds32))))
goto done;
freebsd32_ipcperm_in(&u32.shmid_ds32.shm_perm,
&u.shmid_ds.shm_perm);
CP(u32.shmid_ds32, u.shmid_ds, shm_segsz);
CP(u32.shmid_ds32, u.shmid_ds, shm_lpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_cpid);
CP(u32.shmid_ds32, u.shmid_ds, shm_nattch);
CP(u32.shmid_ds32, u.shmid_ds, shm_atime);
CP(u32.shmid_ds32, u.shmid_ds, shm_dtime);
CP(u32.shmid_ds32, u.shmid_ds, shm_ctime);
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&u, &sz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_INFO:
CP(u.shminfo, u32.shminfo32, shmmax);
CP(u.shminfo, u32.shminfo32, shmmin);
CP(u.shminfo, u32.shminfo32, shmmni);
CP(u.shminfo, u32.shminfo32, shmseg);
CP(u.shminfo, u32.shminfo32, shmall);
error = copyout(&u32.shminfo32, uap->buf,
sizeof(u32.shminfo32));
break;
case SHM_INFO:
CP(u.shm_info, u32.shm_info32, used_ids);
CP(u.shm_info, u32.shm_info32, shm_rss);
CP(u.shm_info, u32.shm_info32, shm_tot);
CP(u.shm_info, u32.shm_info32, shm_swp);
CP(u.shm_info, u32.shm_info32, swap_attempts);
CP(u.shm_info, u32.shm_info32, swap_successes);
error = copyout(&u32.shm_info32, uap->buf,
sizeof(u32.shm_info32));
break;
case SHM_STAT:
case IPC_STAT:
freebsd32_ipcperm_out(&u.shmid_ds.shm_perm,
&u32.shmid_ds32.shm_perm);
if (u.shmid_ds.shm_segsz > INT32_MAX)
u32.shmid_ds32.shm_segsz = INT32_MAX;
else
CP(u.shmid_ds, u32.shmid_ds32, shm_segsz);
CP(u.shmid_ds, u32.shmid_ds32, shm_lpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_cpid);
CP(u.shmid_ds, u32.shmid_ds32, shm_nattch);
CP(u.shmid_ds, u32.shmid_ds32, shm_atime);
CP(u.shmid_ds, u32.shmid_ds32, shm_dtime);
CP(u.shmid_ds, u32.shmid_ds32, shm_ctime);
error = copyout(&u32.shmid_ds32, uap->buf,
sizeof(u32.shmid_ds32));
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
#endif
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
#ifndef CP
#define CP(src, dst, fld) do { (dst).fld = (src).fld; } while (0)
#endif
#ifndef _SYS_SYSPROTO_H_
struct freebsd7_shmctl_args {
int shmid;
int cmd;
struct shmid_ds_old *buf;
};
#endif
int
freebsd7_shmctl(td, uap)
struct thread *td;
struct freebsd7_shmctl_args *uap;
{
int error = 0;
struct shmid_ds_old old;
struct shmid_ds buf;
size_t bufsz;
/*
 * The only reason IPC_INFO, SHM_INFO, SHM_STAT exist is to support
 * Linux binaries.  If we see the call come through the FreeBSD ABI,
 * return an error back to the user since we do not support this.
*/
if (uap->cmd == IPC_INFO || uap->cmd == SHM_INFO ||
uap->cmd == SHM_STAT)
return (EINVAL);
/* IPC_SET needs to copyin the buffer before calling kern_shmctl */
if (uap->cmd == IPC_SET) {
if ((error = copyin(uap->buf, &old, sizeof(old))))
goto done;
ipcperm_old2new(&old.shm_perm, &buf.shm_perm);
CP(old, buf, shm_segsz);
CP(old, buf, shm_lpid);
CP(old, buf, shm_cpid);
CP(old, buf, shm_nattch);
CP(old, buf, shm_atime);
CP(old, buf, shm_dtime);
CP(old, buf, shm_ctime);
}
error = kern_shmctl(td, uap->shmid, uap->cmd, (void *)&buf, &bufsz);
if (error)
goto done;
/* Cases in which we need to copyout */
switch (uap->cmd) {
case IPC_STAT:
ipcperm_new2old(&buf.shm_perm, &old.shm_perm);
if (buf.shm_segsz > INT_MAX)
old.shm_segsz = INT_MAX;
else
CP(buf, old, shm_segsz);
CP(buf, old, shm_lpid);
CP(buf, old, shm_cpid);
if (buf.shm_nattch > SHRT_MAX)
old.shm_nattch = SHRT_MAX;
else
CP(buf, old, shm_nattch);
CP(buf, old, shm_atime);
CP(buf, old, shm_dtime);
CP(buf, old, shm_ctime);
old.shm_internal = NULL;
error = copyout(&old, uap->buf, sizeof(old));
break;
}
done:
if (error) {
/* Invalidate the return value */
td->td_retval[0] = -1;
}
return (error);
}
#endif /* COMPAT_FREEBSD4 || COMPAT_FREEBSD5 || COMPAT_FREEBSD6 ||
COMPAT_FREEBSD7 */
static int
sysvshm_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = shminit();
if (error != 0)
shmunload();
break;
case MOD_UNLOAD:
error = shmunload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sysvshm_mod = {
"sysvshm",
&sysvshm_modload,
NULL
};
DECLARE_MODULE(sysvshm, sysvshm_mod, SI_SUB_SYSV_SHM, SI_ORDER_FIRST);
MODULE_VERSION(sysvshm, 1);
Index: head/sys/kern/tty.c
===================================================================
--- head/sys/kern/tty.c (revision 225616)
+++ head/sys/kern/tty.c (revision 225617)
@@ -1,2200 +1,2200 @@
/*-
* Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
* All rights reserved.
*
* Portions of this software were developed under sponsorship from Snow
* B.V., the Netherlands.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/conf.h>
#include <sys/cons.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#ifdef COMPAT_43TTY
#include <sys/ioctl_compat.h>
#endif /* COMPAT_43TTY */
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/serial.h>
#include <sys/signal.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#define TTYDEFCHARS
#include <sys/ttydefaults.h>
#undef TTYDEFCHARS
#include <sys/ucred.h>
#include <sys/vnode.h>
#include <machine/stdarg.h>
static MALLOC_DEFINE(M_TTY, "tty", "tty device");
static void tty_rel_free(struct tty *tp);
static TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list);
static struct sx tty_list_sx;
SX_SYSINIT(tty_list, &tty_list_sx, "tty list");
static unsigned int tty_list_count = 0;
/* Character device of /dev/console. */
static struct cdev *dev_console;
static const char *dev_console_filename;
/*
* Flags that are supported and stored by this implementation.
*/
#define TTYSUP_IFLAG (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK|ISTRIP|\
INLCR|IGNCR|ICRNL|IXON|IXOFF|IXANY|IMAXBEL)
#define TTYSUP_OFLAG (OPOST|ONLCR|TAB3|ONOEOT|OCRNL|ONOCR|ONLRET)
#define TTYSUP_LFLAG (ECHOKE|ECHOE|ECHOK|ECHO|ECHONL|ECHOPRT|\
ECHOCTL|ISIG|ICANON|ALTWERASE|IEXTEN|TOSTOP|\
FLUSHO|NOKERNINFO|NOFLSH)
#define TTYSUP_CFLAG (CIGNORE|CSIZE|CSTOPB|CREAD|PARENB|PARODD|\
HUPCL|CLOCAL|CCTS_OFLOW|CRTS_IFLOW|CDTR_IFLOW|\
CDSR_OFLOW|CCAR_OFLOW)
#define TTY_CALLOUT(tp,d) (dev2unit(d) & TTYUNIT_CALLOUT)
/*
* Set TTY buffer sizes.
*/
#define TTYBUF_MAX 65536
static void
tty_watermarks(struct tty *tp)
{
size_t bs = 0;
/* Provide an input buffer for 0.2 seconds of data. */
if (tp->t_termios.c_cflag & CREAD)
bs = MIN(tp->t_termios.c_ispeed / 5, TTYBUF_MAX);
ttyinq_setsize(&tp->t_inq, tp, bs);
/* Set low watermark at 10% (when 90% is available). */
tp->t_inlow = (ttyinq_getallocatedsize(&tp->t_inq) * 9) / 10;
	/* Provide an output buffer for 0.2 seconds of data. */
bs = MIN(tp->t_termios.c_ospeed / 5, TTYBUF_MAX);
ttyoutq_setsize(&tp->t_outq, tp, bs);
/* Set low watermark at 10% (when 90% is available). */
tp->t_outlow = (ttyoutq_getallocatedsize(&tp->t_outq) * 9) / 10;
}
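/*
 * Worked example of the sizing above (illustrative): with c_ispeed set to
 * 115200 the input request is MIN(115200 / 5, TTYBUF_MAX) = 23040 bytes.
 * ttyinq_setsize() rounds that up to whole queue blocks, so the low
 * watermark is computed from ttyinq_getallocatedsize() rather than from
 * the requested size and ends up at 9/10 of the actual allocation,
 * matching the "low watermark at 10%" comments above.
 */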
static int
tty_drain(struct tty *tp)
{
int error;
if (ttyhook_hashook(tp, getc_inject))
/* buffer is inaccessible */
return (0);
while (ttyoutq_bytesused(&tp->t_outq) > 0) {
ttydevsw_outwakeup(tp);
/* Could be handled synchronously. */
if (ttyoutq_bytesused(&tp->t_outq) == 0)
return (0);
/* Wait for data to be drained. */
error = tty_wait(tp, &tp->t_outwait);
if (error)
return (error);
}
return (0);
}
/*
* Though ttydev_enter() and ttydev_leave() seem to be related, they
* don't have to be used together. ttydev_enter() is used by the cdev
* operations to prevent an actual operation from being processed when
* the TTY has been abandoned. ttydev_leave() is used by ttydev_open()
* and ttydev_close() to determine whether per-TTY data should be
* deallocated.
*/
static __inline int
ttydev_enter(struct tty *tp)
{
tty_lock(tp);
if (tty_gone(tp) || !tty_opened(tp)) {
/* Device is already gone. */
tty_unlock(tp);
return (ENXIO);
}
return (0);
}
static void
ttydev_leave(struct tty *tp)
{
tty_lock_assert(tp, MA_OWNED);
if (tty_opened(tp) || tp->t_flags & TF_OPENCLOSE) {
/* Device is still opened somewhere. */
tty_unlock(tp);
return;
}
tp->t_flags |= TF_OPENCLOSE;
/* Stop asynchronous I/O. */
funsetown(&tp->t_sigio);
/* Remove console TTY. */
if (constty == tp)
constty_clear();
/* Drain any output. */
MPASS((tp->t_flags & TF_STOPPED) == 0);
if (!tty_gone(tp))
tty_drain(tp);
ttydisc_close(tp);
/* Destroy associated buffers already. */
ttyinq_free(&tp->t_inq);
tp->t_inlow = 0;
ttyoutq_free(&tp->t_outq);
tp->t_outlow = 0;
knlist_clear(&tp->t_inpoll.si_note, 1);
knlist_clear(&tp->t_outpoll.si_note, 1);
if (!tty_gone(tp))
ttydevsw_close(tp);
tp->t_flags &= ~TF_OPENCLOSE;
cv_broadcast(&tp->t_dcdwait);
tty_rel_free(tp);
}
/*
* Operations that are exposed through the character device in /dev.
*/
static int
ttydev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error = 0;
tty_lock(tp);
if (tty_gone(tp)) {
/* Device is already gone. */
tty_unlock(tp);
return (ENXIO);
}
/*
* Block when other processes are currently opening or closing
* the TTY.
*/
while (tp->t_flags & TF_OPENCLOSE) {
error = tty_wait(tp, &tp->t_dcdwait);
if (error != 0) {
tty_unlock(tp);
return (error);
}
}
tp->t_flags |= TF_OPENCLOSE;
/*
* Make sure the "tty" and "cua" device cannot be opened at the
* same time.
*/
if (TTY_CALLOUT(tp, dev)) {
if (tp->t_flags & TF_OPENED_IN) {
error = EBUSY;
goto done;
}
} else {
if (tp->t_flags & TF_OPENED_OUT) {
error = EBUSY;
goto done;
}
}
if (tp->t_flags & TF_EXCLUDE && priv_check(td, PRIV_TTY_EXCLUSIVE)) {
error = EBUSY;
goto done;
}
if (!tty_opened(tp)) {
/* Set proper termios flags. */
if (TTY_CALLOUT(tp, dev))
tp->t_termios = tp->t_termios_init_out;
else
tp->t_termios = tp->t_termios_init_in;
ttydevsw_param(tp, &tp->t_termios);
/* Prevent modem control on callout devices and /dev/console. */
if (TTY_CALLOUT(tp, dev) || dev == dev_console)
tp->t_termios.c_cflag |= CLOCAL;
ttydevsw_modem(tp, SER_DTR|SER_RTS, 0);
error = ttydevsw_open(tp);
if (error != 0)
goto done;
ttydisc_open(tp);
tty_watermarks(tp);
}
/* Wait for Carrier Detect. */
if ((oflags & O_NONBLOCK) == 0 &&
(tp->t_termios.c_cflag & CLOCAL) == 0) {
while ((ttydevsw_modem(tp, 0, 0) & SER_DCD) == 0) {
error = tty_wait(tp, &tp->t_dcdwait);
if (error != 0)
goto done;
}
}
if (dev == dev_console)
tp->t_flags |= TF_OPENED_CONS;
else if (TTY_CALLOUT(tp, dev))
tp->t_flags |= TF_OPENED_OUT;
else
tp->t_flags |= TF_OPENED_IN;
done: tp->t_flags &= ~TF_OPENCLOSE;
cv_broadcast(&tp->t_dcdwait);
ttydev_leave(tp);
return (error);
}
static int
ttydev_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
struct tty *tp = dev->si_drv1;
tty_lock(tp);
/*
* Don't actually close the device if it is being used as the
* console.
*/
MPASS((tp->t_flags & TF_OPENED) != TF_OPENED);
if (dev == dev_console)
tp->t_flags &= ~TF_OPENED_CONS;
else
tp->t_flags &= ~(TF_OPENED_IN|TF_OPENED_OUT);
if (tp->t_flags & TF_OPENED) {
tty_unlock(tp);
return (0);
}
/*
* This can only be called once. The callin and the callout
* devices cannot be opened at the same time.
*/
tp->t_flags &= ~(TF_EXCLUDE|TF_STOPPED);
/* Properly wake up threads that are stuck - revoke(). */
tp->t_revokecnt++;
tty_wakeup(tp, FREAD|FWRITE);
cv_broadcast(&tp->t_bgwait);
cv_broadcast(&tp->t_dcdwait);
ttydev_leave(tp);
return (0);
}
static __inline int
tty_is_ctty(struct tty *tp, struct proc *p)
{
tty_lock_assert(tp, MA_OWNED);
return (p->p_session == tp->t_session && p->p_flag & P_CONTROLT);
}
static int
tty_wait_background(struct tty *tp, struct thread *td, int sig)
{
struct proc *p = td->td_proc;
struct pgrp *pg;
ksiginfo_t ksi;
int error;
MPASS(sig == SIGTTIN || sig == SIGTTOU);
tty_lock_assert(tp, MA_OWNED);
for (;;) {
PROC_LOCK(p);
/*
 * The process should only sleep when:
 * - This terminal is the controlling terminal
 * - Its process group is not the foreground process
 *   group
 * - The parent process isn't waiting for the child to
 *   exit
 * - The signal to send to the process isn't masked
*/
if (!tty_is_ctty(tp, p) || p->p_pgrp == tp->t_pgrp) {
/* Allow the action to happen. */
PROC_UNLOCK(p);
return (0);
}
if (SIGISMEMBER(p->p_sigacts->ps_sigignore, sig) ||
SIGISMEMBER(td->td_sigmask, sig)) {
/* Only allow them in write()/ioctl(). */
PROC_UNLOCK(p);
return (sig == SIGTTOU ? 0 : EIO);
}
pg = p->p_pgrp;
if (p->p_flag & P_PPWAIT || pg->pg_jobc == 0) {
/* Don't allow the action to happen. */
PROC_UNLOCK(p);
return (EIO);
}
PROC_UNLOCK(p);
/*
* Send the signal and sleep until we're the new
* foreground process group.
*/
if (sig != 0) {
ksiginfo_init(&ksi);
ksi.ksi_code = SI_KERNEL;
ksi.ksi_signo = sig;
sig = 0;
}
PGRP_LOCK(pg);
pgsignal(pg, ksi.ksi_signo, 1, &ksi);
PGRP_UNLOCK(pg);
error = tty_wait(tp, &tp->t_bgwait);
if (error)
return (error);
}
}
static int
ttydev_read(struct cdev *dev, struct uio *uio, int ioflag)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
goto done;
error = tty_wait_background(tp, curthread, SIGTTIN);
if (error) {
tty_unlock(tp);
goto done;
}
error = ttydisc_read(tp, uio, ioflag);
tty_unlock(tp);
/*
* The read() call should not throw an error when the device is
* being destroyed. Silently convert it to an EOF.
*/
done: if (error == ENXIO)
error = 0;
return (error);
}
static int
ttydev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
return (error);
if (tp->t_termios.c_lflag & TOSTOP) {
error = tty_wait_background(tp, curthread, SIGTTOU);
if (error)
goto done;
}
if (ioflag & IO_NDELAY && tp->t_flags & TF_BUSY_OUT) {
/* Allow non-blocking writes to bypass serialization. */
error = ttydisc_write(tp, uio, ioflag);
} else {
/* Serialize write() calls. */
while (tp->t_flags & TF_BUSY_OUT) {
error = tty_wait(tp, &tp->t_outserwait);
if (error)
goto done;
}
tp->t_flags |= TF_BUSY_OUT;
error = ttydisc_write(tp, uio, ioflag);
tp->t_flags &= ~TF_BUSY_OUT;
cv_signal(&tp->t_outserwait);
}
done: tty_unlock(tp);
return (error);
}
static int
ttydev_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
return (error);
switch (cmd) {
case TIOCCBRK:
case TIOCCONS:
case TIOCDRAIN:
case TIOCEXCL:
case TIOCFLUSH:
case TIOCNXCL:
case TIOCSBRK:
case TIOCSCTTY:
case TIOCSETA:
case TIOCSETAF:
case TIOCSETAW:
case TIOCSPGRP:
case TIOCSTART:
case TIOCSTAT:
case TIOCSTI:
case TIOCSTOP:
case TIOCSWINSZ:
#if 0
case TIOCSDRAINWAIT:
case TIOCSETD:
#endif
#ifdef COMPAT_43TTY
case TIOCLBIC:
case TIOCLBIS:
case TIOCLSET:
case TIOCSETC:
case OTIOCSETD:
case TIOCSETN:
case TIOCSETP:
case TIOCSLTC:
#endif /* COMPAT_43TTY */
/*
* If the ioctl() causes the TTY to be modified, let it
* wait in the background.
*/
error = tty_wait_background(tp, curthread, SIGTTOU);
if (error)
goto done;
}
if (cmd == TIOCSETA || cmd == TIOCSETAW || cmd == TIOCSETAF) {
struct termios *old = &tp->t_termios;
struct termios *new = (struct termios *)data;
struct termios *lock = TTY_CALLOUT(tp, dev) ?
&tp->t_termios_lock_out : &tp->t_termios_lock_in;
int cc;
/*
* Lock state devices. Just overwrite the values of the
* commands that are currently in use.
*/
new->c_iflag = (old->c_iflag & lock->c_iflag) |
(new->c_iflag & ~lock->c_iflag);
new->c_oflag = (old->c_oflag & lock->c_oflag) |
(new->c_oflag & ~lock->c_oflag);
new->c_cflag = (old->c_cflag & lock->c_cflag) |
(new->c_cflag & ~lock->c_cflag);
new->c_lflag = (old->c_lflag & lock->c_lflag) |
(new->c_lflag & ~lock->c_lflag);
for (cc = 0; cc < NCCS; ++cc)
if (lock->c_cc[cc])
new->c_cc[cc] = old->c_cc[cc];
if (lock->c_ispeed)
new->c_ispeed = old->c_ispeed;
if (lock->c_ospeed)
new->c_ospeed = old->c_ospeed;
}
error = tty_ioctl(tp, cmd, data, fflag, td);
done: tty_unlock(tp);
return (error);
}
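/*
 * Worked example of the lock-state merge above (illustrative): each flag
 * word becomes (old & lock) | (new & ~lock), so any bit set in the
 * lock-state device is kept from the current termios and only unlocked
 * bits are taken from the caller.  For instance, if the lock-state
 * c_cflag has CLOCAL set, a TIOCSETA that tries to clear CLOCAL still
 * leaves it set, because that bit is copied from "old" instead of "new".
 */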
static int
ttydev_poll(struct cdev *dev, int events, struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error, revents = 0;
error = ttydev_enter(tp);
if (error)
return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
if (events & (POLLIN|POLLRDNORM)) {
/* See if we can read something. */
if (ttydisc_read_poll(tp) > 0)
revents |= events & (POLLIN|POLLRDNORM);
}
if (tp->t_flags & TF_ZOMBIE) {
/* Hangup flag on zombie state. */
revents |= POLLHUP;
} else if (events & (POLLOUT|POLLWRNORM)) {
/* See if we can write something. */
if (ttydisc_write_poll(tp) > 0)
revents |= events & (POLLOUT|POLLWRNORM);
}
if (revents == 0) {
if (events & (POLLIN|POLLRDNORM))
selrecord(td, &tp->t_inpoll);
if (events & (POLLOUT|POLLWRNORM))
selrecord(td, &tp->t_outpoll);
}
tty_unlock(tp);
return (revents);
}
static int
ttydev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
{
struct tty *tp = dev->si_drv1;
int error;
/* Handle mmap() through the driver. */
error = ttydev_enter(tp);
if (error)
return (-1);
error = ttydevsw_mmap(tp, offset, paddr, nprot, memattr);
tty_unlock(tp);
return (error);
}
/*
* kqueue support.
*/
static void
tty_kqops_read_detach(struct knote *kn)
{
struct tty *tp = kn->kn_hook;
knlist_remove(&tp->t_inpoll.si_note, kn, 0);
}
static int
tty_kqops_read_event(struct knote *kn, long hint)
{
struct tty *tp = kn->kn_hook;
tty_lock_assert(tp, MA_OWNED);
if (tty_gone(tp) || tp->t_flags & TF_ZOMBIE) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_read_poll(tp);
return (kn->kn_data > 0);
}
}
static void
tty_kqops_write_detach(struct knote *kn)
{
struct tty *tp = kn->kn_hook;
knlist_remove(&tp->t_outpoll.si_note, kn, 0);
}
static int
tty_kqops_write_event(struct knote *kn, long hint)
{
struct tty *tp = kn->kn_hook;
tty_lock_assert(tp, MA_OWNED);
if (tty_gone(tp)) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_write_poll(tp);
return (kn->kn_data > 0);
}
}
static struct filterops tty_kqops_read = {
.f_isfd = 1,
.f_detach = tty_kqops_read_detach,
.f_event = tty_kqops_read_event,
};
static struct filterops tty_kqops_write = {
.f_isfd = 1,
.f_detach = tty_kqops_write_detach,
.f_event = tty_kqops_write_event,
};
static int
ttydev_kqfilter(struct cdev *dev, struct knote *kn)
{
struct tty *tp = dev->si_drv1;
int error;
error = ttydev_enter(tp);
if (error)
return (error);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_hook = tp;
kn->kn_fop = &tty_kqops_read;
knlist_add(&tp->t_inpoll.si_note, kn, 1);
break;
case EVFILT_WRITE:
kn->kn_hook = tp;
kn->kn_fop = &tty_kqops_write;
knlist_add(&tp->t_outpoll.si_note, kn, 1);
break;
default:
error = EINVAL;
break;
}
tty_unlock(tp);
return (error);
}
static struct cdevsw ttydev_cdevsw = {
.d_version = D_VERSION,
.d_open = ttydev_open,
.d_close = ttydev_close,
.d_read = ttydev_read,
.d_write = ttydev_write,
.d_ioctl = ttydev_ioctl,
.d_kqfilter = ttydev_kqfilter,
.d_poll = ttydev_poll,
.d_mmap = ttydev_mmap,
.d_name = "ttydev",
.d_flags = D_TTY,
};
/*
* Init/lock-state devices
*/
static int
ttyil_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error = 0;
tty_lock(tp);
if (tty_gone(tp))
error = ENODEV;
tty_unlock(tp);
return (error);
}
static int
ttyil_close(struct cdev *dev, int flag, int mode, struct thread *td)
{
return (0);
}
static int
ttyil_rdwr(struct cdev *dev, struct uio *uio, int ioflag)
{
return (ENODEV);
}
static int
ttyil_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
struct thread *td)
{
struct tty *tp = dev->si_drv1;
int error;
tty_lock(tp);
if (tty_gone(tp)) {
error = ENODEV;
goto done;
}
error = ttydevsw_cioctl(tp, dev2unit(dev), cmd, data, td);
if (error != ENOIOCTL)
goto done;
error = 0;
switch (cmd) {
case TIOCGETA:
/* Obtain terminal flags through tcgetattr(). */
*(struct termios*)data = *(struct termios*)dev->si_drv2;
break;
case TIOCSETA:
/* Set terminal flags through tcsetattr(). */
error = priv_check(td, PRIV_TTY_SETA);
if (error)
break;
*(struct termios*)dev->si_drv2 = *(struct termios*)data;
break;
case TIOCGETD:
*(int *)data = TTYDISC;
break;
case TIOCGWINSZ:
bzero(data, sizeof(struct winsize));
break;
default:
error = ENOTTY;
}
done: tty_unlock(tp);
return (error);
}
static struct cdevsw ttyil_cdevsw = {
.d_version = D_VERSION,
.d_open = ttyil_open,
.d_close = ttyil_close,
.d_read = ttyil_rdwr,
.d_write = ttyil_rdwr,
.d_ioctl = ttyil_ioctl,
.d_name = "ttyil",
.d_flags = D_TTY,
};
static void
tty_init_termios(struct tty *tp)
{
struct termios *t = &tp->t_termios_init_in;
t->c_cflag = TTYDEF_CFLAG;
t->c_iflag = TTYDEF_IFLAG;
t->c_lflag = TTYDEF_LFLAG;
t->c_oflag = TTYDEF_OFLAG;
t->c_ispeed = TTYDEF_SPEED;
t->c_ospeed = TTYDEF_SPEED;
memcpy(&t->c_cc, ttydefchars, sizeof ttydefchars);
tp->t_termios_init_out = *t;
}
void
tty_init_console(struct tty *tp, speed_t s)
{
struct termios *ti = &tp->t_termios_init_in;
struct termios *to = &tp->t_termios_init_out;
if (s != 0) {
ti->c_ispeed = ti->c_ospeed = s;
to->c_ispeed = to->c_ospeed = s;
}
ti->c_cflag |= CLOCAL;
to->c_cflag |= CLOCAL;
}
/*
* Standard device routine implementations, mostly meant for
* pseudo-terminal device drivers. When a driver creates a new terminal
* device class, missing routines are patched.
*/
static int
ttydevsw_defopen(struct tty *tp)
{
return (0);
}
static void
ttydevsw_defclose(struct tty *tp)
{
}
static void
ttydevsw_defoutwakeup(struct tty *tp)
{
panic("Terminal device has output, while not implemented");
}
static void
ttydevsw_definwakeup(struct tty *tp)
{
}
static int
ttydevsw_defioctl(struct tty *tp, u_long cmd, caddr_t data, struct thread *td)
{
return (ENOIOCTL);
}
static int
ttydevsw_defcioctl(struct tty *tp, int unit, u_long cmd, caddr_t data, struct thread *td)
{
return (ENOIOCTL);
}
static int
ttydevsw_defparam(struct tty *tp, struct termios *t)
{
/*
* Allow the baud rate to be adjusted for pseudo-devices, but cap
* it at 115200 to prevent excessive buffer usage. Also disallow 0,
* to prevent foot shooting.
*/
if (t->c_ispeed < B50)
t->c_ispeed = B50;
else if (t->c_ispeed > B115200)
t->c_ispeed = B115200;
if (t->c_ospeed < B50)
t->c_ospeed = B50;
else if (t->c_ospeed > B115200)
t->c_ospeed = B115200;
t->c_cflag |= CREAD;
return (0);
}
static int
ttydevsw_defmodem(struct tty *tp, int sigon, int sigoff)
{
/* Simulate a carrier to make the TTY layer happy. */
return (SER_DCD);
}
static int
ttydevsw_defmmap(struct tty *tp, vm_ooffset_t offset, vm_paddr_t *paddr,
int nprot, vm_memattr_t *memattr)
{
return (-1);
}
static void
ttydevsw_defpktnotify(struct tty *tp, char event)
{
}
static void
ttydevsw_deffree(void *softc)
{
panic("Terminal device freed without a free-handler");
}
/*
* TTY allocation and deallocation. A TTY device can be deallocated when
* the driver no longer uses it, when the TTY isn't a session's
* controlling TTY, and when the device node isn't opened through devfs.
*/
struct tty *
tty_alloc(struct ttydevsw *tsw, void *sc)
{
return (tty_alloc_mutex(tsw, sc, NULL));
}
struct tty *
tty_alloc_mutex(struct ttydevsw *tsw, void *sc, struct mtx *mutex)
{
struct tty *tp;
/* Make sure the driver defines all routines. */
#define PATCH_FUNC(x) do { \
if (tsw->tsw_ ## x == NULL) \
tsw->tsw_ ## x = ttydevsw_def ## x; \
} while (0)
PATCH_FUNC(open);
PATCH_FUNC(close);
PATCH_FUNC(outwakeup);
PATCH_FUNC(inwakeup);
PATCH_FUNC(ioctl);
PATCH_FUNC(cioctl);
PATCH_FUNC(param);
PATCH_FUNC(modem);
PATCH_FUNC(mmap);
PATCH_FUNC(pktnotify);
PATCH_FUNC(free);
#undef PATCH_FUNC
tp = malloc(sizeof(struct tty), M_TTY, M_WAITOK|M_ZERO);
tp->t_devsw = tsw;
tp->t_devswsoftc = sc;
tp->t_flags = tsw->tsw_flags;
tty_init_termios(tp);
cv_init(&tp->t_inwait, "ttyin");
cv_init(&tp->t_outwait, "ttyout");
cv_init(&tp->t_outserwait, "ttyosr");
cv_init(&tp->t_bgwait, "ttybg");
cv_init(&tp->t_dcdwait, "ttydcd");
/* Allow drivers to use a custom mutex to lock the TTY. */
if (mutex != NULL) {
tp->t_mtx = mutex;
} else {
tp->t_mtx = &tp->t_mtxobj;
mtx_init(&tp->t_mtxobj, "ttymtx", NULL, MTX_DEF);
}
knlist_init_mtx(&tp->t_inpoll.si_note, tp->t_mtx);
knlist_init_mtx(&tp->t_outpoll.si_note, tp->t_mtx);
sx_xlock(&tty_list_sx);
TAILQ_INSERT_TAIL(&tty_list, tp, t_list);
tty_list_count++;
sx_xunlock(&tty_list_sx);
return (tp);
}
static void
tty_dealloc(void *arg)
{
struct tty *tp = arg;
sx_xlock(&tty_list_sx);
TAILQ_REMOVE(&tty_list, tp, t_list);
tty_list_count--;
sx_xunlock(&tty_list_sx);
/* Make sure we haven't leaked buffers. */
MPASS(ttyinq_getsize(&tp->t_inq) == 0);
MPASS(ttyoutq_getsize(&tp->t_outq) == 0);
seldrain(&tp->t_inpoll);
seldrain(&tp->t_outpoll);
knlist_destroy(&tp->t_inpoll.si_note);
knlist_destroy(&tp->t_outpoll.si_note);
cv_destroy(&tp->t_inwait);
cv_destroy(&tp->t_outwait);
cv_destroy(&tp->t_bgwait);
cv_destroy(&tp->t_dcdwait);
cv_destroy(&tp->t_outserwait);
if (tp->t_mtx == &tp->t_mtxobj)
mtx_destroy(&tp->t_mtxobj);
ttydevsw_free(tp);
free(tp, M_TTY);
}
static void
tty_rel_free(struct tty *tp)
{
struct cdev *dev;
tty_lock_assert(tp, MA_OWNED);
#define TF_ACTIVITY (TF_GONE|TF_OPENED|TF_HOOK|TF_OPENCLOSE)
if (tp->t_sessioncnt != 0 || (tp->t_flags & TF_ACTIVITY) != TF_GONE) {
/* TTY is still in use. */
tty_unlock(tp);
return;
}
/* TTY can be deallocated. */
dev = tp->t_dev;
tp->t_dev = NULL;
tty_unlock(tp);
if (dev != NULL)
destroy_dev_sched_cb(dev, tty_dealloc, tp);
}
void
tty_rel_pgrp(struct tty *tp, struct pgrp *pg)
{
MPASS(tp->t_sessioncnt > 0);
tty_lock_assert(tp, MA_OWNED);
if (tp->t_pgrp == pg)
tp->t_pgrp = NULL;
tty_unlock(tp);
}
void
tty_rel_sess(struct tty *tp, struct session *sess)
{
MPASS(tp->t_sessioncnt > 0);
/* Current session has left. */
if (tp->t_session == sess) {
tp->t_session = NULL;
MPASS(tp->t_pgrp == NULL);
}
tp->t_sessioncnt--;
tty_rel_free(tp);
}
void
tty_rel_gone(struct tty *tp)
{
MPASS(!tty_gone(tp));
/* Simulate carrier removal. */
ttydisc_modem(tp, 0);
/* Wake up all blocked threads. */
tty_wakeup(tp, FREAD|FWRITE);
cv_broadcast(&tp->t_bgwait);
cv_broadcast(&tp->t_dcdwait);
tp->t_flags |= TF_GONE;
tty_rel_free(tp);
}
/*
* Exposing information about current TTYs through sysctl
*/
static void
tty_to_xtty(struct tty *tp, struct xtty *xt)
{
tty_lock_assert(tp, MA_OWNED);
xt->xt_size = sizeof(struct xtty);
xt->xt_insize = ttyinq_getsize(&tp->t_inq);
xt->xt_incc = ttyinq_bytescanonicalized(&tp->t_inq);
xt->xt_inlc = ttyinq_bytesline(&tp->t_inq);
xt->xt_inlow = tp->t_inlow;
xt->xt_outsize = ttyoutq_getsize(&tp->t_outq);
xt->xt_outcc = ttyoutq_bytesused(&tp->t_outq);
xt->xt_outlow = tp->t_outlow;
xt->xt_column = tp->t_column;
xt->xt_pgid = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
xt->xt_sid = tp->t_session ? tp->t_session->s_sid : 0;
xt->xt_flags = tp->t_flags;
xt->xt_dev = tp->t_dev ? dev2udev(tp->t_dev) : NODEV;
}
static int
sysctl_kern_ttys(SYSCTL_HANDLER_ARGS)
{
unsigned long lsize;
struct xtty *xtlist, *xt;
struct tty *tp;
int error;
sx_slock(&tty_list_sx);
lsize = tty_list_count * sizeof(struct xtty);
if (lsize == 0) {
sx_sunlock(&tty_list_sx);
return (0);
}
xtlist = xt = malloc(lsize, M_TTY, M_WAITOK);
TAILQ_FOREACH(tp, &tty_list, t_list) {
tty_lock(tp);
tty_to_xtty(tp, xt);
tty_unlock(tp);
xt++;
}
sx_sunlock(&tty_list_sx);
error = SYSCTL_OUT(req, xtlist, lsize);
free(xtlist, M_TTY);
return (error);
}
SYSCTL_PROC(_kern, OID_AUTO, ttys, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
0, 0, sysctl_kern_ttys, "S,xtty", "List of TTYs");
/*
* Device node creation. The device has been set up; now we can expose it
* to the user.
*/
void
tty_makedev(struct tty *tp, struct ucred *cred, const char *fmt, ...)
{
va_list ap;
struct cdev *dev;
const char *prefix = "tty";
char name[SPECNAMELEN - 3]; /* for "tty" and "cua". */
uid_t uid;
gid_t gid;
mode_t mode;
/* Remove "tty" prefix from devices like PTY's. */
if (tp->t_flags & TF_NOPREFIX)
prefix = "";
va_start(ap, fmt);
vsnrprintf(name, sizeof name, 32, fmt, ap);
va_end(ap);
if (cred == NULL) {
/* System device. */
uid = UID_ROOT;
gid = GID_WHEEL;
mode = S_IRUSR|S_IWUSR;
} else {
/* User device. */
uid = cred->cr_ruid;
gid = GID_TTY;
mode = S_IRUSR|S_IWUSR|S_IWGRP;
}
/* Master call-in device. */
dev = make_dev_cred(&ttydev_cdevsw, 0, cred,
uid, gid, mode, "%s%s", prefix, name);
dev->si_drv1 = tp;
tp->t_dev = dev;
/* Slave call-in devices. */
if (tp->t_flags & TF_INITLOCK) {
dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_INIT, cred,
uid, gid, mode, "%s%s.init", prefix, name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_init_in;
dev = make_dev_cred(&ttyil_cdevsw, TTYUNIT_LOCK, cred,
uid, gid, mode, "%s%s.lock", prefix, name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_lock_in;
}
/* Call-out devices. */
if (tp->t_flags & TF_CALLOUT) {
dev = make_dev_cred(&ttydev_cdevsw, TTYUNIT_CALLOUT, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
/* Slave call-out devices. */
if (tp->t_flags & TF_INITLOCK) {
dev = make_dev_cred(&ttyil_cdevsw,
TTYUNIT_CALLOUT | TTYUNIT_INIT, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s.init", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_init_out;
dev = make_dev_cred(&ttyil_cdevsw,
TTYUNIT_CALLOUT | TTYUNIT_LOCK, cred,
UID_UUCP, GID_DIALER, 0660, "cua%s.lock", name);
dev_depends(tp->t_dev, dev);
dev->si_drv1 = tp;
dev->si_drv2 = &tp->t_termios_lock_out;
}
}
}
/*
* Signalling processes.
*/
void
tty_signal_sessleader(struct tty *tp, int sig)
{
struct proc *p;
tty_lock_assert(tp, MA_OWNED);
MPASS(sig >= 1 && sig < NSIG);
/* Make signals start output again. */
tp->t_flags &= ~TF_STOPPED;
if (tp->t_session != NULL && tp->t_session->s_leader != NULL) {
p = tp->t_session->s_leader;
PROC_LOCK(p);
- psignal(p, sig);
+ kern_psignal(p, sig);
PROC_UNLOCK(p);
}
}
void
tty_signal_pgrp(struct tty *tp, int sig)
{
ksiginfo_t ksi;
tty_lock_assert(tp, MA_OWNED);
MPASS(sig >= 1 && sig < NSIG);
/* Make signals start output again. */
tp->t_flags &= ~TF_STOPPED;
if (sig == SIGINFO && !(tp->t_termios.c_lflag & NOKERNINFO))
tty_info(tp);
if (tp->t_pgrp != NULL) {
ksiginfo_init(&ksi);
ksi.ksi_signo = sig;
ksi.ksi_code = SI_KERNEL;
PGRP_LOCK(tp->t_pgrp);
pgsignal(tp->t_pgrp, sig, 1, &ksi);
PGRP_UNLOCK(tp->t_pgrp);
}
}
void
tty_wakeup(struct tty *tp, int flags)
{
if (tp->t_flags & TF_ASYNC && tp->t_sigio != NULL)
pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
if (flags & FWRITE) {
cv_broadcast(&tp->t_outwait);
selwakeup(&tp->t_outpoll);
KNOTE_LOCKED(&tp->t_outpoll.si_note, 0);
}
if (flags & FREAD) {
cv_broadcast(&tp->t_inwait);
selwakeup(&tp->t_inpoll);
KNOTE_LOCKED(&tp->t_inpoll.si_note, 0);
}
}
int
tty_wait(struct tty *tp, struct cv *cv)
{
int error;
int revokecnt = tp->t_revokecnt;
tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
MPASS(!tty_gone(tp));
error = cv_wait_sig(cv, tp->t_mtx);
/* Restart the system call when we may have been revoked. */
if (tp->t_revokecnt != revokecnt)
return (ERESTART);
/* Bail out when the device slipped away. */
if (tty_gone(tp))
return (ENXIO);
return (error);
}
int
tty_timedwait(struct tty *tp, struct cv *cv, int hz)
{
int error;
int revokecnt = tp->t_revokecnt;
tty_lock_assert(tp, MA_OWNED|MA_NOTRECURSED);
MPASS(!tty_gone(tp));
error = cv_timedwait_sig(cv, tp->t_mtx, hz);
/* Restart the system call when we may have been revoked. */
if (tp->t_revokecnt != revokecnt)
return (ERESTART);
/* Bail out when the device slipped away. */
if (tty_gone(tp))
return (ENXIO);
return (error);
}
void
tty_flush(struct tty *tp, int flags)
{
if (flags & FWRITE) {
tp->t_flags &= ~TF_HIWAT_OUT;
ttyoutq_flush(&tp->t_outq);
tty_wakeup(tp, FWRITE);
ttydevsw_pktnotify(tp, TIOCPKT_FLUSHWRITE);
}
if (flags & FREAD) {
tty_hiwat_in_unblock(tp);
ttyinq_flush(&tp->t_inq);
ttydevsw_inwakeup(tp);
ttydevsw_pktnotify(tp, TIOCPKT_FLUSHREAD);
}
}
static int
tty_generic_ioctl(struct tty *tp, u_long cmd, void *data, int fflag,
struct thread *td)
{
int error;
switch (cmd) {
/*
* Modem commands.
* The SER_* and TIOCM_* flags are the same, but one bit
* shifted. I don't know why.
*/
case TIOCSDTR:
ttydevsw_modem(tp, SER_DTR, 0);
return (0);
case TIOCCDTR:
ttydevsw_modem(tp, 0, SER_DTR);
return (0);
case TIOCMSET: {
int bits = *(int *)data;
ttydevsw_modem(tp,
(bits & (TIOCM_DTR | TIOCM_RTS)) >> 1,
((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1);
return (0);
}
case TIOCMBIS: {
int bits = *(int *)data;
ttydevsw_modem(tp, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1, 0);
return (0);
}
case TIOCMBIC: {
int bits = *(int *)data;
ttydevsw_modem(tp, 0, (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1);
return (0);
}
case TIOCMGET:
*(int *)data = TIOCM_LE + (ttydevsw_modem(tp, 0, 0) << 1);
return (0);
case FIOASYNC:
if (*(int *)data)
tp->t_flags |= TF_ASYNC;
else
tp->t_flags &= ~TF_ASYNC;
return (0);
case FIONBIO:
/* This device supports non-blocking operation. */
return (0);
case FIONREAD:
*(int *)data = ttyinq_bytescanonicalized(&tp->t_inq);
return (0);
case FIONWRITE:
case TIOCOUTQ:
*(int *)data = ttyoutq_bytesused(&tp->t_outq);
return (0);
case FIOSETOWN:
if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
/* Not allowed to set ownership. */
return (ENOTTY);
/* Temporarily unlock the TTY to set ownership. */
tty_unlock(tp);
error = fsetown(*(int *)data, &tp->t_sigio);
tty_lock(tp);
return (error);
case FIOGETOWN:
if (tp->t_session != NULL && !tty_is_ctty(tp, td->td_proc))
/* Not allowed to get ownership. */
return (ENOTTY);
/* Get ownership. */
*(int *)data = fgetown(&tp->t_sigio);
return (0);
case TIOCGETA:
/* Obtain terminal flags through tcgetattr(). */
*(struct termios*)data = tp->t_termios;
return (0);
case TIOCSETA:
case TIOCSETAW:
case TIOCSETAF: {
struct termios *t = data;
/*
* Who makes up these funny rules? According to POSIX,
* input baud rate is set equal to the output baud rate
* when zero.
*/
if (t->c_ispeed == 0)
t->c_ispeed = t->c_ospeed;
/* Discard any unsupported bits. */
t->c_iflag &= TTYSUP_IFLAG;
t->c_oflag &= TTYSUP_OFLAG;
t->c_lflag &= TTYSUP_LFLAG;
t->c_cflag &= TTYSUP_CFLAG;
/* Set terminal flags through tcsetattr(). */
if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
error = tty_drain(tp);
if (error)
return (error);
if (cmd == TIOCSETAF)
tty_flush(tp, FREAD);
}
/*
* Only call param() when the flags really change.
*/
if ((t->c_cflag & CIGNORE) == 0 &&
(tp->t_termios.c_cflag != t->c_cflag ||
tp->t_termios.c_ispeed != t->c_ispeed ||
tp->t_termios.c_ospeed != t->c_ospeed)) {
error = ttydevsw_param(tp, t);
if (error)
return (error);
/* XXX: CLOCAL? */
tp->t_termios.c_cflag = t->c_cflag & ~CIGNORE;
tp->t_termios.c_ispeed = t->c_ispeed;
tp->t_termios.c_ospeed = t->c_ospeed;
/* Baud rate has changed - update watermarks. */
tty_watermarks(tp);
}
/* Copy new non-device driver parameters. */
tp->t_termios.c_iflag = t->c_iflag;
tp->t_termios.c_oflag = t->c_oflag;
tp->t_termios.c_lflag = t->c_lflag;
memcpy(&tp->t_termios.c_cc, t->c_cc, sizeof t->c_cc);
ttydisc_optimize(tp);
if ((t->c_lflag & ICANON) == 0) {
/*
* When in non-canonical mode, wake up all
* readers. Canonicalize any partial input. VMIN
* and VTIME could also be adjusted.
*/
ttyinq_canonicalize(&tp->t_inq);
tty_wakeup(tp, FREAD);
}
/*
* For packet mode: notify the PTY consumer that VSTOP
* and VSTART may have been changed.
*/
if (tp->t_termios.c_iflag & IXON &&
tp->t_termios.c_cc[VSTOP] == CTRL('S') &&
tp->t_termios.c_cc[VSTART] == CTRL('Q'))
ttydevsw_pktnotify(tp, TIOCPKT_DOSTOP);
else
ttydevsw_pktnotify(tp, TIOCPKT_NOSTOP);
return (0);
}
case TIOCGETD:
/* For compatibility - we only support TTYDISC. */
*(int *)data = TTYDISC;
return (0);
case TIOCGPGRP:
if (!tty_is_ctty(tp, td->td_proc))
return (ENOTTY);
if (tp->t_pgrp != NULL)
*(int *)data = tp->t_pgrp->pg_id;
else
*(int *)data = NO_PID;
return (0);
case TIOCGSID:
if (!tty_is_ctty(tp, td->td_proc))
return (ENOTTY);
MPASS(tp->t_session);
*(int *)data = tp->t_session->s_sid;
return (0);
case TIOCSCTTY: {
struct proc *p = td->td_proc;
/* XXX: This looks awful. */
tty_unlock(tp);
sx_xlock(&proctree_lock);
tty_lock(tp);
if (!SESS_LEADER(p)) {
/* Only the session leader may do this. */
sx_xunlock(&proctree_lock);
return (EPERM);
}
if (tp->t_session != NULL && tp->t_session == p->p_session) {
/* This is already our controlling TTY. */
sx_xunlock(&proctree_lock);
return (0);
}
if (p->p_session->s_ttyp != NULL ||
(tp->t_session != NULL && tp->t_session->s_ttyvp != NULL &&
tp->t_session->s_ttyvp->v_type != VBAD)) {
/*
* There is already a relation between a TTY and
* a session, or the caller is not the session
* leader.
*
* Allow the TTY to be stolen when the vnode is
* invalid, but the reference to the TTY is
* still active. This allows immediate reuse of
* TTYs of which the session leader has been
* killed or the TTY revoked.
*/
sx_xunlock(&proctree_lock);
return (EPERM);
}
/* Connect the session to the TTY. */
tp->t_session = p->p_session;
tp->t_session->s_ttyp = tp;
tp->t_sessioncnt++;
sx_xunlock(&proctree_lock);
/* Assign foreground process group. */
tp->t_pgrp = p->p_pgrp;
PROC_LOCK(p);
p->p_flag |= P_CONTROLT;
PROC_UNLOCK(p);
return (0);
}
case TIOCSPGRP: {
struct pgrp *pg;
/*
* XXX: Temporarily unlock the TTY to locate the process
* group. This code would be a lot nicer if we ever
* decomposed proctree_lock.
*/
tty_unlock(tp);
sx_slock(&proctree_lock);
pg = pgfind(*(int *)data);
if (pg != NULL)
PGRP_UNLOCK(pg);
if (pg == NULL || pg->pg_session != td->td_proc->p_session) {
sx_sunlock(&proctree_lock);
tty_lock(tp);
return (EPERM);
}
tty_lock(tp);
/*
* Determine if this TTY is the controlling TTY after
* relocking the TTY.
*/
if (!tty_is_ctty(tp, td->td_proc)) {
sx_sunlock(&proctree_lock);
return (ENOTTY);
}
tp->t_pgrp = pg;
sx_sunlock(&proctree_lock);
/* Wake up the background process groups. */
cv_broadcast(&tp->t_bgwait);
return (0);
}
case TIOCFLUSH: {
int flags = *(int *)data;
if (flags == 0)
flags = (FREAD|FWRITE);
else
flags &= (FREAD|FWRITE);
tty_flush(tp, flags);
return (0);
}
case TIOCDRAIN:
/* Drain TTY output. */
return tty_drain(tp);
case TIOCCONS:
/* Set terminal as console TTY. */
if (*(int *)data) {
error = priv_check(td, PRIV_TTY_CONSOLE);
if (error)
return (error);
/*
* XXX: constty should really need to be locked!
* XXX: allow disconnected constty's to be stolen!
*/
if (constty == tp)
return (0);
if (constty != NULL)
return (EBUSY);
tty_unlock(tp);
constty_set(tp);
tty_lock(tp);
} else if (constty == tp) {
constty_clear();
}
return (0);
case TIOCGWINSZ:
/* Obtain window size. */
*(struct winsize*)data = tp->t_winsize;
return (0);
case TIOCSWINSZ:
/* Set window size. */
if (bcmp(&tp->t_winsize, data, sizeof(struct winsize)) == 0)
return (0);
tp->t_winsize = *(struct winsize*)data;
tty_signal_pgrp(tp, SIGWINCH);
return (0);
case TIOCEXCL:
tp->t_flags |= TF_EXCLUDE;
return (0);
case TIOCNXCL:
tp->t_flags &= ~TF_EXCLUDE;
return (0);
case TIOCSTOP:
tp->t_flags |= TF_STOPPED;
ttydevsw_pktnotify(tp, TIOCPKT_STOP);
return (0);
case TIOCSTART:
tp->t_flags &= ~TF_STOPPED;
ttydevsw_outwakeup(tp);
ttydevsw_pktnotify(tp, TIOCPKT_START);
return (0);
case TIOCSTAT:
tty_info(tp);
return (0);
case TIOCSTI:
if ((fflag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI))
return (EPERM);
if (!tty_is_ctty(tp, td->td_proc) &&
priv_check(td, PRIV_TTY_STI))
return (EACCES);
ttydisc_rint(tp, *(char *)data, 0);
ttydisc_rint_done(tp);
return (0);
}
#ifdef COMPAT_43TTY
return tty_ioctl_compat(tp, cmd, data, fflag, td);
#else /* !COMPAT_43TTY */
return (ENOIOCTL);
#endif /* COMPAT_43TTY */
}
int
tty_ioctl(struct tty *tp, u_long cmd, void *data, int fflag, struct thread *td)
{
int error;
tty_lock_assert(tp, MA_OWNED);
if (tty_gone(tp))
return (ENXIO);
error = ttydevsw_ioctl(tp, cmd, data, td);
if (error == ENOIOCTL)
error = tty_generic_ioctl(tp, cmd, data, fflag, td);
return (error);
}
dev_t
tty_udev(struct tty *tp)
{
if (tp->t_dev)
return dev2udev(tp->t_dev);
else
return NODEV;
}
int
tty_checkoutq(struct tty *tp)
{
/* 256 bytes should be enough to print a log message. */
return (ttyoutq_bytesleft(&tp->t_outq) >= 256);
}
void
tty_hiwat_in_block(struct tty *tp)
{
if ((tp->t_flags & TF_HIWAT_IN) == 0 &&
tp->t_termios.c_iflag & IXOFF &&
tp->t_termios.c_cc[VSTOP] != _POSIX_VDISABLE) {
/*
* Input flow control. Only enter the high watermark when we
* can successfully store the VSTOP character.
*/
if (ttyoutq_write_nofrag(&tp->t_outq,
&tp->t_termios.c_cc[VSTOP], 1) == 0)
tp->t_flags |= TF_HIWAT_IN;
} else {
/* No input flow control. */
tp->t_flags |= TF_HIWAT_IN;
}
}
void
tty_hiwat_in_unblock(struct tty *tp)
{
if (tp->t_flags & TF_HIWAT_IN &&
tp->t_termios.c_iflag & IXOFF &&
tp->t_termios.c_cc[VSTART] != _POSIX_VDISABLE) {
/*
* Input flow control. Only leave the high watermark when we
* can successfully store the VSTART character.
*/
if (ttyoutq_write_nofrag(&tp->t_outq,
&tp->t_termios.c_cc[VSTART], 1) == 0)
tp->t_flags &= ~TF_HIWAT_IN;
} else {
/* No input flow control. */
tp->t_flags &= ~TF_HIWAT_IN;
}
if (!tty_gone(tp))
ttydevsw_inwakeup(tp);
}
/*
* TTY hooks interface.
*/
static int
ttyhook_defrint(struct tty *tp, char c, int flags)
{
if (ttyhook_rint_bypass(tp, &c, 1) != 1)
return (-1);
return (0);
}
int
ttyhook_register(struct tty **rtp, struct proc *p, int fd,
struct ttyhook *th, void *softc)
{
struct tty *tp;
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_cap;
#endif
struct cdev *dev;
struct cdevsw *cdp;
struct filedesc *fdp;
int error, ref;
/* Validate the file descriptor. */
if ((fdp = p->p_fd) == NULL)
return (EBADF);
fp = fget_unlocked(fdp, fd);
if (fp == NULL)
return (EBADF);
if (fp->f_ops == &badfileops) {
error = EBADF;
goto done1;
}
#ifdef CAPABILITIES
fp_cap = fp;
error = cap_funwrap(fp_cap, CAP_TTYHOOK, &fp);
if (error)
return (error);
#endif
/*
* Make sure the vnode is bound to a character device.
* An unlocked check of the vnode type is OK here, because we
* only need to prevent calling devvn_refthread() on a file that
* has never been opened over a character device.
*/
if (fp->f_type != DTYPE_VNODE || fp->f_vnode->v_type != VCHR) {
error = EINVAL;
goto done1;
}
/* Make sure it is a TTY. */
cdp = devvn_refthread(fp->f_vnode, &dev, &ref);
if (cdp == NULL) {
error = ENXIO;
goto done1;
}
if (dev != fp->f_data) {
error = ENXIO;
goto done2;
}
if (cdp != &ttydev_cdevsw) {
error = ENOTTY;
goto done2;
}
tp = dev->si_drv1;
/* Try to attach the hook to the TTY. */
error = EBUSY;
tty_lock(tp);
MPASS((tp->t_hook == NULL) == ((tp->t_flags & TF_HOOK) == 0));
if (tp->t_flags & TF_HOOK)
goto done3;
tp->t_flags |= TF_HOOK;
tp->t_hook = th;
tp->t_hooksoftc = softc;
*rtp = tp;
error = 0;
/* Maybe we can switch into bypass mode now. */
ttydisc_optimize(tp);
/* Silently convert rint() calls to rint_bypass() when possible. */
if (!ttyhook_hashook(tp, rint) && ttyhook_hashook(tp, rint_bypass))
th->th_rint = ttyhook_defrint;
done3: tty_unlock(tp);
done2: dev_relthread(dev, ref);
done1: fdrop(fp, curthread);
return (error);
}
void
ttyhook_unregister(struct tty *tp)
{
tty_lock_assert(tp, MA_OWNED);
MPASS(tp->t_flags & TF_HOOK);
/* Disconnect the hook. */
tp->t_flags &= ~TF_HOOK;
tp->t_hook = NULL;
/* Maybe we need to leave bypass mode. */
ttydisc_optimize(tp);
/* Maybe deallocate the TTY as well. */
tty_rel_free(tp);
}
/*
* /dev/console handling.
*/
static int
ttyconsdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
struct tty *tp;
/* System has no console device. */
if (dev_console_filename == NULL)
return (ENXIO);
/* Look up corresponding TTY by device name. */
sx_slock(&tty_list_sx);
TAILQ_FOREACH(tp, &tty_list, t_list) {
if (strcmp(dev_console_filename, tty_devname(tp)) == 0) {
dev_console->si_drv1 = tp;
break;
}
}
sx_sunlock(&tty_list_sx);
/* System console has no TTY associated. */
if (dev_console->si_drv1 == NULL)
return (ENXIO);
return (ttydev_open(dev, oflags, devtype, td));
}
static int
ttyconsdev_write(struct cdev *dev, struct uio *uio, int ioflag)
{
log_console(uio);
return (ttydev_write(dev, uio, ioflag));
}
/*
* /dev/console is a little different from normal TTYs. When opened,
* it determines which TTY to use. When data gets written to it, it
* will be logged in the kernel message buffer.
*/
static struct cdevsw ttyconsdev_cdevsw = {
.d_version = D_VERSION,
.d_open = ttyconsdev_open,
.d_close = ttydev_close,
.d_read = ttydev_read,
.d_write = ttyconsdev_write,
.d_ioctl = ttydev_ioctl,
.d_kqfilter = ttydev_kqfilter,
.d_poll = ttydev_poll,
.d_mmap = ttydev_mmap,
.d_name = "ttyconsdev",
.d_flags = D_TTY,
};
static void
ttyconsdev_init(void *unused)
{
dev_console = make_dev_credf(MAKEDEV_ETERNAL, &ttyconsdev_cdevsw, 0,
NULL, UID_ROOT, GID_WHEEL, 0600, "console");
}
SYSINIT(tty, SI_SUB_DRIVERS, SI_ORDER_FIRST, ttyconsdev_init, NULL);
void
ttyconsdev_select(const char *name)
{
dev_console_filename = name;
}
/*
* Debugging routines.
*/
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
static struct {
int flag;
char val;
} ttystates[] = {
#if 0
{ TF_NOPREFIX, 'N' },
#endif
{ TF_INITLOCK, 'I' },
{ TF_CALLOUT, 'C' },
/* Keep these together -> 'Oi' and 'Oo'. */
{ TF_OPENED, 'O' },
{ TF_OPENED_IN, 'i' },
{ TF_OPENED_OUT, 'o' },
{ TF_OPENED_CONS, 'c' },
{ TF_GONE, 'G' },
{ TF_OPENCLOSE, 'B' },
{ TF_ASYNC, 'Y' },
{ TF_LITERAL, 'L' },
/* Keep these together -> 'Hi' and 'Ho'. */
{ TF_HIWAT, 'H' },
{ TF_HIWAT_IN, 'i' },
{ TF_HIWAT_OUT, 'o' },
{ TF_STOPPED, 'S' },
{ TF_EXCLUDE, 'X' },
{ TF_BYPASS, 'l' },
{ TF_ZOMBIE, 'Z' },
{ TF_HOOK, 's' },
/* Keep these together -> 'bi' and 'bo'. */
{ TF_BUSY, 'b' },
{ TF_BUSY_IN, 'i' },
{ TF_BUSY_OUT, 'o' },
{ 0, '\0'},
};
#define TTY_FLAG_BITS \
"\20\1NOPREFIX\2INITLOCK\3CALLOUT\4OPENED_IN\5OPENED_OUT\6GONE" \
"\7OPENCLOSE\10ASYNC\11LITERAL\12HIWAT_IN\13HIWAT_OUT\14STOPPED" \
"\15EXCLUDE\16BYPASS\17ZOMBIE\20HOOK"
#define DB_PRINTSYM(name, addr) \
db_printf("%s " #name ": ", sep); \
db_printsym((db_addr_t) addr, DB_STGY_ANY); \
db_printf("\n");
static void
_db_show_devsw(const char *sep, const struct ttydevsw *tsw)
{
db_printf("%sdevsw: ", sep);
db_printsym((db_addr_t)tsw, DB_STGY_ANY);
db_printf(" (%p)\n", tsw);
DB_PRINTSYM(open, tsw->tsw_open);
DB_PRINTSYM(close, tsw->tsw_close);
DB_PRINTSYM(outwakeup, tsw->tsw_outwakeup);
DB_PRINTSYM(inwakeup, tsw->tsw_inwakeup);
DB_PRINTSYM(ioctl, tsw->tsw_ioctl);
DB_PRINTSYM(param, tsw->tsw_param);
DB_PRINTSYM(modem, tsw->tsw_modem);
DB_PRINTSYM(mmap, tsw->tsw_mmap);
DB_PRINTSYM(pktnotify, tsw->tsw_pktnotify);
DB_PRINTSYM(free, tsw->tsw_free);
}
static void
_db_show_hooks(const char *sep, const struct ttyhook *th)
{
db_printf("%shook: ", sep);
db_printsym((db_addr_t)th, DB_STGY_ANY);
db_printf(" (%p)\n", th);
if (th == NULL)
return;
DB_PRINTSYM(rint, th->th_rint);
DB_PRINTSYM(rint_bypass, th->th_rint_bypass);
DB_PRINTSYM(rint_done, th->th_rint_done);
DB_PRINTSYM(rint_poll, th->th_rint_poll);
DB_PRINTSYM(getc_inject, th->th_getc_inject);
DB_PRINTSYM(getc_capture, th->th_getc_capture);
DB_PRINTSYM(getc_poll, th->th_getc_poll);
DB_PRINTSYM(close, th->th_close);
}
static void
_db_show_termios(const char *name, const struct termios *t)
{
db_printf("%s: iflag 0x%x oflag 0x%x cflag 0x%x "
"lflag 0x%x ispeed %u ospeed %u\n", name,
t->c_iflag, t->c_oflag, t->c_cflag, t->c_lflag,
t->c_ispeed, t->c_ospeed);
}
/* DDB command to show TTY statistics. */
DB_SHOW_COMMAND(tty, db_show_tty)
{
struct tty *tp;
if (!have_addr) {
db_printf("usage: show tty <addr>\n");
return;
}
tp = (struct tty *)addr;
db_printf("0x%p: %s\n", tp, tty_devname(tp));
db_printf("\tmtx: %p\n", tp->t_mtx);
db_printf("\tflags: %b\n", tp->t_flags, TTY_FLAG_BITS);
db_printf("\trevokecnt: %u\n", tp->t_revokecnt);
/* Buffering mechanisms. */
db_printf("\tinq: %p begin %u linestart %u reprint %u end %u "
"nblocks %u quota %u\n", &tp->t_inq, tp->t_inq.ti_begin,
tp->t_inq.ti_linestart, tp->t_inq.ti_reprint, tp->t_inq.ti_end,
tp->t_inq.ti_nblocks, tp->t_inq.ti_quota);
db_printf("\toutq: %p begin %u end %u nblocks %u quota %u\n",
&tp->t_outq, tp->t_outq.to_begin, tp->t_outq.to_end,
tp->t_outq.to_nblocks, tp->t_outq.to_quota);
db_printf("\tinlow: %zu\n", tp->t_inlow);
db_printf("\toutlow: %zu\n", tp->t_outlow);
_db_show_termios("\ttermios", &tp->t_termios);
db_printf("\twinsize: row %u col %u xpixel %u ypixel %u\n",
tp->t_winsize.ws_row, tp->t_winsize.ws_col,
tp->t_winsize.ws_xpixel, tp->t_winsize.ws_ypixel);
db_printf("\tcolumn: %u\n", tp->t_column);
db_printf("\twritepos: %u\n", tp->t_writepos);
db_printf("\tcompatflags: 0x%x\n", tp->t_compatflags);
/* Init/lock-state devices. */
_db_show_termios("\ttermios_init_in", &tp->t_termios_init_in);
_db_show_termios("\ttermios_init_out", &tp->t_termios_init_out);
_db_show_termios("\ttermios_lock_in", &tp->t_termios_lock_in);
_db_show_termios("\ttermios_lock_out", &tp->t_termios_lock_out);
/* Hooks */
_db_show_devsw("\t", tp->t_devsw);
_db_show_hooks("\t", tp->t_hook);
/* Process info. */
db_printf("\tpgrp: %p gid %d jobc %d\n", tp->t_pgrp,
tp->t_pgrp ? tp->t_pgrp->pg_id : 0,
tp->t_pgrp ? tp->t_pgrp->pg_jobc : 0);
db_printf("\tsession: %p", tp->t_session);
if (tp->t_session != NULL)
db_printf(" count %u leader %p tty %p sid %d login %s",
tp->t_session->s_count, tp->t_session->s_leader,
tp->t_session->s_ttyp, tp->t_session->s_sid,
tp->t_session->s_login);
db_printf("\n");
db_printf("\tsessioncnt: %u\n", tp->t_sessioncnt);
db_printf("\tdevswsoftc: %p\n", tp->t_devswsoftc);
db_printf("\thooksoftc: %p\n", tp->t_hooksoftc);
db_printf("\tdev: %p\n", tp->t_dev);
}
/* DDB command to list TTYs. */
DB_SHOW_ALL_COMMAND(ttys, db_show_all_ttys)
{
struct tty *tp;
size_t isiz, osiz;
int i, j;
/* Make the output look like `pstat -t'. */
db_printf("PTR ");
#if defined(__LP64__)
db_printf(" ");
#endif
db_printf(" LINE INQ CAN LIN LOW OUTQ USE LOW "
"COL SESS PGID STATE\n");
TAILQ_FOREACH(tp, &tty_list, t_list) {
isiz = tp->t_inq.ti_nblocks * TTYINQ_DATASIZE;
osiz = tp->t_outq.to_nblocks * TTYOUTQ_DATASIZE;
db_printf("%p %10s %5zu %4u %4u %4zu %5zu %4u %4zu %5u %5d %5d ",
tp,
tty_devname(tp),
isiz,
tp->t_inq.ti_linestart - tp->t_inq.ti_begin,
tp->t_inq.ti_end - tp->t_inq.ti_linestart,
isiz - tp->t_inlow,
osiz,
tp->t_outq.to_end - tp->t_outq.to_begin,
osiz - tp->t_outlow,
MIN(tp->t_column, 99999),
tp->t_session ? tp->t_session->s_sid : 0,
tp->t_pgrp ? tp->t_pgrp->pg_id : 0);
/* Flag bits. */
for (i = j = 0; ttystates[i].flag; i++)
if (tp->t_flags & ttystates[i].flag) {
db_printf("%c", ttystates[i].val);
j++;
}
if (j == 0)
db_printf("-");
db_printf("\n");
}
}
#endif /* DDB */
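For illustration only (this sketch is not part of the commit): the ttydevsw/tty_alloc()/tty_makedev() interface shown above is what terminal drivers program against, with any callbacks they leave unset patched to the ttydevsw_def*() stubs. A minimal pseudo-driver that exposes a device and discards all output could look roughly like the following; the "nulltty" name, softc layout, and malloc type are invented for the example, and ttydisc_getc() is assumed to be the usual line-discipline helper from <sys/ttydisc.h> for draining the output queue.

/*
 * Hypothetical minimal driver sketch built on the API above; it is not
 * part of this file. It allocates a TTY, exposes /dev/ttyvnull0, and
 * throws away everything written to it.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/tty.h>
#include <sys/ttydisc.h>
static MALLOC_DEFINE(M_NULLTTY, "nulltty", "null tty example");
struct nulltty_softc {
	struct tty *nts_tty;
};
static void
nulltty_outwakeup(struct tty *tp)
{
	char buf[256];
	/* Drain and discard whatever the line discipline hands us. */
	while (ttydisc_getc(tp, buf, sizeof buf) > 0)
		continue;
}
static void
nulltty_free(void *softc)
{
	/* Called once the TTY is fully deallocated. */
	free(softc, M_NULLTTY);
}
static struct ttydevsw nulltty_class = {
	.tsw_outwakeup	= nulltty_outwakeup,
	.tsw_free	= nulltty_free,
	/* All other callbacks fall back to the ttydevsw_def*() stubs. */
};
static struct nulltty_softc *
nulltty_attach(void)
{
	struct nulltty_softc *sc;
	sc = malloc(sizeof(*sc), M_NULLTTY, M_WAITOK|M_ZERO);
	sc->nts_tty = tty_alloc(&nulltty_class, sc);
	/* The "tty" prefix is added automatically: /dev/ttyvnull0. */
	tty_makedev(sc->nts_tty, NULL, "vnull%d", 0);
	return (sc);
}
static void
nulltty_detach(struct nulltty_softc *sc)
{
	/* tty_rel_gone() wakes up sleepers and schedules deallocation. */
	tty_lock(sc->nts_tty);
	tty_rel_gone(sc->nts_tty);
}

Detach mirrors ptsdev_close() above: the TTY is locked, marked gone, and the actual teardown (including the tsw_free callback) happens once all references have drained.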
Index: head/sys/kern/tty_pts.c
===================================================================
--- head/sys/kern/tty_pts.c (revision 225616)
+++ head/sys/kern/tty_pts.c (revision 225617)
@@ -1,856 +1,856 @@
/*-
* Copyright (c) 2008 Ed Schouten <ed@FreeBSD.org>
* All rights reserved.
*
* Portions of this software were developed under sponsorship from Snow
* B.V., the Netherlands.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/* Add compatibility bits for FreeBSD. */
#define PTS_COMPAT
/* Add pty(4) compat bits. */
#define PTS_EXTERNAL
/* Add bits to make Linux binaries work. */
#define PTS_LINUX
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/condvar.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/serial.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#include <machine/stdarg.h>
/*
* Our utmp(5) format is limited to 8-byte TTY line names. This means
* we can at most allocate 1000 pseudo-terminals ("pts/999"). Allow
* users to increase this number, assuming they have manually increased
* UT_LINESIZE.
*/
static struct unrhdr *pts_pool;
static MALLOC_DEFINE(M_PTS, "pts", "pseudo tty device");
/*
* Per-PTS structure.
*
* List of locks
* (t) locked by tty_lock()
* (c) const until freeing
*/
struct pts_softc {
int pts_unit; /* (c) Device unit number. */
unsigned int pts_flags; /* (t) Device flags. */
#define PTS_PKT 0x1 /* Packet mode. */
#define PTS_FINISHED 0x2 /* Return errors on read()/write(). */
char pts_pkt; /* (t) Unread packet mode data. */
struct cv pts_inwait; /* (t) Blocking write() on master. */
struct selinfo pts_inpoll; /* (t) Select queue for write(). */
struct cv pts_outwait; /* (t) Blocking read() on master. */
struct selinfo pts_outpoll; /* (t) Select queue for read(). */
#ifdef PTS_EXTERNAL
struct cdev *pts_cdev; /* (c) Master device node. */
#endif /* PTS_EXTERNAL */
struct ucred *pts_cred; /* (c) Resource limit. */
};
/*
* Controller-side file operations.
*/
static int
ptsdev_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int error = 0;
char pkt;
if (uio->uio_resid == 0)
return (0);
tty_lock(tp);
for (;;) {
/*
* Implement packet mode. When packet mode is turned on,
* the first byte contains a bitmask of events that
* occurred (start, stop, flush, window size, etc.).
*/
if (psc->pts_flags & PTS_PKT && psc->pts_pkt) {
pkt = psc->pts_pkt;
psc->pts_pkt = 0;
tty_unlock(tp);
error = ureadc(pkt, uio);
return (error);
}
/*
* Transmit regular data.
*
* XXX: We shouldn't use ttydisc_getc_poll()! Even
* though in this implementation, there is likely going
* to be data, we should just call ttydisc_getc_uio()
* and use its return value to sleep.
*/
if (ttydisc_getc_poll(tp)) {
if (psc->pts_flags & PTS_PKT) {
/*
* XXX: Small race. Fortunately PTY
* consumers aren't multithreaded.
*/
tty_unlock(tp);
error = ureadc(TIOCPKT_DATA, uio);
if (error)
return (error);
tty_lock(tp);
}
error = ttydisc_getc_uio(tp, uio);
break;
}
/* Maybe the device isn't used anyway. */
if (psc->pts_flags & PTS_FINISHED)
break;
/* Wait for more data. */
if (fp->f_flag & O_NONBLOCK) {
error = EWOULDBLOCK;
break;
}
error = cv_wait_sig(&psc->pts_outwait, tp->t_mtx);
if (error != 0)
break;
}
tty_unlock(tp);
return (error);
}
static int
ptsdev_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
char ib[256], *ibstart;
size_t iblen, rintlen;
int error = 0;
if (uio->uio_resid == 0)
return (0);
for (;;) {
ibstart = ib;
iblen = MIN(uio->uio_resid, sizeof ib);
error = uiomove(ib, iblen, uio);
tty_lock(tp);
if (error != 0) {
iblen = 0;
goto done;
}
/*
* When possible, avoid the slow path. rint_bypass()
* copies all input to the input queue at once.
*/
MPASS(iblen > 0);
do {
rintlen = ttydisc_rint_simple(tp, ibstart, iblen);
ibstart += rintlen;
iblen -= rintlen;
if (iblen == 0) {
/* All data written. */
break;
}
/* Maybe the device isn't used anyway. */
if (psc->pts_flags & PTS_FINISHED) {
error = EIO;
goto done;
}
/* Wait for more data. */
if (fp->f_flag & O_NONBLOCK) {
error = EWOULDBLOCK;
goto done;
}
/* Wake up users on the slave side. */
ttydisc_rint_done(tp);
error = cv_wait_sig(&psc->pts_inwait, tp->t_mtx);
if (error != 0)
goto done;
} while (iblen > 0);
if (uio->uio_resid == 0)
break;
tty_unlock(tp);
}
done: ttydisc_rint_done(tp);
tty_unlock(tp);
/*
* Don't account for the part of the buffer that we couldn't
* pass to the TTY.
*/
uio->uio_resid += iblen;
return (error);
}
static int
ptsdev_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
static int
ptsdev_ioctl(struct file *fp, u_long cmd, void *data,
struct ucred *active_cred, struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int error = 0, sig;
switch (cmd) {
case FIONBIO:
/* This device supports non-blocking operation. */
return (0);
case FIONREAD:
tty_lock(tp);
if (psc->pts_flags & PTS_FINISHED) {
/* Force read() to be called. */
*(int *)data = 1;
} else {
*(int *)data = ttydisc_getc_poll(tp);
}
tty_unlock(tp);
return (0);
case FIODGNAME: {
struct fiodgname_arg *fgn;
const char *p;
int i;
/* Reverse device name lookups, for ptsname() and ttyname(). */
fgn = data;
p = tty_devname(tp);
i = strlen(p) + 1;
if (i > fgn->len)
return (EINVAL);
return copyout(p, fgn->buf, i);
}
/*
* We need to implement TIOCGPGRP and TIOCGSID here again. When
* called on the pseudo-terminal master, it should not check if
* the terminal is the foreground terminal of the calling
* process.
*
* TIOCGETA is also implemented here. Various Linux PTY routines
* often call isatty(), which is implemented by tcgetattr().
*/
#ifdef PTS_LINUX
case TIOCGETA:
/* Obtain terminal flags through tcgetattr(). */
tty_lock(tp);
*(struct termios*)data = tp->t_termios;
tty_unlock(tp);
return (0);
#endif /* PTS_LINUX */
case TIOCSETAF:
case TIOCSETAW:
/*
* We must make sure we turn tcsetattr() calls of TCSAFLUSH and
* TCSADRAIN into something different. If an application would
* call TCSAFLUSH or TCSADRAIN on the master descriptor, it may
* deadlock waiting for all data to be read.
*/
cmd = TIOCSETA;
break;
#if defined(PTS_COMPAT) || defined(PTS_LINUX)
case TIOCGPTN:
/*
* Get the device unit number.
*/
if (psc->pts_unit < 0)
return (ENOTTY);
*(unsigned int *)data = psc->pts_unit;
return (0);
#endif /* PTS_COMPAT || PTS_LINUX */
case TIOCGPGRP:
/* Get the foreground process group ID. */
tty_lock(tp);
if (tp->t_pgrp != NULL)
*(int *)data = tp->t_pgrp->pg_id;
else
*(int *)data = NO_PID;
tty_unlock(tp);
return (0);
case TIOCGSID:
/* Get the session leader process ID. */
tty_lock(tp);
if (tp->t_session == NULL)
error = ENOTTY;
else
*(int *)data = tp->t_session->s_sid;
tty_unlock(tp);
return (error);
case TIOCPTMASTER:
/* Yes, we are a pseudo-terminal master. */
return (0);
case TIOCSIG:
/* Signal the foreground process group. */
sig = *(int *)data;
if (sig < 1 || sig >= NSIG)
return (EINVAL);
tty_lock(tp);
tty_signal_pgrp(tp, sig);
tty_unlock(tp);
return (0);
case TIOCPKT:
/* Enable/disable packet mode. */
tty_lock(tp);
if (*(int *)data)
psc->pts_flags |= PTS_PKT;
else
psc->pts_flags &= ~PTS_PKT;
tty_unlock(tp);
return (0);
}
/* Just redirect this ioctl to the slave device. */
tty_lock(tp);
error = tty_ioctl(tp, cmd, data, fp->f_flag, td);
tty_unlock(tp);
if (error == ENOIOCTL)
error = ENOTTY;
return (error);
}
static int
ptsdev_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int revents = 0;
tty_lock(tp);
if (psc->pts_flags & PTS_FINISHED) {
/* Slave device is not opened. */
tty_unlock(tp);
return ((events & (POLLIN|POLLRDNORM)) | POLLHUP);
}
if (events & (POLLIN|POLLRDNORM)) {
/* See if we can getc something. */
if (ttydisc_getc_poll(tp) ||
(psc->pts_flags & PTS_PKT && psc->pts_pkt))
revents |= events & (POLLIN|POLLRDNORM);
}
if (events & (POLLOUT|POLLWRNORM)) {
/* See if we can rint something. */
if (ttydisc_rint_poll(tp))
revents |= events & (POLLOUT|POLLWRNORM);
}
/*
* No need to check for POLLHUP here. This device cannot be used
* as a callout device, which means we always have a carrier,
* because the master is.
*/
if (revents == 0) {
/*
* This code might look misleading, but the naming of
* poll events on this side is the opposite of the slave
* device.
*/
if (events & (POLLIN|POLLRDNORM))
selrecord(td, &psc->pts_outpoll);
if (events & (POLLOUT|POLLWRNORM))
selrecord(td, &psc->pts_inpoll);
}
tty_unlock(tp);
return (revents);
}
/*
* kqueue support.
*/
static void
pts_kqops_read_detach(struct knote *kn)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
knlist_remove(&psc->pts_outpoll.si_note, kn, 0);
}
static int
pts_kqops_read_event(struct knote *kn, long hint)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
if (psc->pts_flags & PTS_FINISHED) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_getc_poll(tp);
return (kn->kn_data > 0);
}
}
static void
pts_kqops_write_detach(struct knote *kn)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
knlist_remove(&psc->pts_inpoll.si_note, kn, 0);
}
static int
pts_kqops_write_event(struct knote *kn, long hint)
{
struct file *fp = kn->kn_fp;
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
if (psc->pts_flags & PTS_FINISHED) {
kn->kn_flags |= EV_EOF;
return (1);
} else {
kn->kn_data = ttydisc_rint_poll(tp);
return (kn->kn_data > 0);
}
}
static struct filterops pts_kqops_read = {
.f_isfd = 1,
.f_detach = pts_kqops_read_detach,
.f_event = pts_kqops_read_event,
};
static struct filterops pts_kqops_write = {
.f_isfd = 1,
.f_detach = pts_kqops_write_detach,
.f_event = pts_kqops_write_event,
};
static int
ptsdev_kqfilter(struct file *fp, struct knote *kn)
{
struct tty *tp = fp->f_data;
struct pts_softc *psc = tty_softc(tp);
int error = 0;
tty_lock(tp);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &pts_kqops_read;
knlist_add(&psc->pts_outpoll.si_note, kn, 1);
break;
case EVFILT_WRITE:
kn->kn_fop = &pts_kqops_write;
knlist_add(&psc->pts_inpoll.si_note, kn, 1);
break;
default:
error = EINVAL;
break;
}
tty_unlock(tp);
return (error);
}
static int
ptsdev_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct tty *tp = fp->f_data;
#ifdef PTS_EXTERNAL
struct pts_softc *psc = tty_softc(tp);
#endif /* PTS_EXTERNAL */
struct cdev *dev = tp->t_dev;
/*
* According to POSIX, we must implement an fstat(). This also
* makes this implementation compatible with Linux binaries,
* because Linux calls fstat() on the pseudo-terminal master to
* obtain st_rdev.
*
* XXX: POSIX also mentions we must fill in st_dev, but how?
*/
bzero(sb, sizeof *sb);
#ifdef PTS_EXTERNAL
if (psc->pts_cdev != NULL)
sb->st_ino = sb->st_rdev = dev2udev(psc->pts_cdev);
else
#endif /* PTS_EXTERNAL */
sb->st_ino = sb->st_rdev = tty_udev(tp);
sb->st_atim = dev->si_atime;
sb->st_ctim = dev->si_ctime;
sb->st_mtim = dev->si_mtime;
sb->st_uid = dev->si_uid;
sb->st_gid = dev->si_gid;
sb->st_mode = dev->si_mode | S_IFCHR;
return (0);
}
static int
ptsdev_close(struct file *fp, struct thread *td)
{
struct tty *tp = fp->f_data;
/* Deallocate TTY device. */
tty_lock(tp);
tty_rel_gone(tp);
/*
* Open of /dev/ptmx or /dev/ptyXX changes the type of file
* from DTYPE_VNODE to DTYPE_PTS. vn_open() increases the vnode
* use count; we need to decrement it and possibly do other
* required cleanup.
*/
if (fp->f_vnode != NULL)
return (vnops.fo_close(fp, td));
return (0);
}
static struct fileops ptsdev_ops = {
.fo_read = ptsdev_read,
.fo_write = ptsdev_write,
.fo_truncate = ptsdev_truncate,
.fo_ioctl = ptsdev_ioctl,
.fo_poll = ptsdev_poll,
.fo_kqfilter = ptsdev_kqfilter,
.fo_stat = ptsdev_stat,
.fo_close = ptsdev_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
.fo_flags = DFLAG_PASSABLE,
};
/*
* Driver-side hooks.
*/
static void
ptsdrv_outwakeup(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
cv_broadcast(&psc->pts_outwait);
selwakeup(&psc->pts_outpoll);
KNOTE_LOCKED(&psc->pts_outpoll.si_note, 0);
}
static void
ptsdrv_inwakeup(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
cv_broadcast(&psc->pts_inwait);
selwakeup(&psc->pts_inpoll);
KNOTE_LOCKED(&psc->pts_inpoll.si_note, 0);
}
static int
ptsdrv_open(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
psc->pts_flags &= ~PTS_FINISHED;
return (0);
}
static void
ptsdrv_close(struct tty *tp)
{
struct pts_softc *psc = tty_softc(tp);
/* Wake up any blocked readers/writers. */
psc->pts_flags |= PTS_FINISHED;
ptsdrv_outwakeup(tp);
ptsdrv_inwakeup(tp);
}
static void
ptsdrv_pktnotify(struct tty *tp, char event)
{
struct pts_softc *psc = tty_softc(tp);
/*
* Clear conflicting flags.
*/
switch (event) {
case TIOCPKT_STOP:
psc->pts_pkt &= ~TIOCPKT_START;
break;
case TIOCPKT_START:
psc->pts_pkt &= ~TIOCPKT_STOP;
break;
case TIOCPKT_NOSTOP:
psc->pts_pkt &= ~TIOCPKT_DOSTOP;
break;
case TIOCPKT_DOSTOP:
psc->pts_pkt &= ~TIOCPKT_NOSTOP;
break;
}
psc->pts_pkt |= event;
ptsdrv_outwakeup(tp);
}
static void
ptsdrv_free(void *softc)
{
struct pts_softc *psc = softc;
/* Make device number available again. */
if (psc->pts_unit >= 0)
free_unr(pts_pool, psc->pts_unit);
chgptscnt(psc->pts_cred->cr_ruidinfo, -1, 0);
racct_sub_cred(psc->pts_cred, RACCT_NPTS, 1);
crfree(psc->pts_cred);
seldrain(&psc->pts_inpoll);
seldrain(&psc->pts_outpoll);
knlist_destroy(&psc->pts_inpoll.si_note);
knlist_destroy(&psc->pts_outpoll.si_note);
#ifdef PTS_EXTERNAL
/* Destroy master device as well. */
if (psc->pts_cdev != NULL)
destroy_dev_sched(psc->pts_cdev);
#endif /* PTS_EXTERNAL */
free(psc, M_PTS);
}
static struct ttydevsw pts_class = {
.tsw_flags = TF_NOPREFIX,
.tsw_outwakeup = ptsdrv_outwakeup,
.tsw_inwakeup = ptsdrv_inwakeup,
.tsw_open = ptsdrv_open,
.tsw_close = ptsdrv_close,
.tsw_pktnotify = ptsdrv_pktnotify,
.tsw_free = ptsdrv_free,
};
#ifndef PTS_EXTERNAL
static
#endif /* !PTS_EXTERNAL */
int
pts_alloc(int fflags, struct thread *td, struct file *fp)
{
int unit, ok, error;
struct tty *tp;
struct pts_softc *psc;
struct proc *p = td->td_proc;
struct ucred *cred = td->td_ucred;
/* Resource limiting. */
PROC_LOCK(p);
error = racct_add(p, RACCT_NPTS, 1);
if (error != 0) {
PROC_UNLOCK(p);
return (EAGAIN);
}
ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
if (!ok) {
racct_sub(p, RACCT_NPTS, 1);
PROC_UNLOCK(p);
return (EAGAIN);
}
PROC_UNLOCK(p);
/* Try to allocate a new pts unit number. */
unit = alloc_unr(pts_pool);
if (unit < 0) {
racct_sub(p, RACCT_NPTS, 1);
chgptscnt(cred->cr_ruidinfo, -1, 0);
return (EAGAIN);
}
/* Allocate TTY and softc. */
psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
cv_init(&psc->pts_inwait, "ptsin");
cv_init(&psc->pts_outwait, "ptsout");
psc->pts_unit = unit;
psc->pts_cred = crhold(cred);
tp = tty_alloc(&pts_class, psc);
knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
/* Expose the slave device as well. */
tty_makedev(tp, td->td_ucred, "pts/%u", psc->pts_unit);
finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
return (0);
}
#ifdef PTS_EXTERNAL
int
pts_alloc_external(int fflags, struct thread *td, struct file *fp,
struct cdev *dev, const char *name)
{
int ok, error;
struct tty *tp;
struct pts_softc *psc;
struct proc *p = td->td_proc;
struct ucred *cred = td->td_ucred;
/* Resource limiting. */
PROC_LOCK(p);
error = racct_add(p, RACCT_NPTS, 1);
if (error != 0) {
PROC_UNLOCK(p);
return (EAGAIN);
}
ok = chgptscnt(cred->cr_ruidinfo, 1, lim_cur(p, RLIMIT_NPTS));
if (!ok) {
racct_sub(p, RACCT_NPTS, 1);
PROC_UNLOCK(p);
return (EAGAIN);
}
PROC_UNLOCK(p);
/* Allocate TTY and softc. */
psc = malloc(sizeof(struct pts_softc), M_PTS, M_WAITOK|M_ZERO);
cv_init(&psc->pts_inwait, "ptsin");
cv_init(&psc->pts_outwait, "ptsout");
psc->pts_unit = -1;
psc->pts_cdev = dev;
psc->pts_cred = crhold(cred);
tp = tty_alloc(&pts_class, psc);
knlist_init_mtx(&psc->pts_inpoll.si_note, tp->t_mtx);
knlist_init_mtx(&psc->pts_outpoll.si_note, tp->t_mtx);
/* Expose the slave device as well. */
tty_makedev(tp, td->td_ucred, "%s", name);
finit(fp, fflags, DTYPE_PTS, tp, &ptsdev_ops);
return (0);
}
#endif /* PTS_EXTERNAL */
int
-posix_openpt(struct thread *td, struct posix_openpt_args *uap)
+sys_posix_openpt(struct thread *td, struct posix_openpt_args *uap)
{
int error, fd;
struct file *fp;
/*
* POSIX states it's unspecified when other flags are passed. We
* don't allow this.
*/
if (uap->flags & ~(O_RDWR|O_NOCTTY))
return (EINVAL);
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
/* Allocate the actual pseudo-TTY. */
error = pts_alloc(FFLAGS(uap->flags & O_ACCMODE), td, fp);
if (error != 0) {
fdclose(td->td_proc->p_fd, fp, fd, td);
return (error);
}
/* Pass it back to userspace. */
td->td_retval[0] = fd;
fdrop(fp, td);
return (0);
}
static void
pts_init(void *unused)
{
pts_pool = new_unrhdr(0, INT_MAX, NULL);
}
SYSINIT(pts, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, pts_init, NULL);
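For reference only (not part of this commit): the pts code above backs the posix_openpt(2) entry point being renamed here, and the standard userspace consumer of it follows the usual POSIX sequence of posix_openpt(), grantpt(), unlockpt(), and ptsname(). A minimal sketch:

/*
 * Hypothetical userspace example, not part of this file: allocate a
 * pseudo-terminal master via posix_openpt(2) and open its slave side.
 */
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int
main(void)
{
	int mfd, sfd;
	const char *sname;
	mfd = posix_openpt(O_RDWR | O_NOCTTY);	/* ends up in pts_alloc() */
	if (mfd == -1)
		err(1, "posix_openpt");
	if (grantpt(mfd) == -1 || unlockpt(mfd) == -1)
		err(1, "grantpt/unlockpt");
	sname = ptsname(mfd);			/* e.g. /dev/pts/0 */
	printf("slave: %s\n", sname);
	sfd = open(sname, O_RDWR);		/* opens the slave device */
	if (sfd == -1)
		err(1, "open");
	/* mfd and sfd are now connected through the TTY layer. */
	write(sfd, "hello\n", 6);
	close(sfd);
	close(mfd);
	return (0);
}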
Index: head/sys/kern/uipc_mqueue.c
===================================================================
--- head/sys/kern/uipc_mqueue.c (revision 225616)
+++ head/sys/kern/uipc_mqueue.c (revision 225617)
@@ -1,2836 +1,2836 @@
/*-
* Copyright (c) 2005 David Xu <davidxu@freebsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* POSIX message queue implementation.
*
* 1) An mqueue filesystem can be mounted. Each message queue appears
* in the mounted directory, and the user can change a queue's
* permissions and ownership, or remove a queue. Manually creating a
* file in the directory causes a message queue to be created in the
* kernel with default message queue attributes and the same name;
* this method is not recommended, since the mq_open syscall allows
* the user to specify different attributes. The file system can also
* be mounted multiple times at different mount points, but all
* mounts show the same contents.
*
* 2) Standard POSIX message queue API. The syscalls do not use the VFS
* layer but operate directly on internal data structures; this allows
* the user to use the IPC facility without having to mount the mqueue
* file system.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/limits.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/eventhandler.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/posix4.h>
#include <sys/poll.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sysproto.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <machine/atomic.h>
FEATURE(p1003_1b_mqueue, "POSIX P1003.1B message queues support");
/*
* Limits and constants
*/
#define MQFS_NAMELEN NAME_MAX
#define MQFS_DELEN (8 + MQFS_NAMELEN)
/* node types */
typedef enum {
mqfstype_none = 0,
mqfstype_root,
mqfstype_dir,
mqfstype_this,
mqfstype_parent,
mqfstype_file,
mqfstype_symlink,
} mqfs_type_t;
struct mqfs_node;
/*
* mqfs_info: describes a mqfs instance
*/
struct mqfs_info {
struct sx mi_lock;
struct mqfs_node *mi_root;
struct unrhdr *mi_unrhdr;
};
struct mqfs_vdata {
LIST_ENTRY(mqfs_vdata) mv_link;
struct mqfs_node *mv_node;
struct vnode *mv_vnode;
struct task mv_task;
};
/*
* mqfs_node: describes a node (file or directory) within a mqfs
*/
struct mqfs_node {
char mn_name[MQFS_NAMELEN+1];
struct mqfs_info *mn_info;
struct mqfs_node *mn_parent;
LIST_HEAD(,mqfs_node) mn_children;
LIST_ENTRY(mqfs_node) mn_sibling;
LIST_HEAD(,mqfs_vdata) mn_vnodes;
int mn_refcount;
mqfs_type_t mn_type;
int mn_deleted;
uint32_t mn_fileno;
void *mn_data;
struct timespec mn_birth;
struct timespec mn_ctime;
struct timespec mn_atime;
struct timespec mn_mtime;
uid_t mn_uid;
gid_t mn_gid;
int mn_mode;
};
#define VTON(vp) (((struct mqfs_vdata *)((vp)->v_data))->mv_node)
#define VTOMQ(vp) ((struct mqueue *)(VTON(vp)->mn_data))
#define VFSTOMQFS(m) ((struct mqfs_info *)((m)->mnt_data))
#define FPTOMQ(fp) ((struct mqueue *)(((struct mqfs_node *) \
(fp)->f_data)->mn_data))
TAILQ_HEAD(msgq, mqueue_msg);
struct mqueue;
struct mqueue_notifier {
LIST_ENTRY(mqueue_notifier) nt_link;
struct sigevent nt_sigev;
ksiginfo_t nt_ksi;
struct proc *nt_proc;
};
struct mqueue {
struct mtx mq_mutex;
int mq_flags;
long mq_maxmsg;
long mq_msgsize;
long mq_curmsgs;
long mq_totalbytes;
struct msgq mq_msgq;
int mq_receivers;
int mq_senders;
struct selinfo mq_rsel;
struct selinfo mq_wsel;
struct mqueue_notifier *mq_notifier;
};
#define MQ_RSEL 0x01
#define MQ_WSEL 0x02
struct mqueue_msg {
TAILQ_ENTRY(mqueue_msg) msg_link;
unsigned int msg_prio;
unsigned int msg_size;
/* following real data... */
};
SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
"POSIX real time message queue");
static int default_maxmsg = 10;
static int default_msgsize = 1024;
static int maxmsg = 100;
SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
&maxmsg, 0, "Default maximum messages in queue");
static int maxmsgsize = 16384;
SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
&maxmsgsize, 0, "Default maximum message size");
static int maxmq = 100;
SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
&maxmq, 0, "maximum message queues");
static int curmq = 0;
SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
&curmq, 0, "current message queue number");
static int unloadable = 0;
static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
static eventhandler_tag exit_tag;
/* Only one instance per-system */
static struct mqfs_info mqfs_data;
static uma_zone_t mqnode_zone;
static uma_zone_t mqueue_zone;
static uma_zone_t mvdata_zone;
static uma_zone_t mqnoti_zone;
static struct vop_vector mqfs_vnodeops;
static struct fileops mqueueops;
/*
* Directory structure construction and manipulation
*/
#ifdef notyet
static struct mqfs_node *mqfs_create_dir(struct mqfs_node *parent,
const char *name, int namelen, struct ucred *cred, int mode);
static struct mqfs_node *mqfs_create_link(struct mqfs_node *parent,
const char *name, int namelen, struct ucred *cred, int mode);
#endif
static struct mqfs_node *mqfs_create_file(struct mqfs_node *parent,
const char *name, int namelen, struct ucred *cred, int mode);
static int mqfs_destroy(struct mqfs_node *mn);
static void mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
static void mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
static int mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
/*
* Message queue construction and manipulation
*/
static struct mqueue *mqueue_alloc(const struct mq_attr *attr);
static void mqueue_free(struct mqueue *mq);
static int mqueue_send(struct mqueue *mq, const char *msg_ptr,
size_t msg_len, unsigned msg_prio, int waitok,
const struct timespec *abs_timeout);
static int mqueue_receive(struct mqueue *mq, char *msg_ptr,
size_t msg_len, unsigned *msg_prio, int waitok,
const struct timespec *abs_timeout);
static int _mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
int timo);
static int _mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
int timo);
static void mqueue_send_notification(struct mqueue *mq);
static void mqueue_fdclose(struct thread *td, int fd, struct file *fp);
static void mq_proc_exit(void *arg, struct proc *p);
/*
* kqueue filters
*/
static void filt_mqdetach(struct knote *kn);
static int filt_mqread(struct knote *kn, long hint);
static int filt_mqwrite(struct knote *kn, long hint);
struct filterops mq_rfiltops = {
.f_isfd = 1,
.f_detach = filt_mqdetach,
.f_event = filt_mqread,
};
struct filterops mq_wfiltops = {
.f_isfd = 1,
.f_detach = filt_mqdetach,
.f_event = filt_mqwrite,
};
/*
* Initialize fileno bitmap
*/
static void
mqfs_fileno_init(struct mqfs_info *mi)
{
struct unrhdr *up;
up = new_unrhdr(1, INT_MAX, NULL);
mi->mi_unrhdr = up;
}
/*
* Tear down fileno bitmap
*/
static void
mqfs_fileno_uninit(struct mqfs_info *mi)
{
struct unrhdr *up;
up = mi->mi_unrhdr;
mi->mi_unrhdr = NULL;
delete_unrhdr(up);
}
/*
* Allocate a file number
*/
static void
mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
{
/* make sure our parent has a file number */
if (mn->mn_parent && !mn->mn_parent->mn_fileno)
mqfs_fileno_alloc(mi, mn->mn_parent);
switch (mn->mn_type) {
case mqfstype_root:
case mqfstype_dir:
case mqfstype_file:
case mqfstype_symlink:
mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
break;
case mqfstype_this:
KASSERT(mn->mn_parent != NULL,
("mqfstype_this node has no parent"));
mn->mn_fileno = mn->mn_parent->mn_fileno;
break;
case mqfstype_parent:
KASSERT(mn->mn_parent != NULL,
("mqfstype_parent node has no parent"));
if (mn->mn_parent == mi->mi_root) {
mn->mn_fileno = mn->mn_parent->mn_fileno;
break;
}
KASSERT(mn->mn_parent->mn_parent != NULL,
("mqfstype_parent node has no grandparent"));
mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
break;
default:
KASSERT(0,
("mqfs_fileno_alloc() called for unknown type node: %d",
mn->mn_type));
break;
}
}
/*
* Release a file number
*/
static void
mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
{
switch (mn->mn_type) {
case mqfstype_root:
case mqfstype_dir:
case mqfstype_file:
case mqfstype_symlink:
free_unr(mi->mi_unrhdr, mn->mn_fileno);
break;
case mqfstype_this:
case mqfstype_parent:
/* ignore these, as they don't "own" their file number */
break;
default:
KASSERT(0,
("mqfs_fileno_free() called for unknown type node: %d",
mn->mn_type));
break;
}
}
static __inline struct mqfs_node *
mqnode_alloc(void)
{
return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
}
static __inline void
mqnode_free(struct mqfs_node *node)
{
uma_zfree(mqnode_zone, node);
}
static __inline void
mqnode_addref(struct mqfs_node *node)
{
atomic_fetchadd_int(&node->mn_refcount, 1);
}
static __inline void
mqnode_release(struct mqfs_node *node)
{
struct mqfs_info *mqfs;
int old, exp;
mqfs = node->mn_info;
old = atomic_fetchadd_int(&node->mn_refcount, -1);
if (node->mn_type == mqfstype_dir ||
node->mn_type == mqfstype_root)
exp = 3; /* include . and .. */
else
exp = 1;
if (old == exp) {
int locked = sx_xlocked(&mqfs->mi_lock);
if (!locked)
sx_xlock(&mqfs->mi_lock);
mqfs_destroy(node);
if (!locked)
sx_xunlock(&mqfs->mi_lock);
}
}
/*
* Add a node to a directory
*/
static int
mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
{
KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
KASSERT(parent->mn_info != NULL,
("%s(): parent has no mn_info", __func__));
KASSERT(parent->mn_type == mqfstype_dir ||
parent->mn_type == mqfstype_root,
("%s(): parent is not a directory", __func__));
node->mn_info = parent->mn_info;
node->mn_parent = parent;
LIST_INIT(&node->mn_children);
LIST_INIT(&node->mn_vnodes);
LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
mqnode_addref(parent);
return (0);
}
static struct mqfs_node *
mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
int nodetype)
{
struct mqfs_node *node;
node = mqnode_alloc();
strncpy(node->mn_name, name, namelen);
node->mn_type = nodetype;
node->mn_refcount = 1;
vfs_timestamp(&node->mn_birth);
node->mn_ctime = node->mn_atime = node->mn_mtime
= node->mn_birth;
node->mn_uid = cred->cr_uid;
node->mn_gid = cred->cr_gid;
node->mn_mode = mode;
return (node);
}
/*
* Create a file
*/
static struct mqfs_node *
mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
struct ucred *cred, int mode)
{
struct mqfs_node *node;
node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
if (mqfs_add_node(parent, node) != 0) {
mqnode_free(node);
return (NULL);
}
return (node);
}
/*
* Add . and .. to a directory
*/
static int
mqfs_fixup_dir(struct mqfs_node *parent)
{
struct mqfs_node *dir;
dir = mqnode_alloc();
dir->mn_name[0] = '.';
dir->mn_type = mqfstype_this;
dir->mn_refcount = 1;
if (mqfs_add_node(parent, dir) != 0) {
mqnode_free(dir);
return (-1);
}
dir = mqnode_alloc();
dir->mn_name[0] = dir->mn_name[1] = '.';
dir->mn_type = mqfstype_parent;
dir->mn_refcount = 1;
if (mqfs_add_node(parent, dir) != 0) {
mqnode_free(dir);
return (-1);
}
return (0);
}
#ifdef notyet
/*
* Create a directory
*/
static struct mqfs_node *
mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
struct ucred *cred, int mode)
{
struct mqfs_node *node;
node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
if (mqfs_add_node(parent, node) != 0) {
mqnode_free(node);
return (NULL);
}
if (mqfs_fixup_dir(node) != 0) {
mqfs_destroy(node);
return (NULL);
}
return (node);
}
/*
* Create a symlink
*/
static struct mqfs_node *
mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
struct ucred *cred, int mode)
{
struct mqfs_node *node;
node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
if (mqfs_add_node(parent, node) != 0) {
mqnode_free(node);
return (NULL);
}
return (node);
}
#endif
/*
* Destroy a node or a tree of nodes
*/
static int
mqfs_destroy(struct mqfs_node *node)
{
struct mqfs_node *parent;
KASSERT(node != NULL,
("%s(): node is NULL", __func__));
KASSERT(node->mn_info != NULL,
("%s(): node has no mn_info", __func__));
/* destroy children */
if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
while (! LIST_EMPTY(&node->mn_children))
mqfs_destroy(LIST_FIRST(&node->mn_children));
/* unlink from parent */
if ((parent = node->mn_parent) != NULL) {
KASSERT(parent->mn_info == node->mn_info,
("%s(): parent has different mn_info", __func__));
LIST_REMOVE(node, mn_sibling);
}
if (node->mn_fileno != 0)
mqfs_fileno_free(node->mn_info, node);
if (node->mn_data != NULL)
mqueue_free(node->mn_data);
mqnode_free(node);
return (0);
}
/*
* Mount a mqfs instance
*/
static int
mqfs_mount(struct mount *mp)
{
struct statfs *sbp;
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
mp->mnt_data = &mqfs_data;
MNT_ILOCK(mp);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_kern_flag |= MNTK_MPSAFE;
MNT_IUNLOCK(mp);
vfs_getnewfsid(mp);
sbp = &mp->mnt_stat;
vfs_mountedfrom(mp, "mqueue");
sbp->f_bsize = PAGE_SIZE;
sbp->f_iosize = PAGE_SIZE;
sbp->f_blocks = 1;
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_files = 1;
sbp->f_ffree = 0;
return (0);
}
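/*
 * mqueuefs is normally mounted with "mount -t mqueuefs null /mnt/mqueue".
 * A minimal userland sketch using nmount(2); the mount point and helper
 * name are illustrative:
 */
#if 0
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/uio.h>
#include <string.h>

static int
mount_mqueuefs(const char *where)
{
	struct iovec iov[6];

	iov[0].iov_base = __DECONST(char *, "fstype");
	iov[0].iov_len = sizeof("fstype");
	iov[1].iov_base = __DECONST(char *, "mqueuefs");
	iov[1].iov_len = sizeof("mqueuefs");
	iov[2].iov_base = __DECONST(char *, "fspath");
	iov[2].iov_len = sizeof("fspath");
	iov[3].iov_base = __DECONST(char *, where);
	iov[3].iov_len = strlen(where) + 1;
	iov[4].iov_base = __DECONST(char *, "from");
	iov[4].iov_len = sizeof("from");
	iov[5].iov_base = __DECONST(char *, "mqueue");
	iov[5].iov_len = sizeof("mqueue");
	return (nmount(iov, 6, 0));
}
#endif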
/*
* Unmount a mqfs instance
*/
static int
mqfs_unmount(struct mount *mp, int mntflags)
{
int error;
error = vflush(mp, 0, (mntflags & MNT_FORCE) ? FORCECLOSE : 0,
curthread);
return (error);
}
/*
* Return a root vnode
*/
static int
mqfs_root(struct mount *mp, int flags, struct vnode **vpp)
{
struct mqfs_info *mqfs;
int ret;
mqfs = VFSTOMQFS(mp);
ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
return (ret);
}
/*
* Return filesystem stats
*/
static int
mqfs_statfs(struct mount *mp, struct statfs *sbp)
{
/* XXX update statistics */
return (0);
}
/*
* Initialize a mqfs instance
*/
static int
mqfs_init(struct vfsconf *vfc)
{
struct mqfs_node *root;
struct mqfs_info *mi;
mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
mvdata_zone = uma_zcreate("mvdata",
sizeof(struct mqfs_vdata), NULL, NULL, NULL,
NULL, UMA_ALIGN_PTR, 0);
mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
mi = &mqfs_data;
sx_init(&mi->mi_lock, "mqfs lock");
/* set up the root directory */
root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
mqfstype_root);
root->mn_info = mi;
LIST_INIT(&root->mn_children);
LIST_INIT(&root->mn_vnodes);
mi->mi_root = root;
mqfs_fileno_init(mi);
mqfs_fileno_alloc(mi, root);
mqfs_fixup_dir(root);
exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
EVENTHANDLER_PRI_ANY);
mq_fdclose = mqueue_fdclose;
p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
return (0);
}
/*
* Destroy a mqfs instance
*/
static int
mqfs_uninit(struct vfsconf *vfc)
{
struct mqfs_info *mi;
if (!unloadable)
return (EOPNOTSUPP);
EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
mi = &mqfs_data;
mqfs_destroy(mi->mi_root);
mi->mi_root = NULL;
mqfs_fileno_uninit(mi);
sx_destroy(&mi->mi_lock);
uma_zdestroy(mqnode_zone);
uma_zdestroy(mqueue_zone);
uma_zdestroy(mvdata_zone);
uma_zdestroy(mqnoti_zone);
return (0);
}
/*
* task routine
*/
static void
do_recycle(void *context, int pending __unused)
{
struct vnode *vp = (struct vnode *)context;
vrecycle(vp, curthread);
vdrop(vp);
}
/*
* Allocate a vnode
*/
static int
mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
{
struct mqfs_vdata *vd;
struct mqfs_info *mqfs;
struct vnode *newvpp;
int error;
mqfs = pn->mn_info;
*vpp = NULL;
sx_xlock(&mqfs->mi_lock);
LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
if (vd->mv_vnode->v_mount == mp) {
vhold(vd->mv_vnode);
break;
}
}
if (vd != NULL) {
found:
*vpp = vd->mv_vnode;
sx_xunlock(&mqfs->mi_lock);
error = vget(*vpp, LK_RETRY | LK_EXCLUSIVE, curthread);
vdrop(*vpp);
return (error);
}
sx_xunlock(&mqfs->mi_lock);
error = getnewvnode("mqueue", mp, &mqfs_vnodeops, &newvpp);
if (error)
return (error);
vn_lock(newvpp, LK_EXCLUSIVE | LK_RETRY);
error = insmntque(newvpp, mp);
if (error != 0)
return (error);
sx_xlock(&mqfs->mi_lock);
/*
* Check if it has already been allocated
* while we were blocked.
*/
LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
if (vd->mv_vnode->v_mount == mp) {
vhold(vd->mv_vnode);
sx_xunlock(&mqfs->mi_lock);
vgone(newvpp);
vput(newvpp);
goto found;
}
}
*vpp = newvpp;
vd = uma_zalloc(mvdata_zone, M_WAITOK);
(*vpp)->v_data = vd;
vd->mv_vnode = *vpp;
vd->mv_node = pn;
TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
mqnode_addref(pn);
switch (pn->mn_type) {
case mqfstype_root:
(*vpp)->v_vflag = VV_ROOT;
/* fall through */
case mqfstype_dir:
case mqfstype_this:
case mqfstype_parent:
(*vpp)->v_type = VDIR;
break;
case mqfstype_file:
(*vpp)->v_type = VREG;
break;
case mqfstype_symlink:
(*vpp)->v_type = VLNK;
break;
case mqfstype_none:
KASSERT(0, ("mqfs_allocf called for null node\n"));
default:
panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
}
sx_xunlock(&mqfs->mi_lock);
return (0);
}
/*
* Search a directory entry
*/
static struct mqfs_node *
mqfs_search(struct mqfs_node *pd, const char *name, int len)
{
struct mqfs_node *pn;
sx_assert(&pd->mn_info->mi_lock, SX_LOCKED);
LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
if (strncmp(pn->mn_name, name, len) == 0 &&
pn->mn_name[len] == '\0')
return (pn);
}
return (NULL);
}
/*
* Look up a file or directory.
*/
static int
mqfs_lookupx(struct vop_cachedlookup_args *ap)
{
struct componentname *cnp;
struct vnode *dvp, **vpp;
struct mqfs_node *pd;
struct mqfs_node *pn;
struct mqfs_info *mqfs;
int nameiop, flags, error, namelen;
char *pname;
struct thread *td;
cnp = ap->a_cnp;
vpp = ap->a_vpp;
dvp = ap->a_dvp;
pname = cnp->cn_nameptr;
namelen = cnp->cn_namelen;
td = cnp->cn_thread;
flags = cnp->cn_flags;
nameiop = cnp->cn_nameiop;
pd = VTON(dvp);
pn = NULL;
mqfs = pd->mn_info;
*vpp = NULLVP;
if (dvp->v_type != VDIR)
return (ENOTDIR);
error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
if (error)
return (error);
/* shortcut: check if the name is too long */
if (cnp->cn_namelen >= MQFS_NAMELEN)
return (ENOENT);
/* self */
if (namelen == 1 && pname[0] == '.') {
if ((flags & ISLASTCN) && nameiop != LOOKUP)
return (EINVAL);
pn = pd;
*vpp = dvp;
VREF(dvp);
return (0);
}
/* parent */
if (cnp->cn_flags & ISDOTDOT) {
if (dvp->v_vflag & VV_ROOT)
return (EIO);
if ((flags & ISLASTCN) && nameiop != LOOKUP)
return (EINVAL);
VOP_UNLOCK(dvp, 0);
KASSERT(pd->mn_parent, ("non-root directory has no parent"));
pn = pd->mn_parent;
error = mqfs_allocv(dvp->v_mount, vpp, pn);
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
return (error);
}
/* named node */
sx_xlock(&mqfs->mi_lock);
pn = mqfs_search(pd, pname, namelen);
if (pn != NULL)
mqnode_addref(pn);
sx_xunlock(&mqfs->mi_lock);
/* found */
if (pn != NULL) {
/* DELETE */
if (nameiop == DELETE && (flags & ISLASTCN)) {
error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
if (error) {
mqnode_release(pn);
return (error);
}
if (*vpp == dvp) {
VREF(dvp);
*vpp = dvp;
mqnode_release(pn);
return (0);
}
}
/* allocate vnode */
error = mqfs_allocv(dvp->v_mount, vpp, pn);
mqnode_release(pn);
if (error == 0 && cnp->cn_flags & MAKEENTRY)
cache_enter(dvp, *vpp, cnp);
return (error);
}
/* not found */
/* will we create a new entry in the directory? */
if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
&& (flags & ISLASTCN)) {
error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
if (error)
return (error);
cnp->cn_flags |= SAVENAME;
return (EJUSTRETURN);
}
return (ENOENT);
}
#if 0
struct vop_lookup_args {
struct vop_generic_args a_gen;
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
};
#endif
/*
* vnode lookup operation
*/
static int
mqfs_lookup(struct vop_cachedlookup_args *ap)
{
int rc;
rc = mqfs_lookupx(ap);
return (rc);
}
#if 0
struct vop_create_args {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
struct vattr *a_vap;
};
#endif
/*
* vnode creation operation
*/
static int
mqfs_create(struct vop_create_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct componentname *cnp = ap->a_cnp;
struct mqfs_node *pd;
struct mqfs_node *pn;
struct mqueue *mq;
int error;
pd = VTON(ap->a_dvp);
if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
return (ENOTDIR);
mq = mqueue_alloc(NULL);
if (mq == NULL)
return (EAGAIN);
sx_xlock(&mqfs->mi_lock);
if ((cnp->cn_flags & HASBUF) == 0)
panic("%s: no name", __func__);
pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_cred, ap->a_vap->va_mode);
if (pn == NULL) {
sx_xunlock(&mqfs->mi_lock);
error = ENOSPC;
} else {
mqnode_addref(pn);
sx_xunlock(&mqfs->mi_lock);
error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
mqnode_release(pn);
if (error)
mqfs_destroy(pn);
else
pn->mn_data = mq;
}
if (error)
mqueue_free(mq);
return (error);
}
/*
* Remove an entry
*/
static int
do_unlink(struct mqfs_node *pn, struct ucred *ucred)
{
struct mqfs_node *parent;
struct mqfs_vdata *vd;
int error = 0;
sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
if (ucred->cr_uid != pn->mn_uid &&
(error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
error = EACCES;
else if (!pn->mn_deleted) {
parent = pn->mn_parent;
pn->mn_parent = NULL;
pn->mn_deleted = 1;
LIST_REMOVE(pn, mn_sibling);
LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
cache_purge(vd->mv_vnode);
vhold(vd->mv_vnode);
taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
}
mqnode_release(pn);
mqnode_release(parent);
} else
error = ENOENT;
return (error);
}
#if 0
struct vop_remove_args {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
};
#endif
/*
* vnode removal operation
*/
static int
mqfs_remove(struct vop_remove_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct mqfs_node *pn;
int error;
if (ap->a_vp->v_type == VDIR)
return (EPERM);
pn = VTON(ap->a_vp);
sx_xlock(&mqfs->mi_lock);
error = do_unlink(pn, ap->a_cnp->cn_cred);
sx_xunlock(&mqfs->mi_lock);
return (error);
}
#if 0
struct vop_inactive_args {
struct vnode *a_vp;
struct thread *a_td;
};
#endif
static int
mqfs_inactive(struct vop_inactive_args *ap)
{
struct mqfs_node *pn = VTON(ap->a_vp);
if (pn->mn_deleted)
vrecycle(ap->a_vp, ap->a_td);
return (0);
}
#if 0
struct vop_reclaim_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct thread *a_td;
};
#endif
static int
mqfs_reclaim(struct vop_reclaim_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
struct vnode *vp = ap->a_vp;
struct mqfs_node *pn;
struct mqfs_vdata *vd;
vd = vp->v_data;
pn = vd->mv_node;
sx_xlock(&mqfs->mi_lock);
vp->v_data = NULL;
LIST_REMOVE(vd, mv_link);
uma_zfree(mvdata_zone, vd);
mqnode_release(pn);
sx_xunlock(&mqfs->mi_lock);
return (0);
}
#if 0
struct vop_open_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
int a_mode;
struct ucred *a_cred;
struct thread *a_td;
struct file *a_fp;
};
#endif
static int
mqfs_open(struct vop_open_args *ap)
{
return (0);
}
#if 0
struct vop_close_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
int a_fflag;
struct ucred *a_cred;
struct thread *a_td;
};
#endif
static int
mqfs_close(struct vop_close_args *ap)
{
return (0);
}
#if 0
struct vop_access_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
accmode_t a_accmode;
struct ucred *a_cred;
struct thread *a_td;
};
#endif
/*
* Verify permissions
*/
static int
mqfs_access(struct vop_access_args *ap)
{
struct vnode *vp = ap->a_vp;
struct vattr vattr;
int error;
error = VOP_GETATTR(vp, &vattr, ap->a_cred);
if (error)
return (error);
error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
vattr.va_gid, ap->a_accmode, ap->a_cred, NULL);
return (error);
}
#if 0
struct vop_getattr_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
};
#endif
/*
* Get file attributes
*/
static int
mqfs_getattr(struct vop_getattr_args *ap)
{
struct vnode *vp = ap->a_vp;
struct mqfs_node *pn = VTON(vp);
struct vattr *vap = ap->a_vap;
int error = 0;
vap->va_type = vp->v_type;
vap->va_mode = pn->mn_mode;
vap->va_nlink = 1;
vap->va_uid = pn->mn_uid;
vap->va_gid = pn->mn_gid;
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
vap->va_fileid = pn->mn_fileno;
vap->va_size = 0;
vap->va_blocksize = PAGE_SIZE;
vap->va_bytes = vap->va_size = 0;
vap->va_atime = pn->mn_atime;
vap->va_mtime = pn->mn_mtime;
vap->va_ctime = pn->mn_ctime;
vap->va_birthtime = pn->mn_birth;
vap->va_gen = 0;
vap->va_flags = 0;
vap->va_rdev = NODEV;
vap->va_bytes = 0;
vap->va_filerev = 0;
return (error);
}
#if 0
struct vop_setattr_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct vattr *a_vap;
struct ucred *a_cred;
};
#endif
/*
* Set attributes
*/
static int
mqfs_setattr(struct vop_setattr_args *ap)
{
struct mqfs_node *pn;
struct vattr *vap;
struct vnode *vp;
struct thread *td;
int c, error;
uid_t uid;
gid_t gid;
td = curthread;
vap = ap->a_vap;
vp = ap->a_vp;
if ((vap->va_type != VNON) ||
(vap->va_nlink != VNOVAL) ||
(vap->va_fsid != VNOVAL) ||
(vap->va_fileid != VNOVAL) ||
(vap->va_blocksize != VNOVAL) ||
(vap->va_flags != VNOVAL && vap->va_flags != 0) ||
(vap->va_rdev != VNOVAL) ||
((int)vap->va_bytes != VNOVAL) ||
(vap->va_gen != VNOVAL)) {
return (EINVAL);
}
pn = VTON(vp);
error = c = 0;
if (vap->va_uid == (uid_t)VNOVAL)
uid = pn->mn_uid;
else
uid = vap->va_uid;
if (vap->va_gid == (gid_t)VNOVAL)
gid = pn->mn_gid;
else
gid = vap->va_gid;
if (uid != pn->mn_uid || gid != pn->mn_gid) {
/*
* To modify the ownership of a file, the caller must possess
* VADMIN for that file.
*/
if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)))
return (error);
/*
* XXXRW: Why is there a privilege check here: shouldn't the
* check in VOP_ACCESS() be enough? Also, are the group bits
* below definitely right?
*/
if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
(gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
(error = priv_check(td, PRIV_MQ_ADMIN)) != 0)
return (error);
pn->mn_uid = uid;
pn->mn_gid = gid;
c = 1;
}
if (vap->va_mode != (mode_t)VNOVAL) {
if ((ap->a_cred->cr_uid != pn->mn_uid) &&
(error = priv_check(td, PRIV_MQ_ADMIN)))
return (error);
pn->mn_mode = vap->va_mode;
c = 1;
}
if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
/* See the comment in ufs_vnops::ufs_setattr(). */
if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, td)) &&
((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
(error = VOP_ACCESS(vp, VWRITE, ap->a_cred, td))))
return (error);
if (vap->va_atime.tv_sec != VNOVAL) {
pn->mn_atime = vap->va_atime;
}
if (vap->va_mtime.tv_sec != VNOVAL) {
pn->mn_mtime = vap->va_mtime;
}
c = 1;
}
if (c) {
vfs_timestamp(&pn->mn_ctime);
}
return (0);
}
#if 0
struct vop_read_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
struct ucred *a_cred;
};
#endif
/*
* Read from a file
*/
static int
mqfs_read(struct vop_read_args *ap)
{
char buf[80];
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct mqfs_node *pn;
struct mqueue *mq;
int len, error;
if (vp->v_type != VREG)
return (EINVAL);
pn = VTON(vp);
mq = VTOMQ(vp);
snprintf(buf, sizeof(buf),
"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
mq->mq_totalbytes,
mq->mq_maxmsg,
mq->mq_curmsgs,
mq->mq_msgsize);
buf[sizeof(buf)-1] = '\0';
len = strlen(buf);
error = uiomove_frombuf(buf, len, uio);
return (error);
}
#if 0
struct vop_readdir_args {
struct vop_generic_args a_gen;
struct vnode *a_vp;
struct uio *a_uio;
struct ucred *a_cred;
int *a_eofflag;
int *a_ncookies;
u_long **a_cookies;
};
#endif
/*
* Return directory entries.
*/
static int
mqfs_readdir(struct vop_readdir_args *ap)
{
struct vnode *vp;
struct mqfs_info *mi;
struct mqfs_node *pd;
struct mqfs_node *pn;
struct dirent entry;
struct uio *uio;
int *tmp_ncookies = NULL;
off_t offset;
int error, i;
vp = ap->a_vp;
mi = VFSTOMQFS(vp->v_mount);
pd = VTON(vp);
uio = ap->a_uio;
if (vp->v_type != VDIR)
return (ENOTDIR);
if (uio->uio_offset < 0)
return (EINVAL);
if (ap->a_ncookies != NULL) {
tmp_ncookies = ap->a_ncookies;
*ap->a_ncookies = 0;
ap->a_ncookies = NULL;
}
error = 0;
offset = 0;
sx_xlock(&mi->mi_lock);
LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
entry.d_reclen = sizeof(entry);
if (!pn->mn_fileno)
mqfs_fileno_alloc(mi, pn);
entry.d_fileno = pn->mn_fileno;
for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
entry.d_name[i] = pn->mn_name[i];
entry.d_name[i] = 0;
entry.d_namlen = i;
switch (pn->mn_type) {
case mqfstype_root:
case mqfstype_dir:
case mqfstype_this:
case mqfstype_parent:
entry.d_type = DT_DIR;
break;
case mqfstype_file:
entry.d_type = DT_REG;
break;
case mqfstype_symlink:
entry.d_type = DT_LNK;
break;
default:
panic("%s has unexpected node type: %d", pn->mn_name,
pn->mn_type);
}
if (entry.d_reclen > uio->uio_resid)
break;
if (offset >= uio->uio_offset) {
error = vfs_read_dirent(ap, &entry, offset);
if (error)
break;
}
offset += entry.d_reclen;
}
sx_xunlock(&mi->mi_lock);
uio->uio_offset = offset;
if (tmp_ncookies != NULL)
ap->a_ncookies = tmp_ncookies;
return (error);
}
#ifdef notyet
#if 0
struct vop_mkdir_args {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
struct vattr *a_vap;
};
#endif
/*
* Create a directory.
*/
static int
mqfs_mkdir(struct vop_mkdir_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct componentname *cnp = ap->a_cnp;
struct mqfs_node *pd = VTON(ap->a_dvp);
struct mqfs_node *pn;
int error;
if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
return (ENOTDIR);
sx_xlock(&mqfs->mi_lock);
if ((cnp->cn_flags & HASBUF) == 0)
panic("%s: no name", __func__);
pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_cred, ap->a_vap->va_mode);
if (pn != NULL)
mqnode_addref(pn);
sx_xunlock(&mqfs->mi_lock);
if (pn == NULL) {
error = ENOSPC;
} else {
error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
mqnode_release(pn);
}
return (error);
}
#if 0
struct vop_rmdir_args {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
};
#endif
/*
* Remove a directory.
*/
static int
mqfs_rmdir(struct vop_rmdir_args *ap)
{
struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
struct mqfs_node *pn = VTON(ap->a_vp);
struct mqfs_node *pt;
if (pn->mn_type != mqfstype_dir)
return (ENOTDIR);
sx_xlock(&mqfs->mi_lock);
if (pn->mn_deleted) {
sx_xunlock(&mqfs->mi_lock);
return (ENOENT);
}
pt = LIST_FIRST(&pn->mn_children);
pt = LIST_NEXT(pt, mn_sibling);
pt = LIST_NEXT(pt, mn_sibling);
if (pt != NULL) {
sx_xunlock(&mqfs->mi_lock);
return (ENOTEMPTY);
}
pt = pn->mn_parent;
pn->mn_parent = NULL;
pn->mn_deleted = 1;
LIST_REMOVE(pn, mn_sibling);
mqnode_release(pn);
mqnode_release(pt);
sx_xunlock(&mqfs->mi_lock);
cache_purge(ap->a_vp);
return (0);
}
#endif /* notyet */
/*
* Allocate a message queue
*/
static struct mqueue *
mqueue_alloc(const struct mq_attr *attr)
{
struct mqueue *mq;
if (curmq >= maxmq)
return (NULL);
mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
TAILQ_INIT(&mq->mq_msgq);
if (attr != NULL) {
mq->mq_maxmsg = attr->mq_maxmsg;
mq->mq_msgsize = attr->mq_msgsize;
} else {
mq->mq_maxmsg = default_maxmsg;
mq->mq_msgsize = default_msgsize;
}
mtx_init(&mq->mq_mutex, "mqueue lock", NULL, MTX_DEF);
knlist_init_mtx(&mq->mq_rsel.si_note, &mq->mq_mutex);
knlist_init_mtx(&mq->mq_wsel.si_note, &mq->mq_mutex);
atomic_add_int(&curmq, 1);
return (mq);
}
/*
* Destroy a message queue
*/
static void
mqueue_free(struct mqueue *mq)
{
struct mqueue_msg *msg;
while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
free(msg, M_MQUEUEDATA);
}
mtx_destroy(&mq->mq_mutex);
seldrain(&mq->mq_rsel);
seldrain(&mq->mq_wsel);
knlist_destroy(&mq->mq_rsel.si_note);
knlist_destroy(&mq->mq_wsel.si_note);
uma_zfree(mqueue_zone, mq);
atomic_add_int(&curmq, -1);
}
/*
* Load a message from user space
*/
static struct mqueue_msg *
mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
{
struct mqueue_msg *msg;
size_t len;
int error;
len = sizeof(struct mqueue_msg) + msg_size;
msg = malloc(len, M_MQUEUEDATA, M_WAITOK);
error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
msg_size);
if (error) {
free(msg, M_MQUEUEDATA);
msg = NULL;
} else {
msg->msg_size = msg_size;
msg->msg_prio = msg_prio;
}
return (msg);
}
/*
* Save a message to user space
*/
static int
mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
{
int error;
error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
msg->msg_size);
if (error == 0 && msg_prio != NULL)
error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
return (error);
}
/*
* Free a message's memory
*/
static __inline void
mqueue_freemsg(struct mqueue_msg *msg)
{
free(msg, M_MQUEUEDATA);
}
/*
* Send a message.  If waitok is false, the thread will not block
* when the queue is full; otherwise the absolute timeout, if any,
* is checked.
*/
int
mqueue_send(struct mqueue *mq, const char *msg_ptr,
size_t msg_len, unsigned msg_prio, int waitok,
const struct timespec *abs_timeout)
{
struct mqueue_msg *msg;
struct timespec ts, ts2;
struct timeval tv;
int error;
if (msg_prio >= MQ_PRIO_MAX)
return (EINVAL);
if (msg_len > mq->mq_msgsize)
return (EMSGSIZE);
msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
if (msg == NULL)
return (EFAULT);
/* O_NONBLOCK case */
if (!waitok) {
error = _mqueue_send(mq, msg, -1);
if (error)
goto bad;
return (0);
}
/* we allow a null timeout (wait forever) */
if (abs_timeout == NULL) {
error = _mqueue_send(mq, msg, 0);
if (error)
goto bad;
return (0);
}
/* send it before checking time */
error = _mqueue_send(mq, msg, -1);
if (error == 0)
return (0);
if (error != EAGAIN)
goto bad;
if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
error = EINVAL;
goto bad;
}
for (;;) {
ts2 = *abs_timeout;
getnanotime(&ts);
timespecsub(&ts2, &ts);
if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
error = ETIMEDOUT;
break;
}
TIMESPEC_TO_TIMEVAL(&tv, &ts2);
error = _mqueue_send(mq, msg, tvtohz(&tv));
if (error != ETIMEDOUT)
break;
}
if (error == 0)
return (0);
bad:
mqueue_freemsg(msg);
return (error);
}
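/*
 * From userland this path is reached through mq_send(3)/mq_timedsend(3).
 * A minimal sketch of a bounded-wait send; note the timeout is an
 * absolute wall-clock time (compare getnanotime() above).  The helper
 * name and five-second deadline are illustrative:
 */
#if 0
#include <mqueue.h>
#include <time.h>

static int
send_with_deadline(mqd_t mqd, const char *buf, size_t len, unsigned prio)
{
	struct timespec abstime;

	clock_gettime(CLOCK_REALTIME, &abstime);
	abstime.tv_sec += 5;		/* give up after about five seconds */
	return (mq_timedsend(mqd, buf, len, prio, &abstime));
}
#endif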
/*
* Common routine to send a message
*/
static int
_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
{
struct mqueue_msg *msg2;
int error = 0;
mtx_lock(&mq->mq_mutex);
while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
if (timo < 0) {
mtx_unlock(&mq->mq_mutex);
return (EAGAIN);
}
mq->mq_senders++;
error = msleep(&mq->mq_senders, &mq->mq_mutex,
PCATCH, "mqsend", timo);
mq->mq_senders--;
if (error == EAGAIN)
error = ETIMEDOUT;
}
if (mq->mq_curmsgs >= mq->mq_maxmsg) {
mtx_unlock(&mq->mq_mutex);
return (error);
}
error = 0;
if (TAILQ_EMPTY(&mq->mq_msgq)) {
TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
} else {
if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
} else {
TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
if (msg2->msg_prio < msg->msg_prio)
break;
}
TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
}
}
mq->mq_curmsgs++;
mq->mq_totalbytes += msg->msg_size;
if (mq->mq_receivers)
wakeup_one(&mq->mq_receivers);
else if (mq->mq_notifier != NULL)
mqueue_send_notification(mq);
if (mq->mq_flags & MQ_RSEL) {
mq->mq_flags &= ~MQ_RSEL;
selwakeup(&mq->mq_rsel);
}
KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
mtx_unlock(&mq->mq_mutex);
return (0);
}
/*
* Send a realtime signal to the process which successfully
* registered itself via mq_notify().
*/
static void
mqueue_send_notification(struct mqueue *mq)
{
struct mqueue_notifier *nt;
struct thread *td;
struct proc *p;
int error;
mtx_assert(&mq->mq_mutex, MA_OWNED);
nt = mq->mq_notifier;
if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
p = nt->nt_proc;
error = sigev_findtd(p, &nt->nt_sigev, &td);
if (error) {
mq->mq_notifier = NULL;
return;
}
if (!KSI_ONQ(&nt->nt_ksi)) {
ksiginfo_set_sigev(&nt->nt_ksi, &nt->nt_sigev);
tdsendsignal(p, td, nt->nt_ksi.ksi_signo, &nt->nt_ksi);
}
PROC_UNLOCK(p);
}
mq->mq_notifier = NULL;
}
/*
* Receive a message.  If waitok is false, the thread will not block
* when the queue is empty; otherwise the absolute timeout, if any,
* is checked.
*/
int
mqueue_receive(struct mqueue *mq, char *msg_ptr,
size_t msg_len, unsigned *msg_prio, int waitok,
const struct timespec *abs_timeout)
{
struct mqueue_msg *msg;
struct timespec ts, ts2;
struct timeval tv;
int error;
if (msg_len < mq->mq_msgsize)
return (EMSGSIZE);
/* O_NONBLOCK case */
if (!waitok) {
error = _mqueue_recv(mq, &msg, -1);
if (error)
return (error);
goto received;
}
/* we allow a null timeout (wait forever). */
if (abs_timeout == NULL) {
error = _mqueue_recv(mq, &msg, 0);
if (error)
return (error);
goto received;
}
/* try to get a message before checking time */
error = _mqueue_recv(mq, &msg, -1);
if (error == 0)
goto received;
if (error != EAGAIN)
return (error);
if (abs_timeout->tv_nsec >= 1000000000 || abs_timeout->tv_nsec < 0) {
error = EINVAL;
return (error);
}
for (;;) {
ts2 = *abs_timeout;
getnanotime(&ts);
timespecsub(&ts2, &ts);
if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
error = ETIMEDOUT;
return (error);
}
TIMESPEC_TO_TIMEVAL(&tv, &ts2);
error = _mqueue_recv(mq, &msg, tvtohz(&tv));
if (error == 0)
break;
if (error != ETIMEDOUT)
return (error);
}
received:
error = mqueue_savemsg(msg, msg_ptr, msg_prio);
if (error == 0) {
curthread->td_retval[0] = msg->msg_size;
curthread->td_retval[1] = 0;
}
mqueue_freemsg(msg);
return (error);
}
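/*
 * With O_NONBLOCK set an empty queue fails with EAGAIN, and the buffer
 * must be at least mq_msgsize bytes or EMSGSIZE is returned.  A minimal
 * userland sketch (helper name is illustrative):
 */
#if 0
#include <mqueue.h>
#include <errno.h>

static ssize_t
try_receive(mqd_t mqd, char *buf, size_t buflen, unsigned *prio)
{
	ssize_t n;

	n = mq_receive(mqd, buf, buflen, prio);
	if (n == -1 && errno == EAGAIN)
		return (0);		/* nothing queued right now */
	return (n);
}
#endif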
/*
* Common routine to receive a message
*/
static int
_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
{
int error = 0;
mtx_lock(&mq->mq_mutex);
while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
if (timo < 0) {
mtx_unlock(&mq->mq_mutex);
return (EAGAIN);
}
mq->mq_receivers++;
error = msleep(&mq->mq_receivers, &mq->mq_mutex,
PCATCH, "mqrecv", timo);
mq->mq_receivers--;
if (error == EAGAIN)
error = ETIMEDOUT;
}
if (*msg != NULL) {
error = 0;
TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
mq->mq_curmsgs--;
mq->mq_totalbytes -= (*msg)->msg_size;
if (mq->mq_senders)
wakeup_one(&mq->mq_senders);
if (mq->mq_flags & MQ_WSEL) {
mq->mq_flags &= ~MQ_WSEL;
selwakeup(&mq->mq_wsel);
}
KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
}
if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
!TAILQ_EMPTY(&mq->mq_msgq)) {
mqueue_send_notification(mq);
}
mtx_unlock(&mq->mq_mutex);
return (error);
}
static __inline struct mqueue_notifier *
notifier_alloc(void)
{
return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
}
static __inline void
notifier_free(struct mqueue_notifier *p)
{
uma_zfree(mqnoti_zone, p);
}
static struct mqueue_notifier *
notifier_search(struct proc *p, int fd)
{
struct mqueue_notifier *nt;
LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
if (nt->nt_ksi.ksi_mqd == fd)
break;
}
return (nt);
}
static __inline void
notifier_insert(struct proc *p, struct mqueue_notifier *nt)
{
LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
}
static __inline void
notifier_delete(struct proc *p, struct mqueue_notifier *nt)
{
LIST_REMOVE(nt, nt_link);
notifier_free(nt);
}
static void
notifier_remove(struct proc *p, struct mqueue *mq, int fd)
{
struct mqueue_notifier *nt;
mtx_assert(&mq->mq_mutex, MA_OWNED);
PROC_LOCK(p);
nt = notifier_search(p, fd);
if (nt != NULL) {
if (mq->mq_notifier == nt)
mq->mq_notifier = NULL;
sigqueue_take(&nt->nt_ksi);
notifier_delete(p, nt);
}
PROC_UNLOCK(p);
}
static int
kern_kmq_open(struct thread *td, const char *upath, int flags, mode_t mode,
const struct mq_attr *attr)
{
char path[MQFS_NAMELEN + 1];
struct mqfs_node *pn;
struct filedesc *fdp;
struct file *fp;
struct mqueue *mq;
int fd, error, len, cmode;
fdp = td->td_proc->p_fd;
cmode = (((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
mq = NULL;
if ((flags & O_CREAT) != 0 && attr != NULL) {
if (attr->mq_maxmsg <= 0 || attr->mq_maxmsg > maxmsg)
return (EINVAL);
if (attr->mq_msgsize <= 0 || attr->mq_msgsize > maxmsgsize)
return (EINVAL);
}
error = copyinstr(upath, path, MQFS_NAMELEN + 1, NULL);
if (error)
return (error);
/*
* The first character of name must be a slash (/) character
* and the remaining characters of name cannot include any slash
* characters.
*/
len = strlen(path);
if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL)
return (EINVAL);
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
sx_xlock(&mqfs_data.mi_lock);
pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
if (pn == NULL) {
if (!(flags & O_CREAT)) {
error = ENOENT;
} else {
mq = mqueue_alloc(attr);
if (mq == NULL) {
error = ENFILE;
} else {
pn = mqfs_create_file(mqfs_data.mi_root,
path + 1, len - 1, td->td_ucred,
cmode);
if (pn == NULL) {
error = ENOSPC;
mqueue_free(mq);
}
}
}
if (error == 0) {
pn->mn_data = mq;
}
} else {
if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
error = EEXIST;
} else {
accmode_t accmode = 0;
if (flags & FREAD)
accmode |= VREAD;
if (flags & FWRITE)
accmode |= VWRITE;
error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
pn->mn_gid, accmode, td->td_ucred, NULL);
}
}
if (error) {
sx_xunlock(&mqfs_data.mi_lock);
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
mqnode_addref(pn);
sx_xunlock(&mqfs_data.mi_lock);
finit(fp, flags & (FREAD | FWRITE | O_NONBLOCK), DTYPE_MQUEUE, pn,
&mqueueops);
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[fd] == fp)
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
td->td_retval[0] = fd;
fdrop(fp, td);
return (0);
}
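/*
 * The userland entry point is mq_open(3).  A minimal sketch of creating
 * a queue within the limits enforced above; the name must begin with a
 * slash and contain no other slashes.  The queue name and sizes below
 * are illustrative:
 */
#if 0
#include <mqueue.h>
#include <fcntl.h>
#include <string.h>

static mqd_t
open_queue(void)
{
	struct mq_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.mq_maxmsg = 32;		/* <= kern.mqueue.maxmsg */
	attr.mq_msgsize = 1024;		/* <= kern.mqueue.maxmsgsize */
	return (mq_open("/myqueue", O_CREAT | O_RDWR, 0644, &attr));
}
#endif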
/*
* Syscall to open a message queue.
*/
int
-kmq_open(struct thread *td, struct kmq_open_args *uap)
+sys_kmq_open(struct thread *td, struct kmq_open_args *uap)
{
struct mq_attr attr;
int flags, error;
if ((uap->flags & O_ACCMODE) == O_ACCMODE)
return (EINVAL);
flags = FFLAGS(uap->flags);
if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
error = copyin(uap->attr, &attr, sizeof(attr));
if (error)
return (error);
}
return (kern_kmq_open(td, uap->path, flags, uap->mode,
uap->attr != NULL ? &attr : NULL));
}
/*
* Syscall to unlink a message queue.
*/
int
-kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
+sys_kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
{
char path[MQFS_NAMELEN+1];
struct mqfs_node *pn;
int error, len;
error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
if (error)
return (error);
len = strlen(path);
if (len < 2 || path[0] != '/' || index(path + 1, '/') != NULL)
return (EINVAL);
sx_xlock(&mqfs_data.mi_lock);
pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
if (pn != NULL)
error = do_unlink(pn, td->td_ucred);
else
error = ENOENT;
sx_xunlock(&mqfs_data.mi_lock);
return (error);
}
typedef int (*_fgetf)(struct thread *, int, cap_rights_t, struct file **);
/*
* Get the message queue referenced by a file descriptor
*/
static int
_getmq(struct thread *td, int fd, cap_rights_t rights, _fgetf func,
struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
{
struct mqfs_node *pn;
int error;
error = func(td, fd, rights, fpp);
if (error)
return (error);
if (&mqueueops != (*fpp)->f_ops) {
fdrop(*fpp, td);
return (EBADF);
}
pn = (*fpp)->f_data;
if (ppn)
*ppn = pn;
if (pmq)
*pmq = pn->mn_data;
return (0);
}
static __inline int
getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
struct mqueue **pmq)
{
return _getmq(td, fd, CAP_POLL_EVENT, fget, fpp, ppn, pmq);
}
static __inline int
getmq_read(struct thread *td, int fd, struct file **fpp,
struct mqfs_node **ppn, struct mqueue **pmq)
{
return _getmq(td, fd, CAP_READ, fget_read, fpp, ppn, pmq);
}
static __inline int
getmq_write(struct thread *td, int fd, struct file **fpp,
struct mqfs_node **ppn, struct mqueue **pmq)
{
return _getmq(td, fd, CAP_WRITE, fget_write, fpp, ppn, pmq);
}
static int
kern_kmq_setattr(struct thread *td, int mqd, const struct mq_attr *attr,
struct mq_attr *oattr)
{
struct mqueue *mq;
struct file *fp;
u_int oflag, flag;
int error;
if (attr != NULL && (attr->mq_flags & ~O_NONBLOCK) != 0)
return (EINVAL);
error = getmq(td, mqd, &fp, NULL, &mq);
if (error)
return (error);
oattr->mq_maxmsg = mq->mq_maxmsg;
oattr->mq_msgsize = mq->mq_msgsize;
oattr->mq_curmsgs = mq->mq_curmsgs;
if (attr != NULL) {
do {
oflag = flag = fp->f_flag;
flag &= ~O_NONBLOCK;
flag |= (attr->mq_flags & O_NONBLOCK);
} while (atomic_cmpset_int(&fp->f_flag, oflag, flag) == 0);
} else
oflag = fp->f_flag;
oattr->mq_flags = (O_NONBLOCK & oflag);
fdrop(fp, td);
return (error);
}
int
-kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
+sys_kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
{
struct mq_attr attr, oattr;
int error;
if (uap->attr != NULL) {
error = copyin(uap->attr, &attr, sizeof(attr));
if (error != 0)
return (error);
}
error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
&oattr);
if (error != 0)
return (error);
if (uap->oattr != NULL)
error = copyout(&oattr, uap->oattr, sizeof(oattr));
return (error);
}
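/*
 * Only O_NONBLOCK may be changed through this interface; the other
 * mq_attr fields are reported back but ignored on set.  A minimal
 * userland sketch (helper name is illustrative):
 */
#if 0
#include <mqueue.h>
#include <fcntl.h>
#include <string.h>

static int
set_nonblocking(mqd_t mqd, int on)
{
	struct mq_attr attr, oattr;

	memset(&attr, 0, sizeof(attr));
	attr.mq_flags = on ? O_NONBLOCK : 0;
	return (mq_setattr(mqd, &attr, &oattr));
}
#endif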
int
-kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
+sys_kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec *abs_timeout, ets;
int error;
int waitok;
error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets, sizeof(ets));
if (error != 0)
return (error);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
int
-kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
+sys_kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec *abs_timeout, ets;
int error, waitok;
error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets, sizeof(ets));
if (error != 0)
return (error);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
int
-kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
{
struct sigevent ev;
struct filedesc *fdp;
struct proc *p;
struct mqueue *mq;
struct file *fp, *fp2;
struct mqueue_notifier *nt, *newnt = NULL;
int error;
p = td->td_proc;
fdp = td->td_proc->p_fd;
if (uap->sigev) {
error = copyin(uap->sigev, &ev, sizeof(ev));
if (error)
return (error);
if (ev.sigev_notify != SIGEV_SIGNAL &&
ev.sigev_notify != SIGEV_THREAD_ID &&
ev.sigev_notify != SIGEV_NONE)
return (EINVAL);
if ((ev.sigev_notify == SIGEV_SIGNAL ||
ev.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(ev.sigev_signo))
return (EINVAL);
}
error = getmq(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
again:
FILEDESC_SLOCK(fdp);
fp2 = fget_locked(fdp, uap->mqd);
if (fp2 == NULL) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
goto out;
}
error = cap_funwrap(fp2, CAP_POLL_EVENT, &fp2);
if (error) {
FILEDESC_SUNLOCK(fdp);
goto out;
}
if (fp2 != fp) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
goto out;
}
mtx_lock(&mq->mq_mutex);
FILEDESC_SUNLOCK(fdp);
if (uap->sigev != NULL) {
if (mq->mq_notifier != NULL) {
error = EBUSY;
} else {
PROC_LOCK(p);
nt = notifier_search(p, uap->mqd);
if (nt == NULL) {
if (newnt == NULL) {
PROC_UNLOCK(p);
mtx_unlock(&mq->mq_mutex);
newnt = notifier_alloc();
goto again;
}
}
if (nt != NULL) {
sigqueue_take(&nt->nt_ksi);
if (newnt != NULL) {
notifier_free(newnt);
newnt = NULL;
}
} else {
nt = newnt;
newnt = NULL;
ksiginfo_init(&nt->nt_ksi);
nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
nt->nt_ksi.ksi_code = SI_MESGQ;
nt->nt_proc = p;
nt->nt_ksi.ksi_mqd = uap->mqd;
notifier_insert(p, nt);
}
nt->nt_sigev = ev;
mq->mq_notifier = nt;
PROC_UNLOCK(p);
/*
* If there are no receivers and the message queue is
* not empty, we should send the notification as soon
* as possible.
*/
if (mq->mq_receivers == 0 &&
!TAILQ_EMPTY(&mq->mq_msgq))
mqueue_send_notification(mq);
}
} else {
notifier_remove(p, mq, uap->mqd);
}
mtx_unlock(&mq->mq_mutex);
out:
fdrop(fp, td);
if (newnt != NULL)
notifier_free(newnt);
return (error);
}
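/*
 * A minimal userland sketch of registering for notification via
 * mq_notify(3); SIGUSR1 and the helper name are illustrative.  The
 * registration is one-shot and is consumed when the notification fires.
 */
#if 0
#include <mqueue.h>
#include <signal.h>
#include <string.h>

static int
arm_notification(mqd_t mqd)
{
	struct sigevent sev;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGUSR1;
	return (mq_notify(mqd, &sev));
}
#endif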
static void
mqueue_fdclose(struct thread *td, int fd, struct file *fp)
{
struct filedesc *fdp;
struct mqueue *mq;
fdp = td->td_proc->p_fd;
FILEDESC_LOCK_ASSERT(fdp);
if (fp->f_ops == &mqueueops) {
mq = FPTOMQ(fp);
mtx_lock(&mq->mq_mutex);
notifier_remove(td->td_proc, mq, fd);
/* have to wake up threads in the same process */
if (mq->mq_flags & MQ_RSEL) {
mq->mq_flags &= ~MQ_RSEL;
selwakeup(&mq->mq_rsel);
}
if (mq->mq_flags & MQ_WSEL) {
mq->mq_flags &= ~MQ_WSEL;
selwakeup(&mq->mq_wsel);
}
mtx_unlock(&mq->mq_mutex);
}
}
static void
mq_proc_exit(void *arg __unused, struct proc *p)
{
struct filedesc *fdp;
struct file *fp;
struct mqueue *mq;
int i;
fdp = p->p_fd;
FILEDESC_SLOCK(fdp);
for (i = 0; i < fdp->fd_nfiles; ++i) {
fp = fget_locked(fdp, i);
if (fp != NULL && fp->f_ops == &mqueueops) {
mq = FPTOMQ(fp);
mtx_lock(&mq->mq_mutex);
notifier_remove(p, FPTOMQ(fp), i);
mtx_unlock(&mq->mq_mutex);
}
}
FILEDESC_SUNLOCK(fdp);
KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
}
static int
mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
mqf_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
static int
mqf_ioctl(struct file *fp, u_long cmd, void *data,
struct ucred *active_cred, struct thread *td)
{
return (ENOTTY);
}
static int
mqf_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
struct mqueue *mq = FPTOMQ(fp);
int revents = 0;
mtx_lock(&mq->mq_mutex);
if (events & (POLLIN | POLLRDNORM)) {
if (mq->mq_curmsgs) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
mq->mq_flags |= MQ_RSEL;
selrecord(td, &mq->mq_rsel);
}
}
if (events & POLLOUT) {
if (mq->mq_curmsgs < mq->mq_maxmsg)
revents |= POLLOUT;
else {
mq->mq_flags |= MQ_WSEL;
selrecord(td, &mq->mq_wsel);
}
}
mtx_unlock(&mq->mq_mutex);
return (revents);
}
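/*
 * In this implementation the message queue descriptor is an ordinary
 * file descriptor, so poll(2) can wait on it directly.  A minimal
 * sketch, assuming mqfd is the descriptor returned by kmq_open(2);
 * the helper name is illustrative and portable code should not rely
 * on this behaviour:
 */
#if 0
#include <poll.h>

static int
wait_readable(int mqfd, int timeout_ms)
{
	struct pollfd pfd;

	pfd.fd = mqfd;
	pfd.events = POLLIN | POLLRDNORM;
	pfd.revents = 0;
	return (poll(&pfd, 1, timeout_ms));
}
#endif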
static int
mqf_close(struct file *fp, struct thread *td)
{
struct mqfs_node *pn;
fp->f_ops = &badfileops;
pn = fp->f_data;
fp->f_data = NULL;
sx_xlock(&mqfs_data.mi_lock);
mqnode_release(pn);
sx_xunlock(&mqfs_data.mi_lock);
return (0);
}
static int
mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
struct thread *td)
{
struct mqfs_node *pn = fp->f_data;
bzero(st, sizeof *st);
sx_xlock(&mqfs_data.mi_lock);
st->st_atim = pn->mn_atime;
st->st_mtim = pn->mn_mtime;
st->st_ctim = pn->mn_ctime;
st->st_birthtim = pn->mn_birth;
st->st_uid = pn->mn_uid;
st->st_gid = pn->mn_gid;
st->st_mode = S_IFIFO | pn->mn_mode;
sx_xunlock(&mqfs_data.mi_lock);
return (0);
}
static int
mqf_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct mqfs_node *pn;
int error;
error = 0;
pn = fp->f_data;
sx_xlock(&mqfs_data.mi_lock);
error = vaccess(VREG, pn->mn_mode, pn->mn_uid, pn->mn_gid, VADMIN,
active_cred, NULL);
if (error != 0)
goto out;
pn->mn_mode = mode & ACCESSPERMS;
out:
sx_xunlock(&mqfs_data.mi_lock);
return (error);
}
static int
mqf_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct mqfs_node *pn;
int error;
error = 0;
pn = fp->f_data;
sx_xlock(&mqfs_data.mi_lock);
if (uid == (uid_t)-1)
uid = pn->mn_uid;
if (gid == (gid_t)-1)
gid = pn->mn_gid;
if (((uid != pn->mn_uid && uid != active_cred->cr_uid) ||
(gid != pn->mn_gid && !groupmember(gid, active_cred))) &&
(error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
goto out;
pn->mn_uid = uid;
pn->mn_gid = gid;
out:
sx_xunlock(&mqfs_data.mi_lock);
return (error);
}
static int
mqf_kqfilter(struct file *fp, struct knote *kn)
{
struct mqueue *mq = FPTOMQ(fp);
int error = 0;
if (kn->kn_filter == EVFILT_READ) {
kn->kn_fop = &mq_rfiltops;
knlist_add(&mq->mq_rsel.si_note, kn, 0);
} else if (kn->kn_filter == EVFILT_WRITE) {
kn->kn_fop = &mq_wfiltops;
knlist_add(&mq->mq_wsel.si_note, kn, 0);
} else
error = EINVAL;
return (error);
}
static void
filt_mqdetach(struct knote *kn)
{
struct mqueue *mq = FPTOMQ(kn->kn_fp);
if (kn->kn_filter == EVFILT_READ)
knlist_remove(&mq->mq_rsel.si_note, kn, 0);
else if (kn->kn_filter == EVFILT_WRITE)
knlist_remove(&mq->mq_wsel.si_note, kn, 0);
else
panic("filt_mqdetach");
}
static int
filt_mqread(struct knote *kn, long hint)
{
struct mqueue *mq = FPTOMQ(kn->kn_fp);
mtx_assert(&mq->mq_mutex, MA_OWNED);
return (mq->mq_curmsgs != 0);
}
static int
filt_mqwrite(struct knote *kn, long hint)
{
struct mqueue *mq = FPTOMQ(kn->kn_fp);
mtx_assert(&mq->mq_mutex, MA_OWNED);
return (mq->mq_curmsgs < mq->mq_maxmsg);
}
static struct fileops mqueueops = {
.fo_read = mqf_read,
.fo_write = mqf_write,
.fo_truncate = mqf_truncate,
.fo_ioctl = mqf_ioctl,
.fo_poll = mqf_poll,
.fo_kqfilter = mqf_kqfilter,
.fo_stat = mqf_stat,
.fo_chmod = mqf_chmod,
.fo_chown = mqf_chown,
.fo_close = mqf_close
};
static struct vop_vector mqfs_vnodeops = {
.vop_default = &default_vnodeops,
.vop_access = mqfs_access,
.vop_cachedlookup = mqfs_lookup,
.vop_lookup = vfs_cache_lookup,
.vop_reclaim = mqfs_reclaim,
.vop_create = mqfs_create,
.vop_remove = mqfs_remove,
.vop_inactive = mqfs_inactive,
.vop_open = mqfs_open,
.vop_close = mqfs_close,
.vop_getattr = mqfs_getattr,
.vop_setattr = mqfs_setattr,
.vop_read = mqfs_read,
.vop_write = VOP_EOPNOTSUPP,
.vop_readdir = mqfs_readdir,
.vop_mkdir = VOP_EOPNOTSUPP,
.vop_rmdir = VOP_EOPNOTSUPP
};
static struct vfsops mqfs_vfsops = {
.vfs_init = mqfs_init,
.vfs_uninit = mqfs_uninit,
.vfs_mount = mqfs_mount,
.vfs_unmount = mqfs_unmount,
.vfs_root = mqfs_root,
.vfs_statfs = mqfs_statfs,
};
static struct vfsconf mqueuefs_vfsconf = {
.vfc_version = VFS_VERSION,
.vfc_name = "mqueuefs",
.vfc_vfsops = &mqfs_vfsops,
.vfc_typenum = -1,
.vfc_flags = VFCF_SYNTHETIC
};
static struct syscall_helper_data mq_syscalls[] = {
SYSCALL_INIT_HELPER(kmq_open),
SYSCALL_INIT_HELPER(kmq_setattr),
SYSCALL_INIT_HELPER(kmq_timedsend),
SYSCALL_INIT_HELPER(kmq_timedreceive),
SYSCALL_INIT_HELPER(kmq_notify),
SYSCALL_INIT_HELPER(kmq_unlink),
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static void
mq_attr_from32(const struct mq_attr32 *from, struct mq_attr *to)
{
to->mq_flags = from->mq_flags;
to->mq_maxmsg = from->mq_maxmsg;
to->mq_msgsize = from->mq_msgsize;
to->mq_curmsgs = from->mq_curmsgs;
}
static void
mq_attr_to32(const struct mq_attr *from, struct mq_attr32 *to)
{
to->mq_flags = from->mq_flags;
to->mq_maxmsg = from->mq_maxmsg;
to->mq_msgsize = from->mq_msgsize;
to->mq_curmsgs = from->mq_curmsgs;
}
int
freebsd32_kmq_open(struct thread *td, struct freebsd32_kmq_open_args *uap)
{
struct mq_attr attr;
struct mq_attr32 attr32;
int flags, error;
if ((uap->flags & O_ACCMODE) == O_ACCMODE)
return (EINVAL);
flags = FFLAGS(uap->flags);
if ((flags & O_CREAT) != 0 && uap->attr != NULL) {
error = copyin(uap->attr, &attr32, sizeof(attr32));
if (error)
return (error);
mq_attr_from32(&attr32, &attr);
}
return (kern_kmq_open(td, uap->path, flags, uap->mode,
uap->attr != NULL ? &attr : NULL));
}
int
freebsd32_kmq_setattr(struct thread *td, struct freebsd32_kmq_setattr_args *uap)
{
struct mq_attr attr, oattr;
struct mq_attr32 attr32, oattr32;
int error;
if (uap->attr != NULL) {
error = copyin(uap->attr, &attr32, sizeof(attr32));
if (error != 0)
return (error);
mq_attr_from32(&attr32, &attr);
}
error = kern_kmq_setattr(td, uap->mqd, uap->attr != NULL ? &attr : NULL,
&oattr);
if (error != 0)
return (error);
if (uap->oattr != NULL) {
mq_attr_to32(&oattr, &oattr32);
error = copyout(&oattr32, uap->oattr, sizeof(oattr32));
}
return (error);
}
int
freebsd32_kmq_timedsend(struct thread *td,
struct freebsd32_kmq_timedsend_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec32 ets32;
struct timespec *abs_timeout, ets;
int error;
int waitok;
error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
if (error != 0)
return (error);
CP(ets32, ets, tv_sec);
CP(ets32, ets, tv_nsec);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
int
freebsd32_kmq_timedreceive(struct thread *td,
struct freebsd32_kmq_timedreceive_args *uap)
{
struct mqueue *mq;
struct file *fp;
struct timespec32 ets32;
struct timespec *abs_timeout, ets;
int error, waitok;
error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
if (error)
return (error);
if (uap->abs_timeout != NULL) {
error = copyin(uap->abs_timeout, &ets32, sizeof(ets32));
if (error != 0)
return (error);
CP(ets32, ets, tv_sec);
CP(ets32, ets, tv_nsec);
abs_timeout = &ets;
} else
abs_timeout = NULL;
waitok = !(fp->f_flag & O_NONBLOCK);
error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
uap->msg_prio, waitok, abs_timeout);
fdrop(fp, td);
return (error);
}
static struct syscall_helper_data mq32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_kmq_open),
SYSCALL32_INIT_HELPER(freebsd32_kmq_setattr),
SYSCALL32_INIT_HELPER(freebsd32_kmq_timedsend),
SYSCALL32_INIT_HELPER(freebsd32_kmq_timedreceive),
- SYSCALL32_INIT_HELPER(kmq_notify),
- SYSCALL32_INIT_HELPER(kmq_unlink),
+ SYSCALL32_INIT_HELPER_COMPAT(kmq_notify),
+ SYSCALL32_INIT_HELPER_COMPAT(kmq_unlink),
SYSCALL_INIT_LAST
};
#endif
static int
mqinit(void)
{
int error;
error = syscall_helper_register(mq_syscalls);
if (error != 0)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(mq32_syscalls);
if (error != 0)
return (error);
#endif
return (0);
}
static int
mqunload(void)
{
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(mq32_syscalls);
#endif
syscall_helper_unregister(mq_syscalls);
return (0);
}
static int
mq_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
error = vfs_modevent(module, cmd, arg);
if (error != 0)
return (error);
switch (cmd) {
case MOD_LOAD:
error = mqinit();
if (error != 0)
mqunload();
break;
case MOD_UNLOAD:
error = mqunload();
break;
default:
break;
}
return (error);
}
static moduledata_t mqueuefs_mod = {
"mqueuefs",
mq_modload,
&mqueuefs_vfsconf
};
DECLARE_MODULE(mqueuefs, mqueuefs_mod, SI_SUB_VFS, SI_ORDER_MIDDLE);
MODULE_VERSION(mqueuefs, 1);
Index: head/sys/kern/uipc_sem.c
===================================================================
--- head/sys/kern/uipc_sem.c (revision 225616)
+++ head/sys/kern/uipc_sem.c (revision 225617)
@@ -1,1091 +1,1091 @@
/*-
* Copyright (c) 2002 Alfred Perlstein <alfred@FreeBSD.org>
* Copyright (c) 2003-2005 SPARTA, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* All rights reserved.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_posix.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/condvar.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/ksem.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/posix4.h>
#include <sys/_semaphore.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
FEATURE(p1003_1b_semaphores, "POSIX P1003.1B semaphores support");
/*
* TODO
*
* - Resource limits?
* - Update fstat(1)
* - Replace global sem_lock with mtx_pool locks?
* - Add a MAC check_create() hook for creating new named semaphores.
*/
#ifndef SEM_MAX
#define SEM_MAX 30
#endif
#ifdef SEM_DEBUG
#define DP(x) printf x
#else
#define DP(x)
#endif
struct ksem_mapping {
char *km_path;
Fnv32_t km_fnv;
struct ksem *km_ksem;
LIST_ENTRY(ksem_mapping) km_link;
};
static MALLOC_DEFINE(M_KSEM, "ksem", "semaphore file descriptor");
static LIST_HEAD(, ksem_mapping) *ksem_dictionary;
static struct sx ksem_dict_lock;
static struct mtx ksem_count_lock;
static struct mtx sem_lock;
static u_long ksem_hash;
static int ksem_dead;
#define KSEM_HASH(fnv) (&ksem_dictionary[(fnv) & ksem_hash])
static int nsems = 0;
SYSCTL_DECL(_p1003_1b);
SYSCTL_INT(_p1003_1b, OID_AUTO, nsems, CTLFLAG_RD, &nsems, 0,
"Number of active kernel POSIX semaphores");
static int kern_sem_wait(struct thread *td, semid_t id, int tryflag,
struct timespec *abstime);
static int ksem_access(struct ksem *ks, struct ucred *ucred);
static struct ksem *ksem_alloc(struct ucred *ucred, mode_t mode,
unsigned int value);
static int ksem_create(struct thread *td, const char *path,
semid_t *semidp, mode_t mode, unsigned int value,
int flags, int compat32);
static void ksem_drop(struct ksem *ks);
static int ksem_get(struct thread *td, semid_t id, cap_rights_t rights,
struct file **fpp);
static struct ksem *ksem_hold(struct ksem *ks);
static void ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks);
static struct ksem *ksem_lookup(char *path, Fnv32_t fnv);
static void ksem_module_destroy(void);
static int ksem_module_init(void);
static int ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int sem_modload(struct module *module, int cmd, void *arg);
static fo_rdwr_t ksem_read;
static fo_rdwr_t ksem_write;
static fo_truncate_t ksem_truncate;
static fo_ioctl_t ksem_ioctl;
static fo_poll_t ksem_poll;
static fo_kqfilter_t ksem_kqfilter;
static fo_stat_t ksem_stat;
static fo_close_t ksem_closef;
static fo_chmod_t ksem_chmod;
static fo_chown_t ksem_chown;
/* File descriptor operations. */
static struct fileops ksem_ops = {
.fo_read = ksem_read,
.fo_write = ksem_write,
.fo_truncate = ksem_truncate,
.fo_ioctl = ksem_ioctl,
.fo_poll = ksem_poll,
.fo_kqfilter = ksem_kqfilter,
.fo_stat = ksem_stat,
.fo_close = ksem_closef,
.fo_chmod = ksem_chmod,
.fo_chown = ksem_chown,
.fo_flags = DFLAG_PASSABLE
};
FEATURE(posix_sem, "POSIX semaphores");
static int
ksem_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
return (EINVAL);
}
static int
ksem_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
ksem_kqfilter(struct file *fp, struct knote *kn)
{
return (EOPNOTSUPP);
}
static int
ksem_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct ksem *ks;
#ifdef MAC
int error;
#endif
ks = fp->f_data;
#ifdef MAC
error = mac_posixsem_check_stat(active_cred, fp->f_cred, ks);
if (error)
return (error);
#endif
/*
* Attempt to return sane values for fstat() on a semaphore
* file descriptor.
*/
bzero(sb, sizeof(*sb));
mtx_lock(&sem_lock);
sb->st_atim = ks->ks_atime;
sb->st_ctim = ks->ks_ctime;
sb->st_mtim = ks->ks_mtime;
sb->st_birthtim = ks->ks_birthtime;
sb->st_uid = ks->ks_uid;
sb->st_gid = ks->ks_gid;
sb->st_mode = S_IFREG | ks->ks_mode; /* XXX */
mtx_unlock(&sem_lock);
return (0);
}
static int
ksem_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct ksem *ks;
int error;
error = 0;
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_setmode(active_cred, ks, mode);
if (error != 0)
goto out;
#endif
error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid, VADMIN,
active_cred, NULL);
if (error != 0)
goto out;
ks->ks_mode = mode & ACCESSPERMS;
out:
mtx_unlock(&sem_lock);
return (error);
}
static int
ksem_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct ksem *ks;
int error;
error = 0;
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_setowner(active_cred, ks, uid, gid);
if (error != 0)
goto out;
#endif
if (uid == (uid_t)-1)
uid = ks->ks_uid;
if (gid == (gid_t)-1)
gid = ks->ks_gid;
if (((uid != ks->ks_uid && uid != active_cred->cr_uid) ||
(gid != ks->ks_gid && !groupmember(gid, active_cred))) &&
(error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
goto out;
ks->ks_uid = uid;
ks->ks_gid = gid;
out:
mtx_unlock(&sem_lock);
return (error);
}
static int
ksem_closef(struct file *fp, struct thread *td)
{
struct ksem *ks;
ks = fp->f_data;
fp->f_data = NULL;
ksem_drop(ks);
return (0);
}
/*
* ksem object management including creation and reference counting
* routines.
*/
static struct ksem *
ksem_alloc(struct ucred *ucred, mode_t mode, unsigned int value)
{
struct ksem *ks;
mtx_lock(&ksem_count_lock);
if (nsems == p31b_getcfg(CTL_P1003_1B_SEM_NSEMS_MAX) || ksem_dead) {
mtx_unlock(&ksem_count_lock);
return (NULL);
}
nsems++;
mtx_unlock(&ksem_count_lock);
ks = malloc(sizeof(*ks), M_KSEM, M_WAITOK | M_ZERO);
ks->ks_uid = ucred->cr_uid;
ks->ks_gid = ucred->cr_gid;
ks->ks_mode = mode;
ks->ks_value = value;
cv_init(&ks->ks_cv, "ksem");
vfs_timestamp(&ks->ks_birthtime);
ks->ks_atime = ks->ks_mtime = ks->ks_ctime = ks->ks_birthtime;
refcount_init(&ks->ks_ref, 1);
#ifdef MAC
mac_posixsem_init(ks);
mac_posixsem_create(ucred, ks);
#endif
return (ks);
}
static struct ksem *
ksem_hold(struct ksem *ks)
{
refcount_acquire(&ks->ks_ref);
return (ks);
}
static void
ksem_drop(struct ksem *ks)
{
if (refcount_release(&ks->ks_ref)) {
#ifdef MAC
mac_posixsem_destroy(ks);
#endif
cv_destroy(&ks->ks_cv);
free(ks, M_KSEM);
mtx_lock(&ksem_count_lock);
nsems--;
mtx_unlock(&ksem_count_lock);
}
}
/*
* Determine if the credentials have sufficient permissions for read
* and write access.
*/
static int
ksem_access(struct ksem *ks, struct ucred *ucred)
{
int error;
error = vaccess(VREG, ks->ks_mode, ks->ks_uid, ks->ks_gid,
VREAD | VWRITE, ucred, NULL);
if (error)
error = priv_check_cred(ucred, PRIV_SEM_WRITE, 0);
return (error);
}
/*
* Dictionary management. We maintain an in-kernel dictionary to map
* paths to semaphore objects. We use the FNV hash on the path to
* store the mappings in a hash table.
*/
static struct ksem *
ksem_lookup(char *path, Fnv32_t fnv)
{
struct ksem_mapping *map;
LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
if (map->km_fnv != fnv)
continue;
if (strcmp(map->km_path, path) == 0)
return (map->km_ksem);
}
return (NULL);
}
static void
ksem_insert(char *path, Fnv32_t fnv, struct ksem *ks)
{
struct ksem_mapping *map;
map = malloc(sizeof(struct ksem_mapping), M_KSEM, M_WAITOK);
map->km_path = path;
map->km_fnv = fnv;
map->km_ksem = ksem_hold(ks);
LIST_INSERT_HEAD(KSEM_HASH(fnv), map, km_link);
}
static int
ksem_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
struct ksem_mapping *map;
int error;
LIST_FOREACH(map, KSEM_HASH(fnv), km_link) {
if (map->km_fnv != fnv)
continue;
if (strcmp(map->km_path, path) == 0) {
#ifdef MAC
error = mac_posixsem_check_unlink(ucred, map->km_ksem);
if (error)
return (error);
#endif
error = ksem_access(map->km_ksem, ucred);
if (error)
return (error);
LIST_REMOVE(map, km_link);
ksem_drop(map->km_ksem);
free(map->km_path, M_KSEM);
free(map, M_KSEM);
return (0);
}
}
return (ENOENT);
}
static int
ksem_create_copyout_semid(struct thread *td, semid_t *semidp, int fd,
int compat32)
{
semid_t semid;
#ifdef COMPAT_FREEBSD32
int32_t semid32;
#endif
void *ptr;
size_t ptrs;
#ifdef COMPAT_FREEBSD32
if (compat32) {
semid32 = fd;
ptr = &semid32;
ptrs = sizeof(semid32);
} else {
#endif
semid = fd;
ptr = &semid;
ptrs = sizeof(semid);
compat32 = 0; /* silence gcc */
#ifdef COMPAT_FREEBSD32
}
#endif
return (copyout(ptr, semidp, ptrs));
}
/* Other helper routines. */
static int
ksem_create(struct thread *td, const char *name, semid_t *semidp, mode_t mode,
unsigned int value, int flags, int compat32)
{
struct filedesc *fdp;
struct ksem *ks;
struct file *fp;
char *path;
Fnv32_t fnv;
int error, fd;
if (value > SEM_VALUE_MAX)
return (EINVAL);
fdp = td->td_proc->p_fd;
mode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
error = falloc(td, &fp, &fd, 0);
if (error) {
if (name == NULL)
error = ENOSPC;
return (error);
}
/*
* Go ahead and copy out the file descriptor now. This is a bit
* premature, but it is a lot easier to handle errors here than
* later, when we may already have created a new semaphore, etc.
*/
error = ksem_create_copyout_semid(td, semidp, fd, compat32);
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
if (name == NULL) {
/* Create an anonymous semaphore. */
ks = ksem_alloc(td->td_ucred, mode, value);
if (ks == NULL)
error = ENOSPC;
else
ks->ks_flags |= KS_ANONYMOUS;
} else {
path = malloc(MAXPATHLEN, M_KSEM, M_WAITOK);
error = copyinstr(name, path, MAXPATHLEN, NULL);
/* Require paths to start with a '/' character. */
if (error == 0 && path[0] != '/')
error = EINVAL;
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
free(path, M_KSEM);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&ksem_dict_lock);
ks = ksem_lookup(path, fnv);
if (ks == NULL) {
/* Object does not exist, create it if requested. */
if (flags & O_CREAT) {
ks = ksem_alloc(td->td_ucred, mode, value);
if (ks == NULL)
error = ENFILE;
else {
ksem_insert(path, fnv, ks);
path = NULL;
}
} else
error = ENOENT;
} else {
/*
* Object already exists, obtain a new
* reference if requested and permitted.
*/
if ((flags & (O_CREAT | O_EXCL)) ==
(O_CREAT | O_EXCL))
error = EEXIST;
else {
#ifdef MAC
error = mac_posixsem_check_open(td->td_ucred,
ks);
if (error == 0)
#endif
error = ksem_access(ks, td->td_ucred);
}
if (error == 0)
ksem_hold(ks);
#ifdef INVARIANTS
else
ks = NULL;
#endif
}
sx_xunlock(&ksem_dict_lock);
if (path)
free(path, M_KSEM);
}
if (error) {
KASSERT(ks == NULL, ("ksem_create error with a ksem"));
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
KASSERT(ks != NULL, ("ksem_create w/o a ksem"));
finit(fp, FREAD | FWRITE, DTYPE_SEM, ks, &ksem_ops);
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[fd] == fp)
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
return (0);
}
static int
ksem_get(struct thread *td, semid_t id, cap_rights_t rights, struct file **fpp)
{
struct ksem *ks;
struct file *fp;
int error;
error = fget(td, id, rights, &fp);
if (error)
return (EINVAL);
if (fp->f_type != DTYPE_SEM) {
fdrop(fp, td);
return (EINVAL);
}
ks = fp->f_data;
if (ks->ks_flags & KS_DEAD) {
fdrop(fp, td);
return (EINVAL);
}
*fpp = fp;
return (0);
}
/* System calls. */
#ifndef _SYS_SYSPROTO_H_
struct ksem_init_args {
unsigned int value;
semid_t *idp;
};
#endif
int
-ksem_init(struct thread *td, struct ksem_init_args *uap)
+sys_ksem_init(struct thread *td, struct ksem_init_args *uap)
{
return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
0, 0));
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_open_args {
char *name;
int oflag;
mode_t mode;
unsigned int value;
semid_t *idp;
};
#endif
int
-ksem_open(struct thread *td, struct ksem_open_args *uap)
+sys_ksem_open(struct thread *td, struct ksem_open_args *uap)
{
DP((">>> ksem_open start, pid=%d\n", (int)td->td_proc->p_pid));
if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
return (EINVAL);
return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
uap->oflag, 0));
}
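/*
 * A minimal userland sketch (assumed, not part of this diff) of the
 * POSIX interface that typically ends up in ksem_create() and
 * sys_ksem_open() above; the exact libc wrapper mapping is an
 * assumption here, but the path rules and flag checks mirror the
 * kernel code.
 */
#include <fcntl.h>
#include <semaphore.h>

int
example_named_sem(void)
{
	sem_t *s;

	/* O_CREAT with an initial value of 1; the name must begin with '/'. */
	s = sem_open("/example_sem", O_CREAT, 0600, 1U);
	if (s == SEM_FAILED)
		return (-1);
	sem_wait(s);			/* decrement, blocking while the value is 0 */
	sem_post(s);			/* increment, waking a waiter if one exists */
	sem_close(s);			/* release this process's reference */
	sem_unlink("/example_sem");	/* remove the name from the dictionary */
	return (0);
}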
#ifndef _SYS_SYSPROTO_H_
struct ksem_unlink_args {
char *name;
};
#endif
int
-ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
+sys_ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
{
char *path;
Fnv32_t fnv;
int error;
path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->name, path, MAXPATHLEN, NULL);
if (error) {
free(path, M_TEMP);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&ksem_dict_lock);
error = ksem_remove(path, fnv, td->td_ucred);
sx_xunlock(&ksem_dict_lock);
free(path, M_TEMP);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_close_args {
semid_t id;
};
#endif
int
-ksem_close(struct thread *td, struct ksem_close_args *uap)
+sys_ksem_close(struct thread *td, struct ksem_close_args *uap)
{
struct ksem *ks;
struct file *fp;
int error;
/* No capability rights required to close a semaphore. */
error = ksem_get(td, uap->id, 0, &fp);
if (error)
return (error);
ks = fp->f_data;
if (ks->ks_flags & KS_ANONYMOUS) {
fdrop(fp, td);
return (EINVAL);
}
error = kern_close(td, uap->id);
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_post_args {
semid_t id;
};
#endif
int
-ksem_post(struct thread *td, struct ksem_post_args *uap)
+sys_ksem_post(struct thread *td, struct ksem_post_args *uap)
{
struct file *fp;
struct ksem *ks;
int error;
error = ksem_get(td, uap->id, CAP_SEM_POST, &fp);
if (error)
return (error);
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_post(td->td_ucred, fp->f_cred, ks);
if (error)
goto err;
#endif
if (ks->ks_value == SEM_VALUE_MAX) {
error = EOVERFLOW;
goto err;
}
++ks->ks_value;
if (ks->ks_waiters > 0)
cv_signal(&ks->ks_cv);
error = 0;
vfs_timestamp(&ks->ks_ctime);
err:
mtx_unlock(&sem_lock);
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_wait_args {
semid_t id;
};
#endif
int
-ksem_wait(struct thread *td, struct ksem_wait_args *uap)
+sys_ksem_wait(struct thread *td, struct ksem_wait_args *uap)
{
return (kern_sem_wait(td, uap->id, 0, NULL));
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_timedwait_args {
semid_t id;
const struct timespec *abstime;
};
#endif
int
-ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
+sys_ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap)
{
struct timespec abstime;
struct timespec *ts;
int error;
/*
* We allow a null timespec (wait forever).
*/
if (uap->abstime == NULL)
ts = NULL;
else {
error = copyin(uap->abstime, &abstime, sizeof(abstime));
if (error != 0)
return (error);
if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
return (EINVAL);
ts = &abstime;
}
return (kern_sem_wait(td, uap->id, 0, ts));
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_trywait_args {
semid_t id;
};
#endif
int
-ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
+sys_ksem_trywait(struct thread *td, struct ksem_trywait_args *uap)
{
return (kern_sem_wait(td, uap->id, 1, NULL));
}
static int
kern_sem_wait(struct thread *td, semid_t id, int tryflag,
struct timespec *abstime)
{
struct timespec ts1, ts2;
struct timeval tv;
struct file *fp;
struct ksem *ks;
int error;
DP((">>> kern_sem_wait entered! pid=%d\n", (int)td->td_proc->p_pid));
error = ksem_get(td, id, CAP_SEM_WAIT, &fp);
if (error)
return (error);
ks = fp->f_data;
mtx_lock(&sem_lock);
DP((">>> kern_sem_wait critical section entered! pid=%d\n",
(int)td->td_proc->p_pid));
#ifdef MAC
error = mac_posixsem_check_wait(td->td_ucred, fp->f_cred, ks);
if (error) {
DP(("kern_sem_wait mac failed\n"));
goto err;
}
#endif
DP(("kern_sem_wait value = %d, tryflag %d\n", ks->ks_value, tryflag));
vfs_timestamp(&ks->ks_atime);
while (ks->ks_value == 0) {
ks->ks_waiters++;
if (tryflag != 0)
error = EAGAIN;
else if (abstime == NULL)
error = cv_wait_sig(&ks->ks_cv, &sem_lock);
else {
for (;;) {
ts1 = *abstime;
getnanotime(&ts2);
timespecsub(&ts1, &ts2);
TIMESPEC_TO_TIMEVAL(&tv, &ts1);
if (tv.tv_sec < 0) {
error = ETIMEDOUT;
break;
}
error = cv_timedwait_sig(&ks->ks_cv,
&sem_lock, tvtohz(&tv));
if (error != EWOULDBLOCK)
break;
}
}
ks->ks_waiters--;
if (error)
goto err;
}
ks->ks_value--;
DP(("kern_sem_wait value post-decrement = %d\n", ks->ks_value));
error = 0;
err:
mtx_unlock(&sem_lock);
fdrop(fp, td);
DP(("<<< kern_sem_wait leaving, pid=%d, error = %d\n",
(int)td->td_proc->p_pid, error));
return (error);
}
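/*
 * Userland sketch (assumed, not from this file): sem_timedwait(3) takes
 * an absolute CLOCK_REALTIME deadline, which matches the abstime
 * handling in kern_sem_wait() above, where the remaining time is
 * recomputed from the deadline on every pass through the wait loop.
 */
#include <errno.h>
#include <semaphore.h>
#include <time.h>

int
example_timedwait(sem_t *s)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 5;			/* wait at most five seconds */
	if (sem_timedwait(s, &deadline) == -1)
		return (errno == ETIMEDOUT ? 1 : -1);
	return (0);
}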
#ifndef _SYS_SYSPROTO_H_
struct ksem_getvalue_args {
semid_t id;
int *val;
};
#endif
int
-ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
+sys_ksem_getvalue(struct thread *td, struct ksem_getvalue_args *uap)
{
struct file *fp;
struct ksem *ks;
int error, val;
error = ksem_get(td, uap->id, CAP_SEM_GETVALUE, &fp);
if (error)
return (error);
ks = fp->f_data;
mtx_lock(&sem_lock);
#ifdef MAC
error = mac_posixsem_check_getvalue(td->td_ucred, fp->f_cred, ks);
if (error) {
mtx_unlock(&sem_lock);
fdrop(fp, td);
return (error);
}
#endif
val = ks->ks_value;
vfs_timestamp(&ks->ks_atime);
mtx_unlock(&sem_lock);
fdrop(fp, td);
error = copyout(&val, uap->val, sizeof(val));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct ksem_destroy_args {
semid_t id;
};
#endif
int
-ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
+sys_ksem_destroy(struct thread *td, struct ksem_destroy_args *uap)
{
struct file *fp;
struct ksem *ks;
int error;
/* No capability rights required to close a semaphore. */
error = ksem_get(td, uap->id, 0, &fp);
if (error)
return (error);
ks = fp->f_data;
if (!(ks->ks_flags & KS_ANONYMOUS)) {
fdrop(fp, td);
return (EINVAL);
}
mtx_lock(&sem_lock);
if (ks->ks_waiters != 0) {
mtx_unlock(&sem_lock);
error = EBUSY;
goto err;
}
ks->ks_flags |= KS_DEAD;
mtx_unlock(&sem_lock);
error = kern_close(td, uap->id);
err:
fdrop(fp, td);
return (error);
}
static struct syscall_helper_data ksem_syscalls[] = {
SYSCALL_INIT_HELPER(ksem_init),
SYSCALL_INIT_HELPER(ksem_open),
SYSCALL_INIT_HELPER(ksem_unlink),
SYSCALL_INIT_HELPER(ksem_close),
SYSCALL_INIT_HELPER(ksem_post),
SYSCALL_INIT_HELPER(ksem_wait),
SYSCALL_INIT_HELPER(ksem_timedwait),
SYSCALL_INIT_HELPER(ksem_trywait),
SYSCALL_INIT_HELPER(ksem_getvalue),
SYSCALL_INIT_HELPER(ksem_destroy),
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
int
freebsd32_ksem_init(struct thread *td, struct freebsd32_ksem_init_args *uap)
{
return (ksem_create(td, NULL, uap->idp, S_IRWXU | S_IRWXG, uap->value,
0, 1));
}
int
freebsd32_ksem_open(struct thread *td, struct freebsd32_ksem_open_args *uap)
{
if ((uap->oflag & ~(O_CREAT | O_EXCL)) != 0)
return (EINVAL);
return (ksem_create(td, uap->name, uap->idp, uap->mode, uap->value,
uap->oflag, 1));
}
int
freebsd32_ksem_timedwait(struct thread *td,
struct freebsd32_ksem_timedwait_args *uap)
{
struct timespec32 abstime32;
struct timespec *ts, abstime;
int error;
/*
* We allow a null timespec (wait forever).
*/
if (uap->abstime == NULL)
ts = NULL;
else {
error = copyin(uap->abstime, &abstime32, sizeof(abstime32));
if (error != 0)
return (error);
CP(abstime32, abstime, tv_sec);
CP(abstime32, abstime, tv_nsec);
if (abstime.tv_nsec >= 1000000000 || abstime.tv_nsec < 0)
return (EINVAL);
ts = &abstime;
}
return (kern_sem_wait(td, uap->id, 0, ts));
}
static struct syscall_helper_data ksem32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_ksem_init),
SYSCALL32_INIT_HELPER(freebsd32_ksem_open),
- SYSCALL32_INIT_HELPER(ksem_unlink),
- SYSCALL32_INIT_HELPER(ksem_close),
- SYSCALL32_INIT_HELPER(ksem_post),
- SYSCALL32_INIT_HELPER(ksem_wait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_unlink),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_close),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_post),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_wait),
SYSCALL32_INIT_HELPER(freebsd32_ksem_timedwait),
- SYSCALL32_INIT_HELPER(ksem_trywait),
- SYSCALL32_INIT_HELPER(ksem_getvalue),
- SYSCALL32_INIT_HELPER(ksem_destroy),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_trywait),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_getvalue),
+ SYSCALL32_INIT_HELPER_COMPAT(ksem_destroy),
SYSCALL_INIT_LAST
};
#endif
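/*
 * Hedged reading of the helper macros used above: the
 * SYSCALL32_INIT_HELPER_COMPAT() entries appear to register the native
 * sys_ksem_* handlers directly in the 32-bit syscall table, which works
 * for the syscalls whose argument layout is identical in 32-bit mode;
 * dedicated freebsd32_* wrappers remain only where translation is
 * needed (the semid_t output pointer in ksem_init/ksem_open and the
 * 32-bit timespec in ksem_timedwait).
 */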
static int
ksem_module_init(void)
{
int error;
mtx_init(&sem_lock, "sem", NULL, MTX_DEF);
mtx_init(&ksem_count_lock, "ksem count", NULL, MTX_DEF);
sx_init(&ksem_dict_lock, "ksem dictionary");
ksem_dictionary = hashinit(1024, M_KSEM, &ksem_hash);
p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 200112L);
p31b_setcfg(CTL_P1003_1B_SEM_NSEMS_MAX, SEM_MAX);
p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
error = syscall_helper_register(ksem_syscalls);
if (error)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(ksem32_syscalls);
if (error)
return (error);
#endif
return (0);
}
static void
ksem_module_destroy(void)
{
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(ksem32_syscalls);
#endif
syscall_helper_unregister(ksem_syscalls);
p31b_setcfg(CTL_P1003_1B_SEMAPHORES, 0);
hashdestroy(ksem_dictionary, M_KSEM, ksem_hash);
sx_destroy(&ksem_dict_lock);
mtx_destroy(&ksem_count_lock);
mtx_destroy(&sem_lock);
p31b_unsetcfg(CTL_P1003_1B_SEM_VALUE_MAX);
p31b_unsetcfg(CTL_P1003_1B_SEM_NSEMS_MAX);
}
static int
sem_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
error = ksem_module_init();
if (error)
ksem_module_destroy();
break;
case MOD_UNLOAD:
mtx_lock(&ksem_count_lock);
if (nsems != 0) {
error = EOPNOTSUPP;
mtx_unlock(&ksem_count_lock);
break;
}
ksem_dead = 1;
mtx_unlock(&ksem_count_lock);
ksem_module_destroy();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t sem_mod = {
"sem",
&sem_modload,
NULL
};
DECLARE_MODULE(sem, sem_mod, SI_SUB_SYSV_SEM, SI_ORDER_FIRST);
MODULE_VERSION(sem, 1);
Index: head/sys/kern/uipc_shm.c
===================================================================
--- head/sys/kern/uipc_shm.c (revision 225616)
+++ head/sys/kern/uipc_shm.c (revision 225617)
@@ -1,727 +1,727 @@
/*-
* Copyright (c) 2006, 2011 Robert N. M. Watson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Support for shared swap-backed anonymous memory objects via
* shm_open(2) and shm_unlink(2). While most of the implementation is
* here, vm_mmap.c contains mapping logic changes.
*
* TODO:
*
* (1) Need to export data to a userland tool via a sysctl. Should ipcs(1)
* and ipcrm(1) be expanded or should new tools to manage both POSIX
* kernel semaphores and POSIX shared memory be written?
*
* (2) Add support for this file type to fstat(1).
*
* (3) Resource limits? Does this need its own resource limits or are the
* existing limits in mmap(2) sufficient?
*
* (4) Partial page truncation. vnode_pager_setsize() will zero any parts
* of a partially mapped page as a result of ftruncate(2)/truncate(2).
* We can do the same (with the same pmap evil), but do we need to
* worry about the bits on disk if the page is swapped out or will the
* swapper zero the parts of a page that are invalid if the page is
* swapped back in for us?
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/sx.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
struct shm_mapping {
char *sm_path;
Fnv32_t sm_fnv;
struct shmfd *sm_shmfd;
LIST_ENTRY(shm_mapping) sm_link;
};
static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
static LIST_HEAD(, shm_mapping) *shm_dictionary;
static struct sx shm_dict_lock;
static struct mtx shm_timestamp_lock;
static u_long shm_hash;
#define SHM_HASH(fnv) (&shm_dictionary[(fnv) & shm_hash])
static int shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags);
static struct shmfd *shm_alloc(struct ucred *ucred, mode_t mode);
static void shm_dict_init(void *arg);
static void shm_drop(struct shmfd *shmfd);
static struct shmfd *shm_hold(struct shmfd *shmfd);
static void shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
static int shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
static int shm_dotruncate(struct shmfd *shmfd, off_t length);
static fo_rdwr_t shm_read;
static fo_rdwr_t shm_write;
static fo_truncate_t shm_truncate;
static fo_ioctl_t shm_ioctl;
static fo_poll_t shm_poll;
static fo_kqfilter_t shm_kqfilter;
static fo_stat_t shm_stat;
static fo_close_t shm_close;
static fo_chmod_t shm_chmod;
static fo_chown_t shm_chown;
/* File descriptor operations. */
static struct fileops shm_ops = {
.fo_read = shm_read,
.fo_write = shm_write,
.fo_truncate = shm_truncate,
.fo_ioctl = shm_ioctl,
.fo_poll = shm_poll,
.fo_kqfilter = shm_kqfilter,
.fo_stat = shm_stat,
.fo_close = shm_close,
.fo_chmod = shm_chmod,
.fo_chown = shm_chown,
.fo_flags = DFLAG_PASSABLE
};
FEATURE(posix_shm, "POSIX shared memory");
static int
shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
int flags, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
#ifdef MAC
int error;
#endif
shmfd = fp->f_data;
#ifdef MAC
error = mac_posixshm_check_truncate(active_cred, fp->f_cred, shmfd);
if (error)
return (error);
#endif
return (shm_dotruncate(shmfd, length));
}
static int
shm_ioctl(struct file *fp, u_long com, void *data,
struct ucred *active_cred, struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_poll(struct file *fp, int events, struct ucred *active_cred,
struct thread *td)
{
return (EOPNOTSUPP);
}
static int
shm_kqfilter(struct file *fp, struct knote *kn)
{
return (EOPNOTSUPP);
}
static int
shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
#ifdef MAC
int error;
#endif
shmfd = fp->f_data;
#ifdef MAC
error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
if (error)
return (error);
#endif
/*
* Attempt to return sane values for fstat() on a memory file
* descriptor.
*/
bzero(sb, sizeof(*sb));
sb->st_blksize = PAGE_SIZE;
sb->st_size = shmfd->shm_size;
sb->st_blocks = (sb->st_size + sb->st_blksize - 1) / sb->st_blksize;
mtx_lock(&shm_timestamp_lock);
sb->st_atim = shmfd->shm_atime;
sb->st_ctim = shmfd->shm_ctime;
sb->st_mtim = shmfd->shm_mtime;
sb->st_birthtim = shmfd->shm_birthtime;
sb->st_mode = S_IFREG | shmfd->shm_mode; /* XXX */
sb->st_uid = shmfd->shm_uid;
sb->st_gid = shmfd->shm_gid;
mtx_unlock(&shm_timestamp_lock);
return (0);
}
static int
shm_close(struct file *fp, struct thread *td)
{
struct shmfd *shmfd;
shmfd = fp->f_data;
fp->f_data = NULL;
shm_drop(shmfd);
return (0);
}
static int
shm_dotruncate(struct shmfd *shmfd, off_t length)
{
vm_object_t object;
vm_page_t m;
vm_pindex_t nobjsize;
vm_ooffset_t delta;
object = shmfd->shm_object;
VM_OBJECT_LOCK(object);
if (length == shmfd->shm_size) {
VM_OBJECT_UNLOCK(object);
return (0);
}
nobjsize = OFF_TO_IDX(length + PAGE_MASK);
/* Are we shrinking? If so, trim the end. */
if (length < shmfd->shm_size) {
delta = ptoa(object->size - nobjsize);
/* Toss in memory pages. */
if (nobjsize < object->size)
vm_object_page_remove(object, nobjsize, object->size,
0);
/* Toss pages from swap. */
if (object->type == OBJT_SWAP)
swap_pager_freespace(object, nobjsize, delta);
/* Free the swap accounted for shm */
swap_release_by_cred(delta, object->cred);
object->charge -= delta;
/*
* If the last page is partially mapped, then zero out
* the garbage at the end of the page. See comments
* in vnode_pager_setsize() for more details.
*
* XXXJHB: This handles in-memory pages, but what about
* a page swapped out to disk?
*/
if ((length & PAGE_MASK) &&
(m = vm_page_lookup(object, OFF_TO_IDX(length))) != NULL &&
m->valid != 0) {
int base = (int)length & PAGE_MASK;
int size = PAGE_SIZE - base;
pmap_zero_page_area(m, base, size);
/*
* Update the valid bits to reflect the blocks that
* have been zeroed. Some of these valid bits may
* have already been set.
*/
vm_page_set_valid(m, base, size);
/*
* Round "base" to the next block boundary so that the
* dirty bit for a partially zeroed block is not
* cleared.
*/
base = roundup2(base, DEV_BSIZE);
vm_page_clear_dirty(m, base, PAGE_SIZE - base);
} else if ((length & PAGE_MASK) &&
__predict_false(object->cache != NULL)) {
vm_page_cache_free(object, OFF_TO_IDX(length),
nobjsize);
}
} else {
/* Attempt to reserve the swap */
delta = ptoa(nobjsize - object->size);
if (!swap_reserve_by_cred(delta, object->cred)) {
VM_OBJECT_UNLOCK(object);
return (ENOMEM);
}
object->charge += delta;
}
shmfd->shm_size = length;
mtx_lock(&shm_timestamp_lock);
vfs_timestamp(&shmfd->shm_ctime);
shmfd->shm_mtime = shmfd->shm_ctime;
mtx_unlock(&shm_timestamp_lock);
object->size = nobjsize;
VM_OBJECT_UNLOCK(object);
return (0);
}
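/*
 * Worked example of the partial-page zeroing above, assuming 4 KB pages:
 * truncating to length 0x1a2f gives base = 0x1a2f & PAGE_MASK = 0xa2f,
 * so size = PAGE_SIZE - base = 0x5d1 bytes at the end of the last
 * resident page are zeroed and marked valid, while dirty bits are
 * cleared only from the next DEV_BSIZE boundary (0xc00) to the end of
 * the page, so the partially zeroed block keeps its dirty bit.
 */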
/*
* shmfd object management including creation and reference counting
* routines.
*/
static struct shmfd *
shm_alloc(struct ucred *ucred, mode_t mode)
{
struct shmfd *shmfd;
shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
shmfd->shm_size = 0;
shmfd->shm_uid = ucred->cr_uid;
shmfd->shm_gid = ucred->cr_gid;
shmfd->shm_mode = mode;
shmfd->shm_object = vm_pager_allocate(OBJT_DEFAULT, NULL,
shmfd->shm_size, VM_PROT_DEFAULT, 0, ucred);
KASSERT(shmfd->shm_object != NULL, ("shm_create: vm_pager_allocate"));
VM_OBJECT_LOCK(shmfd->shm_object);
vm_object_clear_flag(shmfd->shm_object, OBJ_ONEMAPPING);
vm_object_set_flag(shmfd->shm_object, OBJ_NOSPLIT);
VM_OBJECT_UNLOCK(shmfd->shm_object);
vfs_timestamp(&shmfd->shm_birthtime);
shmfd->shm_atime = shmfd->shm_mtime = shmfd->shm_ctime =
shmfd->shm_birthtime;
refcount_init(&shmfd->shm_refs, 1);
#ifdef MAC
mac_posixshm_init(shmfd);
mac_posixshm_create(ucred, shmfd);
#endif
return (shmfd);
}
static struct shmfd *
shm_hold(struct shmfd *shmfd)
{
refcount_acquire(&shmfd->shm_refs);
return (shmfd);
}
static void
shm_drop(struct shmfd *shmfd)
{
if (refcount_release(&shmfd->shm_refs)) {
#ifdef MAC
mac_posixshm_destroy(shmfd);
#endif
vm_object_deallocate(shmfd->shm_object);
free(shmfd, M_SHMFD);
}
}
/*
* Determine if the credentials have sufficient permissions for a
* specified combination of FREAD and FWRITE.
*/
static int
shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
{
accmode_t accmode;
int error;
accmode = 0;
if (flags & FREAD)
accmode |= VREAD;
if (flags & FWRITE)
accmode |= VWRITE;
mtx_lock(&shm_timestamp_lock);
error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
accmode, ucred, NULL);
mtx_unlock(&shm_timestamp_lock);
return (error);
}
/*
* Dictionary management. We maintain an in-kernel dictionary to map
* paths to shmfd objects. We use the FNV hash on the path to store
* the mappings in a hash table.
*/
static void
shm_dict_init(void *arg)
{
mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
sx_init(&shm_dict_lock, "shm dictionary");
shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
}
SYSINIT(shm_dict_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_dict_init, NULL);
static struct shmfd *
shm_lookup(char *path, Fnv32_t fnv)
{
struct shm_mapping *map;
LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
if (map->sm_fnv != fnv)
continue;
if (strcmp(map->sm_path, path) == 0)
return (map->sm_shmfd);
}
return (NULL);
}
static void
shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
{
struct shm_mapping *map;
map = malloc(sizeof(struct shm_mapping), M_SHMFD, M_WAITOK);
map->sm_path = path;
map->sm_fnv = fnv;
map->sm_shmfd = shm_hold(shmfd);
LIST_INSERT_HEAD(SHM_HASH(fnv), map, sm_link);
}
static int
shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
{
struct shm_mapping *map;
int error;
LIST_FOREACH(map, SHM_HASH(fnv), sm_link) {
if (map->sm_fnv != fnv)
continue;
if (strcmp(map->sm_path, path) == 0) {
#ifdef MAC
error = mac_posixshm_check_unlink(ucred, map->sm_shmfd);
if (error)
return (error);
#endif
error = shm_access(map->sm_shmfd, ucred,
FREAD | FWRITE);
if (error)
return (error);
LIST_REMOVE(map, sm_link);
shm_drop(map->sm_shmfd);
free(map->sm_path, M_SHMFD);
free(map, M_SHMFD);
return (0);
}
}
return (ENOENT);
}
/* System calls. */
int
-shm_open(struct thread *td, struct shm_open_args *uap)
+sys_shm_open(struct thread *td, struct shm_open_args *uap)
{
struct filedesc *fdp;
struct shmfd *shmfd;
struct file *fp;
char *path;
Fnv32_t fnv;
mode_t cmode;
int fd, error;
#ifdef CAPABILITY_MODE
/*
* shm_open(2) is only allowed for anonymous objects.
*/
if (IN_CAPABILITY_MODE(td) && (uap->path != SHM_ANON))
return (ECAPMODE);
#endif
if ((uap->flags & O_ACCMODE) != O_RDONLY &&
(uap->flags & O_ACCMODE) != O_RDWR)
return (EINVAL);
if ((uap->flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC)) != 0)
return (EINVAL);
fdp = td->td_proc->p_fd;
cmode = (uap->mode & ~fdp->fd_cmask) & ACCESSPERMS;
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
/* A SHM_ANON path pointer creates an anonymous object. */
if (uap->path == SHM_ANON) {
/* A read-only anonymous object is pointless. */
if ((uap->flags & O_ACCMODE) == O_RDONLY) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (EINVAL);
}
shmfd = shm_alloc(td->td_ucred, cmode);
} else {
path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
/* Require paths to start with a '/' character. */
if (error == 0 && path[0] != '/')
error = EINVAL;
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
free(path, M_SHMFD);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&shm_dict_lock);
shmfd = shm_lookup(path, fnv);
if (shmfd == NULL) {
/* Object does not yet exist, create it if requested. */
if (uap->flags & O_CREAT) {
#ifdef MAC
error = mac_posixshm_check_create(td->td_ucred,
path);
if (error == 0) {
#endif
shmfd = shm_alloc(td->td_ucred, cmode);
shm_insert(path, fnv, shmfd);
#ifdef MAC
}
#endif
} else {
free(path, M_SHMFD);
error = ENOENT;
}
} else {
/*
* Object already exists, obtain a new
* reference if requested and permitted.
*/
free(path, M_SHMFD);
if ((uap->flags & (O_CREAT | O_EXCL)) ==
(O_CREAT | O_EXCL))
error = EEXIST;
else {
#ifdef MAC
error = mac_posixshm_check_open(td->td_ucred,
shmfd, FFLAGS(uap->flags & O_ACCMODE));
if (error == 0)
#endif
error = shm_access(shmfd, td->td_ucred,
FFLAGS(uap->flags & O_ACCMODE));
}
/*
* Truncate the file back to zero length if
* O_TRUNC was specified and the object was
* opened with read/write.
*/
if (error == 0 &&
(uap->flags & (O_ACCMODE | O_TRUNC)) ==
(O_RDWR | O_TRUNC)) {
#ifdef MAC
error = mac_posixshm_check_truncate(
td->td_ucred, fp->f_cred, shmfd);
if (error == 0)
#endif
shm_dotruncate(shmfd, 0);
}
if (error == 0)
shm_hold(shmfd);
}
sx_xunlock(&shm_dict_lock);
if (error) {
fdclose(fdp, fp, fd, td);
fdrop(fp, td);
return (error);
}
}
finit(fp, FFLAGS(uap->flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
FILEDESC_XLOCK(fdp);
if (fdp->fd_ofiles[fd] == fp)
fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
td->td_retval[0] = fd;
fdrop(fp, td);
return (0);
}
int
-shm_unlink(struct thread *td, struct shm_unlink_args *uap)
+sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
{
char *path;
Fnv32_t fnv;
int error;
path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->path, path, MAXPATHLEN, NULL);
if (error) {
free(path, M_TEMP);
return (error);
}
fnv = fnv_32_str(path, FNV1_32_INIT);
sx_xlock(&shm_dict_lock);
error = shm_remove(path, fnv, td->td_ucred);
sx_xunlock(&shm_dict_lock);
free(path, M_TEMP);
return (error);
}
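/*
 * Userland sketch (assumed, not part of this diff) of the typical
 * shm_open(2)/ftruncate(2)/mmap(2) sequence these syscalls serve; the
 * ftruncate() ends up in shm_dotruncate() above and the mapping request
 * in shm_mmap() below.
 */
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

void *
example_shm_map(size_t len)
{
	void *p;
	int fd;

	fd = shm_open("/example_shm", O_RDWR | O_CREAT, 0600);
	if (fd == -1)
		return (NULL);
	if (ftruncate(fd, len) == -1) {		/* size the object */
		close(fd);
		return (NULL);
	}
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);				/* the mapping holds its own reference */
	return (p == MAP_FAILED ? NULL : p);
}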
/*
* mmap() helper to validate mmap() requests against shm object state
* and give mmap() the vm_object to use for the mapping.
*/
int
shm_mmap(struct shmfd *shmfd, vm_size_t objsize, vm_ooffset_t foff,
vm_object_t *obj)
{
/*
* XXXRW: This validation is probably insufficient, and subject to
* sign errors. It should be fixed.
*/
if (foff >= shmfd->shm_size ||
foff + objsize > round_page(shmfd->shm_size))
return (EINVAL);
mtx_lock(&shm_timestamp_lock);
vfs_timestamp(&shmfd->shm_atime);
mtx_unlock(&shm_timestamp_lock);
vm_object_reference(shmfd->shm_object);
*obj = shmfd->shm_object;
return (0);
}
static int
shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
int error;
error = 0;
shmfd = fp->f_data;
mtx_lock(&shm_timestamp_lock);
/*
* SUSv4 says that x bits of permission need not be affected.
* Be consistent with our shm_open there.
*/
#ifdef MAC
error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
if (error != 0)
goto out;
#endif
error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
shmfd->shm_gid, VADMIN, active_cred, NULL);
if (error != 0)
goto out;
shmfd->shm_mode = mode & ACCESSPERMS;
out:
mtx_unlock(&shm_timestamp_lock);
return (error);
}
static int
shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct shmfd *shmfd;
int error;
error = 0;
shmfd = fp->f_data;
mtx_lock(&shm_timestamp_lock);
#ifdef MAC
error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
if (error != 0)
goto out;
#endif
if (uid == (uid_t)-1)
uid = shmfd->shm_uid;
if (gid == (gid_t)-1)
gid = shmfd->shm_gid;
if (((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
(gid != shmfd->shm_gid && !groupmember(gid, active_cred))) &&
(error = priv_check_cred(active_cred, PRIV_VFS_CHOWN, 0)))
goto out;
shmfd->shm_uid = uid;
shmfd->shm_gid = gid;
out:
mtx_unlock(&shm_timestamp_lock);
return (error);
}
Index: head/sys/kern/uipc_syscalls.c
===================================================================
--- head/sys/kern/uipc_syscalls.c (revision 225616)
+++ head/sys/kern/uipc_syscalls.c (revision 225617)
@@ -1,2766 +1,2766 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* sendfile(2) and related extensions:
* Copyright (c) 1998, David Greenman. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_sctp.h"
#include "opt_compat.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/event.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/mount.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/sf_buf.h>
#include <sys/sysent.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_util.h>
#endif
#include <net/vnet.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pageout.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#if defined(INET) || defined(INET6)
#ifdef SCTP
#include <netinet/sctp.h>
#include <netinet/sctp_peeloff.h>
#endif /* SCTP */
#endif /* INET || INET6 */
static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
static int accept1(struct thread *td, struct accept_args *uap, int compat);
static int do_sendfile(struct thread *td, struct sendfile_args *uap, int compat);
static int getsockname1(struct thread *td, struct getsockname_args *uap,
int compat);
static int getpeername1(struct thread *td, struct getpeername_args *uap,
int compat);
/*
* NSFBUFS-related variables and associated sysctls
*/
int nsfbufs;
int nsfbufspeak;
int nsfbufsused;
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
"Maximum number of sendfile(2) sf_bufs available");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
"Number of sendfile(2) sf_bufs at peak usage");
SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufsused, CTLFLAG_RD, &nsfbufsused, 0,
"Number of sendfile(2) sf_bufs in use");
/*
* Convert a user file descriptor to a kernel file entry and check that, if
* it is a capability, the right rights are present. A reference on the file
* entry is held upon returning.
*/
static int
getsock_cap(struct filedesc *fdp, int fd, cap_rights_t rights,
struct file **fpp, u_int *fflagp)
{
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
int error;
#endif
fp = NULL;
if ((fdp == NULL) || ((fp = fget_unlocked(fdp, fd)) == NULL))
return (EBADF);
#ifdef CAPABILITIES
/*
* If the file descriptor is for a capability, test rights and use
* the file descriptor referenced by the capability.
*/
error = cap_funwrap(fp, rights, &fp_fromcap);
if (error) {
fdrop(fp, curthread);
return (error);
}
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, curthread);
fp = fp_fromcap;
}
#endif /* CAPABILITIES */
if (fp->f_type != DTYPE_SOCKET) {
fdrop(fp, curthread);
return (ENOTSOCK);
}
if (fflagp != NULL)
*fflagp = fp->f_flag;
*fpp = fp;
return (0);
}
/*
* System call interface to the socket abstraction.
*/
#if defined(COMPAT_43)
#define COMPAT_OLDSOCK
#endif
int
-socket(td, uap)
+sys_socket(td, uap)
struct thread *td;
struct socket_args /* {
int domain;
int type;
int protocol;
} */ *uap;
{
struct filedesc *fdp;
struct socket *so;
struct file *fp;
int fd, error;
AUDIT_ARG_SOCKET(uap->domain, uap->type, uap->protocol);
#ifdef MAC
error = mac_socket_check_create(td->td_ucred, uap->domain, uap->type,
uap->protocol);
if (error)
return (error);
#endif
fdp = td->td_proc->p_fd;
error = falloc(td, &fp, &fd, 0);
if (error)
return (error);
/* An extra reference on `fp' has been held for us by falloc(). */
error = socreate(uap->domain, &so, uap->type, uap->protocol,
td->td_ucred, td);
if (error) {
fdclose(fdp, fp, fd, td);
} else {
finit(fp, FREAD | FWRITE, DTYPE_SOCKET, so, &socketops);
td->td_retval[0] = fd;
}
fdrop(fp, td);
return (error);
}
/* ARGSUSED */
int
-bind(td, uap)
+sys_bind(td, uap)
struct thread *td;
struct bind_args /* {
int s;
caddr_t name;
int namelen;
} */ *uap;
{
struct sockaddr *sa;
int error;
if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
return (error);
error = kern_bind(td, uap->s, sa);
free(sa, M_SONAME);
return (error);
}
int
kern_bind(td, fd, sa)
struct thread *td;
int fd;
struct sockaddr *sa;
{
struct socket *so;
struct file *fp;
int error;
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_BIND, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
#ifdef MAC
error = mac_socket_check_bind(td->td_ucred, so, sa);
if (error == 0)
#endif
error = sobind(so, sa, td);
fdrop(fp, td);
return (error);
}
/* ARGSUSED */
int
-listen(td, uap)
+sys_listen(td, uap)
struct thread *td;
struct listen_args /* {
int s;
int backlog;
} */ *uap;
{
struct socket *so;
struct file *fp;
int error;
AUDIT_ARG_FD(uap->s);
error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_LISTEN, &fp, NULL);
if (error == 0) {
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_listen(td->td_ucred, so);
if (error == 0)
#endif
error = solisten(so, uap->backlog, td);
fdrop(fp, td);
}
return (error);
}
/*
* accept1()
*/
static int
accept1(td, uap, compat)
struct thread *td;
struct accept_args /* {
int s;
struct sockaddr * __restrict name;
socklen_t * __restrict anamelen;
} */ *uap;
int compat;
{
struct sockaddr *name;
socklen_t namelen;
struct file *fp;
int error;
if (uap->name == NULL)
return (kern_accept(td, uap->s, NULL, NULL, NULL));
error = copyin(uap->anamelen, &namelen, sizeof (namelen));
if (error)
return (error);
error = kern_accept(td, uap->s, &name, &namelen, &fp);
/*
* return a namelen of zero for older code which might
* ignore the return value from accept.
*/
if (error) {
(void) copyout(&namelen,
uap->anamelen, sizeof(*uap->anamelen));
return (error);
}
if (error == 0 && name != NULL) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)name)->sa_family =
name->sa_family;
#endif
error = copyout(name, uap->name, namelen);
}
if (error == 0)
error = copyout(&namelen, uap->anamelen,
sizeof(namelen));
if (error)
fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
fdrop(fp, td);
free(name, M_SONAME);
return (error);
}
int
kern_accept(struct thread *td, int s, struct sockaddr **name,
socklen_t *namelen, struct file **fp)
{
struct filedesc *fdp;
struct file *headfp, *nfp = NULL;
struct sockaddr *sa = NULL;
int error;
struct socket *head, *so;
int fd;
u_int fflag;
pid_t pgid;
int tmp;
if (name) {
*name = NULL;
if (*namelen < 0)
return (EINVAL);
}
AUDIT_ARG_FD(s);
fdp = td->td_proc->p_fd;
error = getsock_cap(fdp, s, CAP_ACCEPT, &headfp, &fflag);
if (error)
return (error);
head = headfp->f_data;
if ((head->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto done;
}
#ifdef MAC
error = mac_socket_check_accept(td->td_ucred, head);
if (error != 0)
goto done;
#endif
error = falloc(td, &nfp, &fd, 0);
if (error)
goto done;
ACCEPT_LOCK();
if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->so_comp)) {
ACCEPT_UNLOCK();
error = EWOULDBLOCK;
goto noconnection;
}
while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0) {
if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
head->so_error = ECONNABORTED;
break;
}
error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
"accept", 0);
if (error) {
ACCEPT_UNLOCK();
goto noconnection;
}
}
if (head->so_error) {
error = head->so_error;
head->so_error = 0;
ACCEPT_UNLOCK();
goto noconnection;
}
so = TAILQ_FIRST(&head->so_comp);
KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
/*
* Before changing the flags on the socket, we have to bump the
* reference count. Otherwise, if the protocol calls sofree(),
* the socket will be released due to a zero refcount.
*/
SOCK_LOCK(so); /* soref() and so_state update */
soref(so); /* file descriptor reference */
TAILQ_REMOVE(&head->so_comp, so, so_list);
head->so_qlen--;
so->so_state |= (head->so_state & SS_NBIO);
so->so_qstate &= ~SQ_COMP;
so->so_head = NULL;
SOCK_UNLOCK(so);
ACCEPT_UNLOCK();
/* An extra reference on `nfp' has been held for us by falloc(). */
td->td_retval[0] = fd;
/* connection has been removed from the listen queue */
KNOTE_UNLOCKED(&head->so_rcv.sb_sel.si_note, 0);
pgid = fgetown(&head->so_sigio);
if (pgid != 0)
fsetown(pgid, &so->so_sigio);
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
/* Sync socket nonblocking/async state with file flags */
tmp = fflag & FNONBLOCK;
(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
tmp = fflag & FASYNC;
(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
sa = 0;
error = soaccept(so, &sa);
if (error) {
/*
* return a namelen of zero for older code which might
* ignore the return value from accept.
*/
if (name)
*namelen = 0;
goto noconnection;
}
if (sa == NULL) {
if (name)
*namelen = 0;
goto done;
}
if (name) {
/* check sa_len before it is destroyed */
if (*namelen > sa->sa_len)
*namelen = sa->sa_len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
*name = sa;
sa = NULL;
}
noconnection:
if (sa)
free(sa, M_SONAME);
/*
* close the new descriptor, assuming someone hasn't ripped it
* out from under us.
*/
if (error)
fdclose(fdp, nfp, fd, td);
/*
* Release explicitly held references before returning. We return
* a reference on nfp to the caller on success if they request it.
*/
done:
if (fp != NULL) {
if (error == 0) {
*fp = nfp;
nfp = NULL;
} else
*fp = NULL;
}
if (nfp != NULL)
fdrop(nfp, td);
fdrop(headfp, td);
return (error);
}
int
-accept(td, uap)
+sys_accept(td, uap)
struct thread *td;
struct accept_args *uap;
{
return (accept1(td, uap, 0));
}
#ifdef COMPAT_OLDSOCK
int
oaccept(td, uap)
struct thread *td;
struct accept_args *uap;
{
return (accept1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */
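/*
 * Userland sketch (assumed, not from this file) of the server-side
 * sequence handled by sys_socket(), sys_bind(), sys_listen() and
 * sys_accept()/kern_accept() above.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>
#include <unistd.h>

int
example_listener(unsigned short port)
{
	struct sockaddr_in sin;
	int s, c;

	s = socket(AF_INET, SOCK_STREAM, 0);
	if (s == -1)
		return (-1);
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_len = sizeof(sin);		/* BSD sockaddr length field */
	sin.sin_port = htons(port);		/* INADDR_ANY is already zero */
	if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) == -1 ||
	    listen(s, 5) == -1) {
		close(s);
		return (-1);
	}
	c = accept(s, NULL, NULL);		/* blocks until a connection arrives */
	close(s);
	return (c);
}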
/* ARGSUSED */
int
-connect(td, uap)
+sys_connect(td, uap)
struct thread *td;
struct connect_args /* {
int s;
caddr_t name;
int namelen;
} */ *uap;
{
struct sockaddr *sa;
int error;
error = getsockaddr(&sa, uap->name, uap->namelen);
if (error)
return (error);
error = kern_connect(td, uap->s, sa);
free(sa, M_SONAME);
return (error);
}
int
kern_connect(td, fd, sa)
struct thread *td;
int fd;
struct sockaddr *sa;
{
struct socket *so;
struct file *fp;
int error;
int interrupted = 0;
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_CONNECT, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
if (so->so_state & SS_ISCONNECTING) {
error = EALREADY;
goto done1;
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(sa);
#endif
#ifdef MAC
error = mac_socket_check_connect(td->td_ucred, so, sa);
if (error)
goto bad;
#endif
error = soconnect(so, sa, td);
if (error)
goto bad;
if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
error = EINPROGRESS;
goto done1;
}
SOCK_LOCK(so);
while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
error = msleep(&so->so_timeo, SOCK_MTX(so), PSOCK | PCATCH,
"connec", 0);
if (error) {
if (error == EINTR || error == ERESTART)
interrupted = 1;
break;
}
}
if (error == 0) {
error = so->so_error;
so->so_error = 0;
}
SOCK_UNLOCK(so);
bad:
if (!interrupted)
so->so_state &= ~SS_ISCONNECTING;
if (error == ERESTART)
error = EINTR;
done1:
fdrop(fp, td);
return (error);
}
int
kern_socketpair(struct thread *td, int domain, int type, int protocol,
int *rsv)
{
struct filedesc *fdp = td->td_proc->p_fd;
struct file *fp1, *fp2;
struct socket *so1, *so2;
int fd, error;
AUDIT_ARG_SOCKET(domain, type, protocol);
#ifdef MAC
/* We might want to have a separate check for socket pairs. */
error = mac_socket_check_create(td->td_ucred, domain, type,
protocol);
if (error)
return (error);
#endif
error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
if (error)
return (error);
error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
if (error)
goto free1;
/* On success, extra references to 'fp1' and 'fp2' are held by falloc(). */
error = falloc(td, &fp1, &fd, 0);
if (error)
goto free2;
rsv[0] = fd;
fp1->f_data = so1; /* so1 already has ref count */
error = falloc(td, &fp2, &fd, 0);
if (error)
goto free3;
fp2->f_data = so2; /* so2 already has ref count */
rsv[1] = fd;
error = soconnect2(so1, so2);
if (error)
goto free4;
if (type == SOCK_DGRAM) {
/*
* Datagram socket connection is asymmetric.
*/
error = soconnect2(so2, so1);
if (error)
goto free4;
}
finit(fp1, FREAD | FWRITE, DTYPE_SOCKET, fp1->f_data, &socketops);
finit(fp2, FREAD | FWRITE, DTYPE_SOCKET, fp2->f_data, &socketops);
fdrop(fp1, td);
fdrop(fp2, td);
return (0);
free4:
fdclose(fdp, fp2, rsv[1], td);
fdrop(fp2, td);
free3:
fdclose(fdp, fp1, rsv[0], td);
fdrop(fp1, td);
free2:
if (so2 != NULL)
(void)soclose(so2);
free1:
if (so1 != NULL)
(void)soclose(so1);
return (error);
}
int
-socketpair(struct thread *td, struct socketpair_args *uap)
+sys_socketpair(struct thread *td, struct socketpair_args *uap)
{
int error, sv[2];
error = kern_socketpair(td, uap->domain, uap->type,
uap->protocol, sv);
if (error)
return (error);
error = copyout(sv, uap->rsv, 2 * sizeof(int));
if (error) {
(void)kern_close(td, sv[0]);
(void)kern_close(td, sv[1]);
}
return (error);
}
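/*
 * Userland sketch (assumed): socketpair(2) as served by
 * kern_socketpair() above; the two descriptors are connected to each
 * other, so a byte written on one end is read back from the other.
 */
#include <sys/socket.h>
#include <unistd.h>

int
example_pair(void)
{
	int sv[2];
	char b;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) == -1)
		return (-1);
	(void)write(sv[0], "x", 1);
	(void)read(sv[1], &b, 1);		/* receives the byte written above */
	close(sv[0]);
	close(sv[1]);
	return (0);
}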
static int
sendit(td, s, mp, flags)
struct thread *td;
int s;
struct msghdr *mp;
int flags;
{
struct mbuf *control;
struct sockaddr *to;
int error;
#ifdef CAPABILITY_MODE
if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
return (ECAPMODE);
#endif
if (mp->msg_name != NULL) {
error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
if (error) {
to = NULL;
goto bad;
}
mp->msg_name = to;
} else {
to = NULL;
}
if (mp->msg_control) {
if (mp->msg_controllen < sizeof(struct cmsghdr)
#ifdef COMPAT_OLDSOCK
&& mp->msg_flags != MSG_COMPAT
#endif
) {
error = EINVAL;
goto bad;
}
error = sockargs(&control, mp->msg_control,
mp->msg_controllen, MT_CONTROL);
if (error)
goto bad;
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags == MSG_COMPAT) {
struct cmsghdr *cm;
M_PREPEND(control, sizeof(*cm), M_WAIT);
cm = mtod(control, struct cmsghdr *);
cm->cmsg_len = control->m_len;
cm->cmsg_level = SOL_SOCKET;
cm->cmsg_type = SCM_RIGHTS;
}
#endif
} else {
control = NULL;
}
error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
bad:
if (to)
free(to, M_SONAME);
return (error);
}
int
kern_sendit(td, s, mp, flags, control, segflg)
struct thread *td;
int s;
struct msghdr *mp;
int flags;
struct mbuf *control;
enum uio_seg segflg;
{
struct file *fp;
struct uio auio;
struct iovec *iov;
struct socket *so;
int i;
int len, error;
cap_rights_t rights;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
AUDIT_ARG_FD(s);
rights = CAP_WRITE;
if (mp->msg_name != NULL)
rights |= CAP_CONNECT;
error = getsock_cap(td->td_proc->p_fd, s, rights, &fp, NULL);
if (error)
return (error);
so = (struct socket *)fp->f_data;
#ifdef KTRACE
if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(mp->msg_name);
#endif
#ifdef MAC
if (mp->msg_name != NULL) {
error = mac_socket_check_connect(td->td_ucred, so,
mp->msg_name);
if (error)
goto bad;
}
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto bad;
#endif
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
auio.uio_segflg = segflg;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
if ((auio.uio_resid += iov->iov_len) < 0) {
error = EINVAL;
goto bad;
}
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
#endif
len = auio.uio_resid;
error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Generation of SIGPIPE can be controlled per socket */
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
!(flags & MSG_NOSIGNAL)) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
if (error == 0)
td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = td->td_retval[0];
ktrgenio(s, UIO_WRITE, ktruio, error);
}
#endif
bad:
fdrop(fp, td);
return (error);
}
int
-sendto(td, uap)
+sys_sendto(td, uap)
struct thread *td;
struct sendto_args /* {
int s;
caddr_t buf;
size_t len;
int flags;
caddr_t to;
int tolen;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
msg.msg_name = uap->to;
msg.msg_namelen = uap->tolen;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
msg.msg_flags = 0;
#endif
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
error = sendit(td, uap->s, &msg, uap->flags);
return (error);
}
#ifdef COMPAT_OLDSOCK
int
osend(td, uap)
struct thread *td;
struct osend_args /* {
int s;
caddr_t buf;
int len;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = 0;
error = sendit(td, uap->s, &msg, uap->flags);
return (error);
}
int
osendmsg(td, uap)
struct thread *td;
struct osendmsg_args /* {
int s;
caddr_t msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_iov = iov;
msg.msg_flags = MSG_COMPAT;
error = sendit(td, uap->s, &msg, uap->flags);
free(iov, M_IOV);
return (error);
}
#endif
int
-sendmsg(td, uap)
+sys_sendmsg(td, uap)
struct thread *td;
struct sendmsg_args /* {
int s;
caddr_t msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (msg));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
msg.msg_flags = 0;
#endif
error = sendit(td, uap->s, &msg, uap->flags);
free(iov, M_IOV);
return (error);
}
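/*
 * Common receive path: fill in the caller-supplied msghdr from the socket.
 * The source address is returned via mp->msg_name ('fromseg' selects a
 * userland or kernel copy); received control data is copied out to
 * mp->msg_control, or handed back as an mbuf chain when 'controlp' is
 * non-NULL.
 */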
int
kern_recvit(td, s, mp, fromseg, controlp)
struct thread *td;
int s;
struct msghdr *mp;
enum uio_seg fromseg;
struct mbuf **controlp;
{
struct uio auio;
struct iovec *iov;
int i;
socklen_t len;
int error;
struct mbuf *m, *control = 0;
caddr_t ctlbuf;
struct file *fp;
struct socket *so;
struct sockaddr *fromsa = 0;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
if (controlp != NULL)
*controlp = NULL;
AUDIT_ARG_FD(s);
error = getsock_cap(td->td_proc->p_fd, s, CAP_READ, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_receive(td->td_ucred, so);
if (error) {
fdrop(fp, td);
return (error);
}
#endif
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
if ((auio.uio_resid += iov->iov_len) < 0) {
fdrop(fp, td);
return (EINVAL);
}
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
#endif
len = auio.uio_resid;
error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
(mp->msg_control || controlp) ? &control : (struct mbuf **)0,
&mp->msg_flags);
if (error) {
if (auio.uio_resid != (int)len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
}
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = (int)len - auio.uio_resid;
ktrgenio(s, UIO_READ, ktruio, error);
}
#endif
if (error)
goto out;
td->td_retval[0] = (int)len - auio.uio_resid;
if (mp->msg_name) {
len = mp->msg_namelen;
if (len <= 0 || fromsa == 0)
len = 0;
else {
/* save sa_len before it is destroyed by MSG_COMPAT */
len = MIN(len, fromsa->sa_len);
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags & MSG_COMPAT)
((struct osockaddr *)fromsa)->sa_family =
fromsa->sa_family;
#endif
if (fromseg == UIO_USERSPACE) {
error = copyout(fromsa, mp->msg_name,
(unsigned)len);
if (error)
goto out;
} else
bcopy(fromsa, mp->msg_name, len);
}
mp->msg_namelen = len;
}
if (mp->msg_control && controlp == NULL) {
#ifdef COMPAT_OLDSOCK
/*
* We assume that old recvmsg calls won't receive access
* rights and other control info, esp. as control info
* is always optional and those options didn't exist in 4.3.
* If we receive rights, trim the cmsghdr; anything else
* is tossed.
*/
if (control && mp->msg_flags & MSG_COMPAT) {
if (mtod(control, struct cmsghdr *)->cmsg_level !=
SOL_SOCKET ||
mtod(control, struct cmsghdr *)->cmsg_type !=
SCM_RIGHTS) {
mp->msg_controllen = 0;
goto out;
}
control->m_len -= sizeof (struct cmsghdr);
control->m_data += sizeof (struct cmsghdr);
}
#endif
len = mp->msg_controllen;
m = control;
mp->msg_controllen = 0;
ctlbuf = mp->msg_control;
while (m && len > 0) {
unsigned int tocopy;
if (len >= m->m_len)
tocopy = m->m_len;
else {
mp->msg_flags |= MSG_CTRUNC;
tocopy = len;
}
if ((error = copyout(mtod(m, caddr_t),
ctlbuf, tocopy)) != 0)
goto out;
ctlbuf += tocopy;
len -= tocopy;
m = m->m_next;
}
mp->msg_controllen = ctlbuf - (caddr_t)mp->msg_control;
}
out:
fdrop(fp, td);
#ifdef KTRACE
if (fromsa && KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(fromsa);
#endif
if (fromsa)
free(fromsa, M_SONAME);
if (error == 0 && controlp != NULL)
*controlp = control;
else if (control)
m_freem(control);
return (error);
}
static int
recvit(td, s, mp, namelenp)
struct thread *td;
int s;
struct msghdr *mp;
void *namelenp;
{
int error;
error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
if (error)
return (error);
if (namelenp) {
error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags & MSG_COMPAT)
error = 0; /* old recvfrom didn't check */
#endif
}
return (error);
}
int
-recvfrom(td, uap)
+sys_recvfrom(td, uap)
struct thread *td;
struct recvfrom_args /* {
int s;
caddr_t buf;
size_t len;
int flags;
struct sockaddr * __restrict from;
socklen_t * __restrict fromlenaddr;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
if (uap->fromlenaddr) {
error = copyin(uap->fromlenaddr,
&msg.msg_namelen, sizeof (msg.msg_namelen));
if (error)
goto done2;
} else {
msg.msg_namelen = 0;
}
msg.msg_name = uap->from;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = uap->flags;
error = recvit(td, uap->s, &msg, uap->fromlenaddr);
done2:
return (error);
}
#ifdef COMPAT_OLDSOCK
int
orecvfrom(td, uap)
struct thread *td;
struct recvfrom_args *uap;
{
uap->flags |= MSG_COMPAT;
- return (recvfrom(td, uap));
+ return (sys_recvfrom(td, uap));
}
#endif
#ifdef COMPAT_OLDSOCK
int
orecv(td, uap)
struct thread *td;
struct orecv_args /* {
int s;
caddr_t buf;
int len;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec aiov;
int error;
msg.msg_name = 0;
msg.msg_namelen = 0;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
aiov.iov_base = uap->buf;
aiov.iov_len = uap->len;
msg.msg_control = 0;
msg.msg_flags = uap->flags;
error = recvit(td, uap->s, &msg, NULL);
return (error);
}
/*
* Old recvmsg. This code takes advantage of the fact that the old msghdr
* overlays the new one, missing only the flags, and with the (old) access
* rights where the control fields are now.
*/
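/*
 * In other words (sketch): struct omsghdr has msg_accrights and
 * msg_accrightslen where struct msghdr now has msg_control and
 * msg_controllen, and it lacks msg_flags, so the copyin() of
 * sizeof(struct omsghdr) below yields a usable prefix of the new layout.
 */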
int
orecvmsg(td, uap)
struct thread *td;
struct orecvmsg_args /* {
int s;
struct omsghdr *msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_flags = uap->flags | MSG_COMPAT;
msg.msg_iov = iov;
error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
if (msg.msg_controllen && error == 0)
error = copyout(&msg.msg_controllen,
&uap->msg->msg_accrightslen, sizeof (int));
free(iov, M_IOV);
return (error);
}
#endif
int
-recvmsg(td, uap)
+sys_recvmsg(td, uap)
struct thread *td;
struct recvmsg_args /* {
int s;
struct msghdr *msg;
int flags;
} */ *uap;
{
struct msghdr msg;
struct iovec *uiov, *iov;
int error;
error = copyin(uap->msg, &msg, sizeof (msg));
if (error)
return (error);
error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
if (error)
return (error);
msg.msg_flags = uap->flags;
#ifdef COMPAT_OLDSOCK
msg.msg_flags &= ~MSG_COMPAT;
#endif
uiov = msg.msg_iov;
msg.msg_iov = iov;
error = recvit(td, uap->s, &msg, NULL);
if (error == 0) {
msg.msg_iov = uiov;
error = copyout(&msg, uap->msg, sizeof(msg));
}
free(iov, M_IOV);
return (error);
}
/* ARGSUSED */
int
-shutdown(td, uap)
+sys_shutdown(td, uap)
struct thread *td;
struct shutdown_args /* {
int s;
int how;
} */ *uap;
{
struct socket *so;
struct file *fp;
int error;
AUDIT_ARG_FD(uap->s);
error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_SHUTDOWN, &fp,
NULL);
if (error == 0) {
so = fp->f_data;
error = soshutdown(so, uap->how);
fdrop(fp, td);
}
return (error);
}
/* ARGSUSED */
int
-setsockopt(td, uap)
+sys_setsockopt(td, uap)
struct thread *td;
struct setsockopt_args /* {
int s;
int level;
int name;
caddr_t val;
int valsize;
} */ *uap;
{
return (kern_setsockopt(td, uap->s, uap->level, uap->name,
uap->val, UIO_USERSPACE, uap->valsize));
}
int
kern_setsockopt(td, s, level, name, val, valseg, valsize)
struct thread *td;
int s;
int level;
int name;
void *val;
enum uio_seg valseg;
socklen_t valsize;
{
int error;
struct socket *so;
struct file *fp;
struct sockopt sopt;
if (val == NULL && valsize != 0)
return (EFAULT);
if ((int)valsize < 0)
return (EINVAL);
sopt.sopt_dir = SOPT_SET;
sopt.sopt_level = level;
sopt.sopt_name = name;
sopt.sopt_val = val;
sopt.sopt_valsize = valsize;
switch (valseg) {
case UIO_USERSPACE:
sopt.sopt_td = td;
break;
case UIO_SYSSPACE:
sopt.sopt_td = NULL;
break;
default:
panic("kern_setsockopt called with bad valseg");
}
AUDIT_ARG_FD(s);
error = getsock_cap(td->td_proc->p_fd, s, CAP_SETSOCKOPT, &fp, NULL);
if (error == 0) {
so = fp->f_data;
error = sosetopt(so, &sopt);
fdrop(fp, td);
}
return (error);
}
/* ARGSUSED */
int
-getsockopt(td, uap)
+sys_getsockopt(td, uap)
struct thread *td;
struct getsockopt_args /* {
int s;
int level;
int name;
void * __restrict val;
socklen_t * __restrict avalsize;
} */ *uap;
{
socklen_t valsize;
int error;
if (uap->val) {
error = copyin(uap->avalsize, &valsize, sizeof (valsize));
if (error)
return (error);
}
error = kern_getsockopt(td, uap->s, uap->level, uap->name,
uap->val, UIO_USERSPACE, &valsize);
if (error == 0)
error = copyout(&valsize, uap->avalsize, sizeof (valsize));
return (error);
}
/*
* Kernel version of getsockopt.
* optval can be a userland or kernel pointer. optlen is always a kernel pointer.
*/
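/*
 * Usage sketch: an in-kernel caller reading an option into a kernel
 * buffer passes UIO_SYSSPACE, e.g.:
 *
 *	int val;
 *	socklen_t len = sizeof(val);
 *	error = kern_getsockopt(td, s, SOL_SOCKET, SO_SNDBUF,
 *	    &val, UIO_SYSSPACE, &len);
 */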
int
kern_getsockopt(td, s, level, name, val, valseg, valsize)
struct thread *td;
int s;
int level;
int name;
void *val;
enum uio_seg valseg;
socklen_t *valsize;
{
int error;
struct socket *so;
struct file *fp;
struct sockopt sopt;
if (val == NULL)
*valsize = 0;
if ((int)*valsize < 0)
return (EINVAL);
sopt.sopt_dir = SOPT_GET;
sopt.sopt_level = level;
sopt.sopt_name = name;
sopt.sopt_val = val;
sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
switch (valseg) {
case UIO_USERSPACE:
sopt.sopt_td = td;
break;
case UIO_SYSSPACE:
sopt.sopt_td = NULL;
break;
default:
panic("kern_getsockopt called with bad valseg");
}
AUDIT_ARG_FD(s);
error = getsock_cap(td->td_proc->p_fd, s, CAP_GETSOCKOPT, &fp, NULL);
if (error == 0) {
so = fp->f_data;
error = sogetopt(so, &sopt);
*valsize = sopt.sopt_valsize;
fdrop(fp, td);
}
return (error);
}
/*
* getsockname1() - Get socket name.
*/
/* ARGSUSED */
static int
getsockname1(td, uap, compat)
struct thread *td;
struct getsockname_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ *uap;
int compat;
{
struct sockaddr *sa;
socklen_t len;
int error;
error = copyin(uap->alen, &len, sizeof(len));
if (error)
return (error);
error = kern_getsockname(td, uap->fdes, &sa, &len);
if (error)
return (error);
if (len != 0) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
error = copyout(sa, uap->asa, (u_int)len);
}
free(sa, M_SONAME);
if (error == 0)
error = copyout(&len, uap->alen, sizeof(len));
return (error);
}
int
kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
socklen_t *alen)
{
struct socket *so;
struct file *fp;
socklen_t len;
int error;
if (*alen < 0)
return (EINVAL);
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETSOCKNAME, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
*sa = NULL;
CURVNET_SET(so->so_vnet);
error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
CURVNET_RESTORE();
if (error)
goto bad;
if (*sa == NULL)
len = 0;
else
len = MIN(*alen, (*sa)->sa_len);
*alen = len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(*sa);
#endif
bad:
fdrop(fp, td);
if (error && *sa) {
free(*sa, M_SONAME);
*sa = NULL;
}
return (error);
}
int
-getsockname(td, uap)
+sys_getsockname(td, uap)
struct thread *td;
struct getsockname_args *uap;
{
return (getsockname1(td, uap, 0));
}
#ifdef COMPAT_OLDSOCK
int
ogetsockname(td, uap)
struct thread *td;
struct getsockname_args *uap;
{
return (getsockname1(td, uap, 1));
}
#endif /* COMPAT_OLDSOCK */
/*
* getpeername1() - Get name of peer for connected socket.
*/
/* ARGSUSED */
static int
getpeername1(td, uap, compat)
struct thread *td;
struct getpeername_args /* {
int fdes;
struct sockaddr * __restrict asa;
socklen_t * __restrict alen;
} */ *uap;
int compat;
{
struct sockaddr *sa;
socklen_t len;
int error;
error = copyin(uap->alen, &len, sizeof (len));
if (error)
return (error);
error = kern_getpeername(td, uap->fdes, &sa, &len);
if (error)
return (error);
if (len != 0) {
#ifdef COMPAT_OLDSOCK
if (compat)
((struct osockaddr *)sa)->sa_family = sa->sa_family;
#endif
error = copyout(sa, uap->asa, (u_int)len);
}
free(sa, M_SONAME);
if (error == 0)
error = copyout(&len, uap->alen, sizeof(len));
return (error);
}
int
kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
socklen_t *alen)
{
struct socket *so;
struct file *fp;
socklen_t len;
int error;
if (*alen < 0)
return (EINVAL);
AUDIT_ARG_FD(fd);
error = getsock_cap(td->td_proc->p_fd, fd, CAP_GETPEERNAME, &fp, NULL);
if (error)
return (error);
so = fp->f_data;
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
error = ENOTCONN;
goto done;
}
*sa = NULL;
CURVNET_SET(so->so_vnet);
error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
CURVNET_RESTORE();
if (error)
goto bad;
if (*sa == NULL)
len = 0;
else
len = MIN(*alen, (*sa)->sa_len);
*alen = len;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(*sa);
#endif
bad:
if (error && *sa) {
free(*sa, M_SONAME);
*sa = NULL;
}
done:
fdrop(fp, td);
return (error);
}
int
-getpeername(td, uap)
+sys_getpeername(td, uap)
struct thread *td;
struct getpeername_args *uap;
{
return (getpeername1(td, uap, 0));
}
#ifdef COMPAT_OLDSOCK
int
ogetpeername(td, uap)
struct thread *td;
struct ogetpeername_args *uap;
{
/* XXX uap should have type `getpeername_args *' to begin with. */
return (getpeername1(td, (struct getpeername_args *)uap, 1));
}
#endif /* COMPAT_OLDSOCK */
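/*
 * Copy 'buflen' bytes from the user buffer into a newly allocated mbuf of
 * the given type (e.g. MT_SONAME or MT_CONTROL); for MT_SONAME the sa_len
 * field of the copied sockaddr is forced to the buffer length.
 */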
int
sockargs(mp, buf, buflen, type)
struct mbuf **mp;
caddr_t buf;
int buflen, type;
{
struct sockaddr *sa;
struct mbuf *m;
int error;
if ((u_int)buflen > MLEN) {
#ifdef COMPAT_OLDSOCK
if (type == MT_SONAME && (u_int)buflen <= 112)
buflen = MLEN; /* unix domain compat. hack */
else
#endif
if ((u_int)buflen > MCLBYTES)
return (EINVAL);
}
m = m_get(M_WAIT, type);
if ((u_int)buflen > MLEN)
MCLGET(m, M_WAIT);
m->m_len = buflen;
error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
if (error)
(void) m_free(m);
else {
*mp = m;
if (type == MT_SONAME) {
sa = mtod(m, struct sockaddr *);
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
sa->sa_family = sa->sa_len;
#endif
sa->sa_len = buflen;
}
}
return (error);
}
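/*
 * Copy a sockaddr of 'len' bytes in from userland into a freshly
 * malloc'ed M_SONAME buffer, rejecting lengths too small to hold the
 * header or larger than SOCK_MAXADDRLEN.
 */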
int
getsockaddr(namp, uaddr, len)
struct sockaddr **namp;
caddr_t uaddr;
size_t len;
{
struct sockaddr *sa;
int error;
if (len > SOCK_MAXADDRLEN)
return (ENAMETOOLONG);
if (len < offsetof(struct sockaddr, sa_data[0]))
return (EINVAL);
sa = malloc(len, M_SONAME, M_WAITOK);
error = copyin(uaddr, sa, len);
if (error) {
free(sa, M_SONAME);
} else {
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
sa->sa_family = sa->sa_len;
#endif
sa->sa_len = len;
*namp = sa;
}
return (error);
}
#include <sys/condvar.h>
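/*
 * State for SF_SYNC sendfile(2) requests: 'count' tracks sf_bufs still
 * referenced by the socket buffer; sf_buf_mext() signals 'cv' when the
 * last one is released, letting kern_sendfile() wait for completion.
 */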
struct sendfile_sync {
struct mtx mtx;
struct cv cv;
unsigned count;
};
/*
* Detach mapped page and release resources back to the system.
*/
void
sf_buf_mext(void *addr, void *args)
{
vm_page_t m;
struct sendfile_sync *sfs;
m = sf_buf_page(args);
sf_buf_free(args);
vm_page_lock(m);
vm_page_unwire(m, 0);
/*
* Check for the object going away on us. This can
* happen since we don't hold a reference to it.
* If so, we're responsible for freeing the page.
*/
if (m->wire_count == 0 && m->object == NULL)
vm_page_free(m);
vm_page_unlock(m);
if (addr == NULL)
return;
sfs = addr;
mtx_lock(&sfs->mtx);
KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
if (--sfs->count == 0)
cv_signal(&sfs->cv);
mtx_unlock(&sfs->mtx);
}
/*
* sendfile(2)
*
* int sendfile(int fd, int s, off_t offset, size_t nbytes,
* struct sf_hdtr *hdtr, off_t *sbytes, int flags)
*
* Send a file specified by 'fd' and starting at 'offset' to a socket
* specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
* 0. Optionally add a header and/or trailer to the socket output. If
* specified, write the total number of bytes sent into *sbytes.
*/
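/*
 * Userland usage sketch (hypothetical descriptors):
 *
 *	off_t sbytes;
 *	if (sendfile(filefd, sockfd, 0, 0, NULL, &sbytes, 0) == -1)
 *		err(1, "sendfile");
 */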
int
-sendfile(struct thread *td, struct sendfile_args *uap)
+sys_sendfile(struct thread *td, struct sendfile_args *uap)
{
return (do_sendfile(td, uap, 0));
}
static int
do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
{
struct sf_hdtr hdtr;
struct uio *hdr_uio, *trl_uio;
int error;
hdr_uio = trl_uio = NULL;
if (uap->hdtr != NULL) {
error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
if (error)
goto out;
if (hdtr.headers != NULL) {
error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
if (error)
goto out;
}
if (hdtr.trailers != NULL) {
error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
if (error)
goto out;
}
}
error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
out:
if (hdr_uio)
free(hdr_uio, M_IOV);
if (trl_uio)
free(trl_uio, M_IOV);
return (error);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
{
struct sendfile_args args;
args.fd = uap->fd;
args.s = uap->s;
args.offset = uap->offset;
args.nbytes = uap->nbytes;
args.hdtr = uap->hdtr;
args.sbytes = uap->sbytes;
args.flags = uap->flags;
return (do_sendfile(td, &args, 1));
}
#endif /* COMPAT_FREEBSD4 */
int
kern_sendfile(struct thread *td, struct sendfile_args *uap,
struct uio *hdr_uio, struct uio *trl_uio, int compat)
{
struct file *sock_fp;
struct vnode *vp;
struct vm_object *obj = NULL;
struct socket *so = NULL;
struct mbuf *m = NULL;
struct sf_buf *sf;
struct vm_page *pg;
off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
int error, hdrlen = 0, mnw = 0;
int vfslocked;
struct sendfile_sync *sfs = NULL;
/*
* The file descriptor must be a regular file and have a
* backing VM object.
* File offset must be positive. If it goes beyond EOF
* we send only the header/trailer and no payload data.
*/
AUDIT_ARG_FD(uap->fd);
if ((error = fgetvp_read(td, uap->fd, CAP_READ, &vp)) != 0)
goto out;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
if (vp->v_type == VREG) {
obj = vp->v_object;
if (obj != NULL) {
/*
* Temporarily increase the backing VM
* object's reference count so that a forced
* reclamation of its vnode does not
* immediately destroy it.
*/
VM_OBJECT_LOCK(obj);
if ((obj->flags & OBJ_DEAD) == 0) {
vm_object_reference_locked(obj);
VM_OBJECT_UNLOCK(obj);
} else {
VM_OBJECT_UNLOCK(obj);
obj = NULL;
}
}
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
if (obj == NULL) {
error = EINVAL;
goto out;
}
if (uap->offset < 0) {
error = EINVAL;
goto out;
}
/*
* The socket must be a stream socket and connected.
* Remember whether it is a blocking or non-blocking socket.
*/
if ((error = getsock_cap(td->td_proc->p_fd, uap->s, CAP_WRITE,
&sock_fp, NULL)) != 0)
goto out;
so = sock_fp->f_data;
if (so->so_type != SOCK_STREAM) {
error = EINVAL;
goto out;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
goto out;
}
/*
* Do not wait on memory allocations but return ENOMEM for
* caller to retry later.
* XXX: Experimental.
*/
if (uap->flags & SF_MNOWAIT)
mnw = 1;
if (uap->flags & SF_SYNC) {
sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
cv_init(&sfs->cv, "sendfile");
}
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto out;
#endif
/* If headers are specified copy them into mbufs. */
if (hdr_uio != NULL) {
hdr_uio->uio_td = td;
hdr_uio->uio_rw = UIO_WRITE;
if (hdr_uio->uio_resid > 0) {
/*
* In FBSD < 5.0 the nbytes to send also included
* the header. If compat is specified subtract the
* header size from nbytes.
*/
if (compat) {
if (uap->nbytes > hdr_uio->uio_resid)
uap->nbytes -= hdr_uio->uio_resid;
else
uap->nbytes = 0;
}
m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
0, 0, 0);
if (m == NULL) {
error = mnw ? EAGAIN : ENOBUFS;
goto out;
}
hdrlen = m_length(m, NULL);
}
}
/*
* Protect against multiple writers to the socket.
*
* XXXRW: Historically this has assumed non-interruptibility, so now
* we implement that, but possibly shouldn't.
*/
(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
/*
* Loop through the pages of the file, starting with the requested
* offset. Get a file page (do I/O if necessary), map the file page
* into an sf_buf, attach an mbuf header to the sf_buf, and queue
* it on the socket.
* This is done in two loops. The inner loop turns as many pages
* as it can, up to available socket buffer space, without blocking
* into mbufs to have it bulk delivered into the socket send buffer.
* The outer loop checks the state and available space of the socket
* and takes care of the overall progress.
*/
for (off = uap->offset, rem = uap->nbytes; ; ) {
int loopbytes = 0;
int space = 0;
int done = 0;
/*
* Check the socket state for ongoing connection,
* no errors and space in socket buffer.
* If space is low allow for the remainder of the
* file to be processed if it fits the socket buffer.
* Otherwise block in waiting for sufficient space
* to proceed, or if the socket is nonblocking, return
* to userland with EAGAIN while reporting how far
* we've come.
* We wait until the socket buffer has significant free
* space to do bulk sends. This makes good use of file
* system read ahead and allows packet segmentation
* offloading hardware to take over lots of work. If
* we were not careful here we would send off only one
* sfbuf at a time.
*/
SOCKBUF_LOCK(&so->so_snd);
if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
retry_space:
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
} else if (so->so_error) {
error = so->so_error;
so->so_error = 0;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
space = sbspace(&so->so_snd);
if (space < rem &&
(space <= 0 ||
space < so->so_snd.sb_lowat)) {
if (so->so_state & SS_NBIO) {
SOCKBUF_UNLOCK(&so->so_snd);
error = EAGAIN;
goto done;
}
/*
* sbwait drops the lock while sleeping.
* When we loop back to retry_space the
* state may have changed and we retest
* for it.
*/
error = sbwait(&so->so_snd);
/*
* An error from sbwait usually indicates that we've
* been interrupted by a signal. If we've sent anything
* then return bytes sent, otherwise return the error.
*/
if (error) {
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
goto retry_space;
}
SOCKBUF_UNLOCK(&so->so_snd);
/*
* Reduce space in the socket buffer by the size of
* the header mbuf chain.
* hdrlen is set to 0 after the first loop.
*/
space -= hdrlen;
/*
* Loop and construct maximum sized mbuf chain to be bulk
* dumped into socket buffer.
*/
while (space > loopbytes) {
vm_pindex_t pindex;
vm_offset_t pgoff;
struct mbuf *m0;
VM_OBJECT_LOCK(obj);
/*
* Calculate the amount to transfer.
* Not to exceed a page, the EOF,
* or the passed in nbytes.
*/
pgoff = (vm_offset_t)(off & PAGE_MASK);
xfsize = omin(PAGE_SIZE - pgoff,
obj->un_pager.vnp.vnp_size - uap->offset -
fsbytes - loopbytes);
if (uap->nbytes)
rem = (uap->nbytes - fsbytes - loopbytes);
else
rem = obj->un_pager.vnp.vnp_size -
uap->offset - fsbytes - loopbytes;
xfsize = omin(rem, xfsize);
xfsize = omin(space - loopbytes, xfsize);
if (xfsize <= 0) {
VM_OBJECT_UNLOCK(obj);
done = 1; /* all data sent */
break;
}
/*
* Attempt to look up the page. Allocate
* if not found or wait and loop if busy.
*/
pindex = OFF_TO_IDX(off);
pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
/*
* Check if page is valid for what we need,
* otherwise initiate I/O.
* If we already turned some pages into mbufs,
* send them off before we come here again and
* block.
*/
if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
VM_OBJECT_UNLOCK(obj);
else if (m != NULL)
error = EAGAIN; /* send what we already got */
else if (uap->flags & SF_NODISKIO)
error = EBUSY;
else {
int bsize, resid;
/*
* Ensure that our page is still around
* when the I/O completes.
*/
vm_page_io_start(pg);
VM_OBJECT_UNLOCK(obj);
/*
* Get the page from backing store.
*/
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_lock(vp, LK_SHARED);
if (error != 0)
goto after_read;
bsize = vp->v_mount->mnt_stat.f_iosize;
/*
* XXXMAC: Because we don't have fp->f_cred
* here, we pass in NOCRED. This is probably
* wrong, but is consistent with our original
* implementation.
*/
error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
td->td_ucred, NOCRED, &resid, td);
VOP_UNLOCK(vp, 0);
after_read:
VFS_UNLOCK_GIANT(vfslocked);
VM_OBJECT_LOCK(obj);
vm_page_io_finish(pg);
if (!error)
VM_OBJECT_UNLOCK(obj);
mbstat.sf_iocnt++;
}
if (error) {
vm_page_lock(pg);
vm_page_unwire(pg, 0);
/*
* See if anyone else might know about
* this page. If not and it is not valid,
* then free it.
*/
if (pg->wire_count == 0 && pg->valid == 0 &&
pg->busy == 0 && !(pg->oflags & VPO_BUSY))
vm_page_free(pg);
vm_page_unlock(pg);
VM_OBJECT_UNLOCK(obj);
if (error == EAGAIN)
error = 0; /* not a real error */
break;
}
/*
* Get a sendfile buf. When allocating the
* first buffer for mbuf chain, we usually
* wait as long as necessary, but this wait
* can be interrupted. For subsequent
* buffers, do not sleep, since several
* threads might exhaust the buffers and then
* deadlock.
*/
sf = sf_buf_alloc(pg, (mnw || m != NULL) ? SFB_NOWAIT :
SFB_CATCH);
if (sf == NULL) {
mbstat.sf_allocfail++;
vm_page_lock(pg);
vm_page_unwire(pg, 0);
KASSERT(pg->object != NULL,
("kern_sendfile: object disappeared"));
vm_page_unlock(pg);
if (m == NULL)
error = (mnw ? EAGAIN : EINTR);
break;
}
/*
* Get an mbuf and set it up as having
* external storage.
*/
m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
if (m0 == NULL) {
error = (mnw ? EAGAIN : ENOBUFS);
sf_buf_mext((void *)sf_buf_kva(sf), sf);
break;
}
MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
sfs, sf, M_RDONLY, EXT_SFBUF);
m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
m0->m_len = xfsize;
/* Append to mbuf chain. */
if (m != NULL)
m_cat(m, m0);
else
m = m0;
/* Keep track of bits processed. */
loopbytes += xfsize;
off += xfsize;
if (sfs != NULL) {
mtx_lock(&sfs->mtx);
sfs->count++;
mtx_unlock(&sfs->mtx);
}
}
/* Add the buffer chain to the socket buffer. */
if (m != NULL) {
int mlen, err;
mlen = m_length(m, NULL);
SOCKBUF_LOCK(&so->so_snd);
if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
error = EPIPE;
SOCKBUF_UNLOCK(&so->so_snd);
goto done;
}
SOCKBUF_UNLOCK(&so->so_snd);
CURVNET_SET(so->so_vnet);
/* Avoid error aliasing. */
err = (*so->so_proto->pr_usrreqs->pru_send)
(so, 0, m, NULL, NULL, td);
CURVNET_RESTORE();
if (err == 0) {
/*
* We need two counters to get the
* file offset and nbytes to send
* right:
* - sbytes contains the total amount
* of bytes sent, including headers.
* - fsbytes contains the total amount
* of bytes sent from the file.
*/
sbytes += mlen;
fsbytes += mlen;
if (hdrlen) {
fsbytes -= hdrlen;
hdrlen = 0;
}
} else if (error == 0)
error = err;
m = NULL; /* pru_send always consumes */
}
/* Quit outer loop on error or when we're done. */
if (done)
break;
if (error)
goto done;
}
/*
* Send trailers. Wimp out and use writev(2).
*/
if (trl_uio != NULL) {
sbunlock(&so->so_snd);
error = kern_writev(td, uap->s, trl_uio);
if (error == 0)
sbytes += td->td_retval[0];
goto out;
}
done:
sbunlock(&so->so_snd);
out:
/*
* If there was no error we have to clear td->td_retval[0]
* because it may have been set by writev.
*/
if (error == 0) {
td->td_retval[0] = 0;
}
if (uap->sbytes != NULL) {
copyout(&sbytes, uap->sbytes, sizeof(off_t));
}
if (obj != NULL)
vm_object_deallocate(obj);
if (vp != NULL) {
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
}
if (so)
fdrop(sock_fp, td);
if (m)
m_freem(m);
if (sfs != NULL) {
mtx_lock(&sfs->mtx);
if (sfs->count != 0)
cv_wait(&sfs->cv, &sfs->mtx);
KASSERT(sfs->count == 0, ("sendfile sync still busy"));
cv_destroy(&sfs->cv);
mtx_destroy(&sfs->mtx);
free(sfs, M_TEMP);
}
if (error == ERESTART)
error = EINTR;
return (error);
}
/*
* SCTP syscalls.
* Functionality only compiled in if SCTP is defined in the kernel Makefile,
* otherwise all return EOPNOTSUPP.
* XXX: We should make this loadable one day.
*/
int
-sctp_peeloff(td, uap)
+sys_sctp_peeloff(td, uap)
struct thread *td;
struct sctp_peeloff_args /* {
int sd;
caddr_t name;
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
struct filedesc *fdp;
struct file *nfp = NULL;
int error;
struct socket *head, *so;
int fd;
u_int fflag;
fdp = td->td_proc->p_fd;
AUDIT_ARG_FD(uap->sd);
error = fgetsock(td, uap->sd, CAP_PEELOFF, &head, &fflag);
if (error)
goto done2;
error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
if (error)
goto done2;
/*
* At this point we know we have an assoc to pull; we
* proceed to get the fd set up. This may block,
* but that is OK.
*/
error = falloc(td, &nfp, &fd, 0);
if (error)
goto done;
td->td_retval[0] = fd;
CURVNET_SET(head->so_vnet);
so = sonewconn(head, SS_ISCONNECTED);
if (so == NULL)
goto noconnection;
/*
* Before changing the flags on the socket, we have to bump the
* reference count. Otherwise, if the protocol calls sofree(),
* the socket will be released due to a zero refcount.
*/
SOCK_LOCK(so);
soref(so); /* file descriptor reference */
SOCK_UNLOCK(so);
ACCEPT_LOCK();
TAILQ_REMOVE(&head->so_comp, so, so_list);
head->so_qlen--;
so->so_state |= (head->so_state & SS_NBIO);
so->so_state &= ~SS_NOFDREF;
so->so_qstate &= ~SQ_COMP;
so->so_head = NULL;
ACCEPT_UNLOCK();
finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
if (error)
goto noconnection;
if (head->so_sigio != NULL)
fsetown(fgetown(&head->so_sigio), &so->so_sigio);
noconnection:
/*
* close the new descriptor, assuming someone hasn't ripped it
* out from under us.
*/
if (error)
fdclose(fdp, nfp, fd, td);
/*
* Release explicitly held references before returning.
*/
CURVNET_RESTORE();
done:
if (nfp != NULL)
fdrop(nfp, td);
fputsock(head);
done2:
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
int
-sctp_generic_sendmsg (td, uap)
+sys_sctp_generic_sendmsg (td, uap)
struct thread *td;
struct sctp_generic_sendmsg_args /* {
int sd,
caddr_t msg,
int mlen,
caddr_t to,
__socklen_t tolen,
struct sctp_sndrcvinfo *sinfo,
int flags
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
struct socket *so;
struct file *fp = NULL;
int error = 0, len;
struct sockaddr *to = NULL;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
struct uio auio;
struct iovec iov[1];
cap_rights_t rights;
if (uap->sinfo) {
error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
if (error)
return (error);
u_sinfo = &sinfo;
}
rights = CAP_WRITE;
if (uap->tolen) {
error = getsockaddr(&to, uap->to, uap->tolen);
if (error) {
to = NULL;
goto sctp_bad2;
}
rights |= CAP_CONNECT;
}
AUDIT_ARG_FD(uap->sd);
error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
if (error)
goto sctp_bad;
#ifdef KTRACE
if (to && (KTRPOINT(td, KTR_STRUCT)))
ktrsockaddr(to);
#endif
iov[0].iov_base = uap->msg;
iov[0].iov_len = uap->mlen;
so = (struct socket *)fp->f_data;
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto sctp_bad;
#endif /* MAC */
auio.uio_iov = iov;
auio.uio_iovcnt = 1;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
len = auio.uio_resid = uap->mlen;
CURVNET_SET(so->so_vnet);
error = sctp_lower_sosend(so, to, &auio,
(struct mbuf *)NULL, (struct mbuf *)NULL,
uap->flags, u_sinfo, td);
CURVNET_RESTORE();
if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Generation of SIGPIPE can be controlled per socket. */
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
!(uap->flags & MSG_NOSIGNAL)) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
if (error == 0)
td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = td->td_retval[0];
ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
}
#endif /* KTRACE */
sctp_bad:
if (fp)
fdrop(fp, td);
sctp_bad2:
if (to)
free(to, M_SONAME);
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
int
-sctp_generic_sendmsg_iov(td, uap)
+sys_sctp_generic_sendmsg_iov(td, uap)
struct thread *td;
struct sctp_generic_sendmsg_iov_args /* {
int sd,
struct iovec *iov,
int iovlen,
caddr_t to,
__socklen_t tolen,
struct sctp_sndrcvinfo *sinfo,
int flags
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
struct socket *so;
struct file *fp = NULL;
int error = 0, len, i;
struct sockaddr *to = NULL;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
struct uio auio;
struct iovec *iov, *tiov;
cap_rights_t rights;
if (uap->sinfo) {
error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
if (error)
return (error);
u_sinfo = &sinfo;
}
rights = CAP_WRITE;
if (uap->tolen) {
error = getsockaddr(&to, uap->to, uap->tolen);
if (error) {
to = NULL;
goto sctp_bad2;
}
rights |= CAP_CONNECT;
}
AUDIT_ARG_FD(uap->sd);
error = getsock_cap(td->td_proc->p_fd, uap->sd, rights, &fp, NULL);
if (error)
goto sctp_bad1;
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
uap->iovlen, &iov, EMSGSIZE);
else
#endif
error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
if (error)
goto sctp_bad1;
#ifdef KTRACE
if (to && (KTRPOINT(td, KTR_STRUCT)))
ktrsockaddr(to);
#endif
so = (struct socket *)fp->f_data;
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
if (error)
goto sctp_bad;
#endif /* MAC */
auio.uio_iov = iov;
auio.uio_iovcnt = uap->iovlen;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_WRITE;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
tiov = iov;
for (i = 0; i < uap->iovlen; i++, tiov++) {
if ((auio.uio_resid += tiov->iov_len) < 0) {
error = EINVAL;
goto sctp_bad;
}
}
len = auio.uio_resid;
CURVNET_SET(so->so_vnet);
error = sctp_lower_sosend(so, to, &auio,
(struct mbuf *)NULL, (struct mbuf *)NULL,
uap->flags, u_sinfo, td);
CURVNET_RESTORE();
if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
/* Generation of SIGPIPE can be controlled per socket */
if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
!(uap->flags & MSG_NOSIGNAL)) {
PROC_LOCK(td->td_proc);
tdsignal(td, SIGPIPE);
PROC_UNLOCK(td->td_proc);
}
}
if (error == 0)
td->td_retval[0] = len - auio.uio_resid;
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = td->td_retval[0];
ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
}
#endif /* KTRACE */
sctp_bad:
free(iov, M_IOV);
sctp_bad1:
if (fp)
fdrop(fp, td);
sctp_bad2:
if (to)
free(to, M_SONAME);
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
int
-sctp_generic_recvmsg(td, uap)
+sys_sctp_generic_recvmsg(td, uap)
struct thread *td;
struct sctp_generic_recvmsg_args /* {
int sd,
struct iovec *iov,
int iovlen,
struct sockaddr *from,
__socklen_t *fromlenaddr,
struct sctp_sndrcvinfo *sinfo,
int *msg_flags
} */ *uap;
{
#if (defined(INET) || defined(INET6)) && defined(SCTP)
uint8_t sockbufstore[256];
struct uio auio;
struct iovec *iov, *tiov;
struct sctp_sndrcvinfo sinfo;
struct socket *so;
struct file *fp = NULL;
struct sockaddr *fromsa;
int fromlen;
int len, i, msg_flags;
int error = 0;
#ifdef KTRACE
struct uio *ktruio = NULL;
#endif
AUDIT_ARG_FD(uap->sd);
error = getsock_cap(td->td_proc->p_fd, uap->sd, CAP_READ, &fp, NULL);
if (error) {
return (error);
}
#ifdef COMPAT_FREEBSD32
if (SV_CURPROC_FLAG(SV_ILP32))
error = freebsd32_copyiniov((struct iovec32 *)uap->iov,
uap->iovlen, &iov, EMSGSIZE);
else
#endif
error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
if (error)
goto out1;
so = fp->f_data;
#ifdef MAC
error = mac_socket_check_receive(td->td_ucred, so);
if (error) {
goto out;
}
#endif /* MAC */
if (uap->fromlenaddr) {
error = copyin(uap->fromlenaddr,
&fromlen, sizeof (fromlen));
if (error) {
goto out;
}
} else {
fromlen = 0;
}
if (uap->msg_flags) {
error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
if (error) {
goto out;
}
} else {
msg_flags = 0;
}
auio.uio_iov = iov;
auio.uio_iovcnt = uap->iovlen;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_rw = UIO_READ;
auio.uio_td = td;
auio.uio_offset = 0; /* XXX */
auio.uio_resid = 0;
tiov = iov;
for (i = 0; i < uap->iovlen; i++, tiov++) {
if ((auio.uio_resid += tiov->iov_len) < 0) {
error = EINVAL;
goto out;
}
}
len = auio.uio_resid;
fromsa = (struct sockaddr *)sockbufstore;
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO))
ktruio = cloneuio(&auio);
#endif /* KTRACE */
memset(&sinfo, 0, sizeof(struct sctp_sndrcvinfo));
CURVNET_SET(so->so_vnet);
error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
fromsa, fromlen, &msg_flags,
(struct sctp_sndrcvinfo *)&sinfo, 1);
CURVNET_RESTORE();
if (error) {
if (auio.uio_resid != (int)len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
} else {
if (uap->sinfo)
error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
}
#ifdef KTRACE
if (ktruio != NULL) {
ktruio->uio_resid = (int)len - auio.uio_resid;
ktrgenio(uap->sd, UIO_READ, ktruio, error);
}
#endif /* KTRACE */
if (error)
goto out;
td->td_retval[0] = (int)len - auio.uio_resid;
if (fromlen && uap->from) {
len = fromlen;
if (len <= 0 || fromsa == 0)
len = 0;
else {
len = MIN(len, fromsa->sa_len);
error = copyout(fromsa, uap->from, (unsigned)len);
if (error)
goto out;
}
error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
if (error) {
goto out;
}
}
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrsockaddr(fromsa);
#endif
if (uap->msg_flags) {
error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
if (error) {
goto out;
}
}
out:
free(iov, M_IOV);
out1:
if (fp)
fdrop(fp, td);
return (error);
#else /* SCTP */
return (EOPNOTSUPP);
#endif /* SCTP */
}
Index: head/sys/kern/vfs_acl.c
===================================================================
--- head/sys/kern/vfs_acl.c (revision 225616)
+++ head/sys/kern/vfs_acl.c (revision 225617)
@@ -1,577 +1,577 @@
/*-
* Copyright (c) 1999-2006 Robert N. M. Watson
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Developed by the TrustedBSD Project.
*
* ACL system calls and other functions common across different ACL types.
* Type-specific routines go into subr_acl_<type>.c.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/sysent.h>
#include <sys/acl.h>
#include <security/mac/mac_framework.h>
CTASSERT(ACL_MAX_ENTRIES >= OLDACL_MAX_ENTRIES);
MALLOC_DEFINE(M_ACL, "acl", "Access Control Lists");
static int vacl_set_acl(struct thread *td, struct vnode *vp,
acl_type_t type, struct acl *aclp);
static int vacl_get_acl(struct thread *td, struct vnode *vp,
acl_type_t type, struct acl *aclp);
static int vacl_aclcheck(struct thread *td, struct vnode *vp,
acl_type_t type, struct acl *aclp);
int
acl_copy_oldacl_into_acl(const struct oldacl *source, struct acl *dest)
{
int i;
if (source->acl_cnt < 0 || source->acl_cnt > OLDACL_MAX_ENTRIES)
return (EINVAL);
bzero(dest, sizeof(*dest));
dest->acl_cnt = source->acl_cnt;
dest->acl_maxcnt = ACL_MAX_ENTRIES;
for (i = 0; i < dest->acl_cnt; i++) {
dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
}
return (0);
}
int
acl_copy_acl_into_oldacl(const struct acl *source, struct oldacl *dest)
{
int i;
if (source->acl_cnt > OLDACL_MAX_ENTRIES)
return (EINVAL);
bzero(dest, sizeof(*dest));
dest->acl_cnt = source->acl_cnt;
for (i = 0; i < dest->acl_cnt; i++) {
dest->acl_entry[i].ae_tag = source->acl_entry[i].ae_tag;
dest->acl_entry[i].ae_id = source->acl_entry[i].ae_id;
dest->acl_entry[i].ae_perm = source->acl_entry[i].ae_perm;
}
return (0);
}
/*
* At one time, "struct acl" was extended in order to add support for NFSv4
* ACLs. Instead of creating compatibility versions of all the ACL-related
* syscalls, they were left intact. It's possible to find out what the code
* calling these syscalls (libc) expects based on the "type" argument - if it's
* either ACL_TYPE_ACCESS_OLD or ACL_TYPE_DEFAULT_OLD (which previously were
* known as ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT), then it's the "struct
* oldacl". If it's something else, then it's the new "struct acl". In the
* latter case, the routines below just copyin/copyout the contents. In the
* former case, they copyin the "struct oldacl" and convert it to the new
* format.
*/
static int
acl_copyin(void *user_acl, struct acl *kernel_acl, acl_type_t type)
{
int error;
struct oldacl old;
switch (type) {
case ACL_TYPE_ACCESS_OLD:
case ACL_TYPE_DEFAULT_OLD:
error = copyin(user_acl, &old, sizeof(old));
if (error != 0)
break;
acl_copy_oldacl_into_acl(&old, kernel_acl);
break;
default:
error = copyin(user_acl, kernel_acl, sizeof(*kernel_acl));
if (kernel_acl->acl_maxcnt != ACL_MAX_ENTRIES)
return (EINVAL);
}
return (error);
}
static int
acl_copyout(struct acl *kernel_acl, void *user_acl, acl_type_t type)
{
int error;
struct oldacl old;
switch (type) {
case ACL_TYPE_ACCESS_OLD:
case ACL_TYPE_DEFAULT_OLD:
error = acl_copy_acl_into_oldacl(kernel_acl, &old);
if (error != 0)
break;
error = copyout(&old, user_acl, sizeof(old));
break;
default:
if (fuword32((char *)user_acl +
offsetof(struct acl, acl_maxcnt)) != ACL_MAX_ENTRIES)
return (EINVAL);
error = copyout(kernel_acl, user_acl, sizeof(*kernel_acl));
}
return (error);
}
/*
* Convert "old" type - ACL_TYPE_{ACCESS,DEFAULT}_OLD - into its "new"
* counterpart. It's required for an old (pre-NFSv4 ACLs) libc to work
* with a new kernel. Fixing 'type' for old binaries with a new libc
* is being done in lib/libc/posix1e/acl_support.c:_acl_type_unold().
*/
static int
acl_type_unold(int type)
{
switch (type) {
case ACL_TYPE_ACCESS_OLD:
return (ACL_TYPE_ACCESS);
case ACL_TYPE_DEFAULT_OLD:
return (ACL_TYPE_DEFAULT);
default:
return (type);
}
}
/*
* These calls wrap the real vnode operations, and are called by the syscall
* code once the syscall has converted the path or file descriptor to a vnode
* (unlocked). The aclp pointer is assumed still to point to userland, so
* this should not be consumed within the kernel except by syscall code.
* Other code should directly invoke VOP_{SET,GET}ACL.
*/
/*
* Given a vnode, set its ACL.
*/
static int
vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
struct acl *aclp)
{
struct acl *inkernelacl;
struct mount *mp;
int error;
inkernelacl = acl_alloc(M_WAITOK);
error = acl_copyin(aclp, inkernelacl, type);
if (error != 0)
goto out;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0)
goto out;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_setacl(td->td_ucred, vp, type, inkernelacl);
if (error != 0)
goto out_unlock;
#endif
error = VOP_SETACL(vp, acl_type_unold(type), inkernelacl,
td->td_ucred, td);
#ifdef MAC
out_unlock:
#endif
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
out:
acl_free(inkernelacl);
return (error);
}
/*
* Given a vnode, get its ACL.
*/
static int
vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
struct acl *aclp)
{
struct acl *inkernelacl;
int error;
inkernelacl = acl_alloc(M_WAITOK);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_getacl(td->td_ucred, vp, type);
if (error != 0)
goto out;
#endif
error = VOP_GETACL(vp, acl_type_unold(type), inkernelacl,
td->td_ucred, td);
#ifdef MAC
out:
#endif
VOP_UNLOCK(vp, 0);
if (error == 0)
error = acl_copyout(inkernelacl, aclp, type);
acl_free(inkernelacl);
return (error);
}
/*
* Given a vnode, delete its ACL.
*/
static int
vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
{
struct mount *mp;
int error;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_deleteacl(td->td_ucred, vp, type);
if (error != 0)
goto out;
#endif
error = VOP_SETACL(vp, acl_type_unold(type), 0, td->td_ucred, td);
#ifdef MAC
out:
#endif
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Given a vnode, check whether an ACL is appropriate for it.
*/
static int
vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
struct acl *aclp)
{
struct acl *inkernelacl;
int error;
inkernelacl = acl_alloc(M_WAITOK);
error = acl_copyin(aclp, inkernelacl, type);
if (error != 0)
goto out;
error = VOP_ACLCHECK(vp, acl_type_unold(type), inkernelacl,
td->td_ucred, td);
out:
acl_free(inkernelacl);
return (error);
}
/*
* syscalls -- convert the path/fd to a vnode, and call vacl_whatever. Don't
* need to lock, as the vacl_ code will get/release any locks required.
*/
/*
* Given a file path, get an ACL for it
*/
int
-__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+sys___acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, get an ACL for it; don't follow links.
*/
int
-__acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
+sys___acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, set an ACL for it.
*/
int
-__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+sys___acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, set an ACL for it; don't follow links.
*/
int
-__acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
+sys___acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file descriptor, get an ACL for it.
*/
int
-__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+sys___acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_GET, &fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
/*
* Given a file descriptor, set an ACL for it.
*/
int
-__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+sys___acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_SET, &fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
/*
* Given a file path, delete an ACL from it.
*/
int
-__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+sys___acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_delete(td, nd.ni_vp, uap->type);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, delete an ACL from it; don't follow links.
*/
int
-__acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
+sys___acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_delete(td, nd.ni_vp, uap->type);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file descriptor, delete an ACL from it.
*/
int
-__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+sys___acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_DELETE,
&fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_delete(td, fp->f_vnode, uap->type);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
/*
* Given a file path, check an ACL for it.
*/
int
-__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+sys___acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file path, check an ACL for it; don't follow links.
*/
int
-__acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
+sys___acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
{
struct nameidata nd;
int vfslocked, error;
NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
NDFREE(&nd, 0);
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Given a file descriptor, check an ACL for it.
*/
int
-__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+sys___acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
{
struct file *fp;
int vfslocked, error;
error = getvnode(td->td_proc->p_fd, uap->filedes, CAP_ACL_CHECK,
&fp);
if (error == 0) {
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
}
return (error);
}
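/*
 * Allocate and free the in-kernel 'struct acl'.  'flags' is passed
 * straight to malloc(9), so callers choose M_WAITOK or M_NOWAIT.
 */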
struct acl *
acl_alloc(int flags)
{
struct acl *aclp;
aclp = malloc(sizeof(*aclp), M_ACL, flags);
aclp->acl_maxcnt = ACL_MAX_ENTRIES;
return (aclp);
}
void
acl_free(struct acl *aclp)
{
free(aclp, M_ACL);
}
Index: head/sys/kern/vfs_aio.c
===================================================================
--- head/sys/kern/vfs_aio.c (revision 225616)
+++ head/sys/kern/vfs_aio.c (revision 225617)
@@ -1,3002 +1,3002 @@
/*-
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. John S. Dyson's name may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* DISCLAIMER: This code isn't warranted to do anything useful. Anything
* bad that happens because of using this software isn't the responsibility
* of the author. This software is distributed AS-IS.
*/
/*
* This file contains support for the POSIX 1003.1B AIO/LIO facility.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/unistd.h>
#include <sys/posix4.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysctl.h>
#include <sys/sx.h>
#include <sys/taskqueue.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
#include <machine/atomic.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/uma.h>
#include <sys/aio.h>
#include "opt_vfs_aio.h"
/*
* Counter for allocating reference ids to new jobs. Wrapped to 1 on
* overflow. (XXX will be removed soon.)
*/
static u_long jobrefid;
/*
* Counter for aio_fsync.
*/
static uint64_t jobseqno;
#define JOBST_NULL 0
#define JOBST_JOBQSOCK 1
#define JOBST_JOBQGLOBAL 2
#define JOBST_JOBRUNNING 3
#define JOBST_JOBFINISHED 4
#define JOBST_JOBQBUF 5
#define JOBST_JOBQSYNC 6
#ifndef MAX_AIO_PER_PROC
#define MAX_AIO_PER_PROC 32
#endif
#ifndef MAX_AIO_QUEUE_PER_PROC
#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */
#endif
#ifndef MAX_AIO_PROCS
#define MAX_AIO_PROCS 32
#endif
#ifndef MAX_AIO_QUEUE
#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */
#endif
#ifndef TARGET_AIO_PROCS
#define TARGET_AIO_PROCS 4
#endif
#ifndef MAX_BUF_AIO
#define MAX_BUF_AIO 16
#endif
#ifndef AIOD_TIMEOUT_DEFAULT
#define AIOD_TIMEOUT_DEFAULT (10 * hz)
#endif
#ifndef AIOD_LIFETIME_DEFAULT
#define AIOD_LIFETIME_DEFAULT (30 * hz)
#endif
FEATURE(aio, "Asynchronous I/O");
static MALLOC_DEFINE(M_LIO, "lio", "listio aio control block list");
static SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "Async IO management");
static int max_aio_procs = MAX_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs,
CTLFLAG_RW, &max_aio_procs, 0,
"Maximum number of kernel threads to use for handling async IO ");
static int num_aio_procs = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs,
CTLFLAG_RD, &num_aio_procs, 0,
"Number of presently active kernel threads for async IO");
/*
* The code will adjust the actual number of AIO processes towards this
* number when it gets a chance.
*/
static int target_aio_procs = TARGET_AIO_PROCS;
SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, CTLFLAG_RW, &target_aio_procs,
0, "Preferred number of ready kernel threads for async IO");
static int max_queue_count = MAX_AIO_QUEUE;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, CTLFLAG_RW, &max_queue_count, 0,
"Maximum number of aio requests to queue, globally");
static int num_queue_count = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, CTLFLAG_RD, &num_queue_count, 0,
"Number of queued aio requests");
static int num_buf_aio = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, CTLFLAG_RD, &num_buf_aio, 0,
"Number of aio requests presently handled by the buf subsystem");
/* Number of async I/O threads in the process of being started */
/* XXX This should be local to aio_aqueue() */
static int num_aio_resv_start = 0;
static int aiod_timeout;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, CTLFLAG_RW, &aiod_timeout, 0,
"Timeout value for synchronous aio operations");
static int aiod_lifetime;
SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, CTLFLAG_RW, &aiod_lifetime, 0,
"Maximum lifetime for idle aiod");
static int unloadable = 0;
SYSCTL_INT(_vfs_aio, OID_AUTO, unloadable, CTLFLAG_RW, &unloadable, 0,
"Allow unload of aio (not recommended)");
static int max_aio_per_proc = MAX_AIO_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, CTLFLAG_RW, &max_aio_per_proc,
0, "Maximum active aio requests per process (stored in the process)");
static int max_aio_queue_per_proc = MAX_AIO_QUEUE_PER_PROC;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, CTLFLAG_RW,
&max_aio_queue_per_proc, 0,
"Maximum queued aio requests per process (stored in the process)");
static int max_buf_aio = MAX_BUF_AIO;
SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
"Maximum buf aio requests per process (stored in the process)");
typedef struct oaiocb {
int aio_fildes; /* File descriptor */
off_t aio_offset; /* File offset for I/O */
volatile void *aio_buf; /* I/O buffer in process space */
size_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private _aiocb_private;
} oaiocb_t;
/*
* Below is a key to the lock annotations used for each member of struct
* aiocblist, struct aioliojob, struct kaioinfo, and the backends.
*
* * - need not be protected
* a - locked by the kaioinfo lock
* b - locked by the backend lock; the backend lock may be NULL in some
*     cases (for example BIO), in which case the proc lock is
*     reused.
* c - locked by aio_job_mtx, the lock for the generic file I/O backend.
*/
/*
* Currently there are only two backends: BIO and generic file I/O.
* Socket I/O is served by the generic file I/O backend.  This is not a good
* idea: disk file I/O and any other type opened without O_NONBLOCK can block
* the daemon threads, and if no thread is left to serve socket I/O, the
* socket I/O will be delayed too long or starved.  We should create threads
* dedicated to sockets that do non-blocking I/O, and likewise for pipes and
* fifos; for these we really need a non-blocking interface, since fiddling
* with O_NONBLOCK in the file structure is not safe because of the race
* between userland and the aio daemons.
*/
struct aiocblist {
TAILQ_ENTRY(aiocblist) list; /* (b) internal list for the backend */
TAILQ_ENTRY(aiocblist) plist; /* (a) list of jobs for each backend */
TAILQ_ENTRY(aiocblist) allist; /* (a) list of all jobs in proc */
int jobflags; /* (a) job flags */
int jobstate; /* (b) job state */
int inputcharge; /* (*) input blocks */
int outputcharge; /* (*) output blocks */
struct buf *bp; /* (*) private to BIO backend,
* buffer pointer
*/
struct proc *userproc; /* (*) user process */
struct ucred *cred; /* (*) active credential when created */
struct file *fd_file; /* (*) pointer to file structure */
struct aioliojob *lio; /* (*) optional lio job */
struct aiocb *uuaiocb; /* (*) pointer in userspace of aiocb */
struct knlist klist; /* (a) list of knotes */
struct aiocb uaiocb; /* (*) kernel I/O control block */
ksiginfo_t ksi; /* (a) realtime signal info */
struct task biotask; /* (*) private to BIO backend */
uint64_t seqno; /* (*) job number */
int pending; /* (a) number of pending I/O, aio_fsync only */
};
/* jobflags */
#define AIOCBLIST_DONE 0x01
#define AIOCBLIST_BUFDONE 0x02
#define AIOCBLIST_RUNDOWN 0x04
#define AIOCBLIST_CHECKSYNC 0x08
/*
* AIO process info
*/
#define AIOP_FREE 0x1 /* proc on free queue */
struct aiothreadlist {
int aiothreadflags; /* (c) AIO proc flags */
TAILQ_ENTRY(aiothreadlist) list; /* (c) list of processes */
struct thread *aiothread; /* (*) the AIO thread */
};
/*
* data-structure for lio signal management
*/
struct aioliojob {
int lioj_flags; /* (a) listio flags */
int lioj_count; /* (a) count of jobs in this listio */
int lioj_finished_count; /* (a) count of finished jobs in this listio */
struct sigevent lioj_signal; /* (a) signal on all I/O done */
TAILQ_ENTRY(aioliojob) lioj_list; /* (a) lio list */
struct knlist klist; /* (a) list of knotes */
ksiginfo_t lioj_ksi; /* (a) Realtime signal info */
};
#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */
#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */
#define LIOJ_KEVENT_POSTED 0x4 /* kevent triggered */
/*
* per process aio data structure
*/
struct kaioinfo {
struct mtx kaio_mtx; /* the lock to protect this struct */
int kaio_flags; /* (a) per process kaio flags */
int kaio_maxactive_count; /* (*) maximum number of AIOs */
int kaio_active_count; /* (c) number of currently used AIOs */
int kaio_qallowed_count; /* (*) maximum size of AIO queue */
int kaio_count; /* (a) size of AIO queue */
int kaio_ballowed_count; /* (*) maximum number of buffers */
int kaio_buffer_count; /* (a) number of physio buffers */
TAILQ_HEAD(,aiocblist) kaio_all; /* (a) all AIOs in the process */
TAILQ_HEAD(,aiocblist) kaio_done; /* (a) done queue for process */
TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
TAILQ_HEAD(,aiocblist) kaio_jobqueue; /* (a) job queue for process */
TAILQ_HEAD(,aiocblist) kaio_bufqueue; /* (a) buffer job queue for process */
TAILQ_HEAD(,aiocblist) kaio_sockqueue; /* (a) queue for aios waiting on sockets,
* NOT USED YET.
*/
TAILQ_HEAD(,aiocblist) kaio_syncqueue; /* (a) queue for aio_fsync */
struct task kaio_task; /* (*) task to kick aio threads */
};
#define AIO_LOCK(ki) mtx_lock(&(ki)->kaio_mtx)
#define AIO_UNLOCK(ki) mtx_unlock(&(ki)->kaio_mtx)
#define AIO_LOCK_ASSERT(ki, f) mtx_assert(&(ki)->kaio_mtx, (f))
#define AIO_MTX(ki) (&(ki)->kaio_mtx)
#define KAIO_RUNDOWN 0x1 /* process is being run down */
#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant event */
/*
* Operations used to interact with userland aio control blocks.
* Different ABIs provide their own operations.
*/
struct aiocb_ops {
int (*copyin)(struct aiocb *ujob, struct aiocb *kjob);
long (*fetch_status)(struct aiocb *ujob);
long (*fetch_error)(struct aiocb *ujob);
int (*store_status)(struct aiocb *ujob, long status);
int (*store_error)(struct aiocb *ujob, long error);
int (*store_kernelinfo)(struct aiocb *ujob, long jobref);
int (*store_aiocb)(struct aiocb **ujobp, struct aiocb *ujob);
};
static TAILQ_HEAD(,aiothreadlist) aio_freeproc; /* (c) Idle daemons */
static struct sema aio_newproc_sem;
static struct mtx aio_job_mtx;
static struct mtx aio_sock_mtx;
static TAILQ_HEAD(,aiocblist) aio_jobs; /* (c) Async job list */
static struct unrhdr *aiod_unr;
void aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct aiocblist *aiocbe);
static void aio_process(struct aiocblist *aiocbe);
static int aio_newproc(int *);
int aio_aqueue(struct thread *td, struct aiocb *job,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
static void aio_physwakeup(struct buf *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
static void biohelper(void *, int);
static void aio_daemon(void *param);
static void aio_swake_cb(struct socket *, struct sockbuf *);
static int aio_unload(void);
static void aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
#define DONE_BUF 1
#define DONE_QUEUE 2
static int aio_kick(struct proc *userp);
static void aio_kick_nowait(struct proc *userp);
static void aio_kick_helper(void *context, int pending);
static int filt_aioattach(struct knote *kn);
static void filt_aiodetach(struct knote *kn);
static int filt_aio(struct knote *kn, long hint);
static int filt_lioattach(struct knote *kn);
static void filt_liodetach(struct knote *kn);
static int filt_lio(struct knote *kn, long hint);
/*
* Zones for:
* kaio Per process async io info
* aiop async io thread data
* aiocb async io jobs
* aiol list io job pointer - internal to aio_suspend XXX
* aiolio list io jobs
*/
static uma_zone_t kaio_zone, aiop_zone, aiocb_zone, aiol_zone, aiolio_zone;
/* kqueue filters for aio */
static struct filterops aio_filtops = {
.f_isfd = 0,
.f_attach = filt_aioattach,
.f_detach = filt_aiodetach,
.f_event = filt_aio,
};
static struct filterops lio_filtops = {
.f_isfd = 0,
.f_attach = filt_lioattach,
.f_detach = filt_liodetach,
.f_event = filt_lio
};
static eventhandler_tag exit_tag, exec_tag;
TASKQUEUE_DEFINE_THREAD(aiod_bio);
/*
* Main operations function for use as a kernel module.
*/
static int
aio_modload(struct module *module, int cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MOD_LOAD:
aio_onceonly();
break;
case MOD_UNLOAD:
error = aio_unload();
break;
case MOD_SHUTDOWN:
break;
default:
error = EINVAL;
break;
}
return (error);
}
static moduledata_t aio_mod = {
"aio",
&aio_modload,
NULL
};
static struct syscall_helper_data aio_syscalls[] = {
SYSCALL_INIT_HELPER(aio_cancel),
SYSCALL_INIT_HELPER(aio_error),
SYSCALL_INIT_HELPER(aio_fsync),
SYSCALL_INIT_HELPER(aio_read),
SYSCALL_INIT_HELPER(aio_return),
SYSCALL_INIT_HELPER(aio_suspend),
SYSCALL_INIT_HELPER(aio_waitcomplete),
SYSCALL_INIT_HELPER(aio_write),
SYSCALL_INIT_HELPER(lio_listio),
SYSCALL_INIT_HELPER(oaio_read),
SYSCALL_INIT_HELPER(oaio_write),
SYSCALL_INIT_HELPER(olio_listio),
SYSCALL_INIT_LAST
};
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <sys/socket.h>
#include <compat/freebsd32/freebsd32.h>
#include <compat/freebsd32/freebsd32_proto.h>
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_syscall.h>
#include <compat/freebsd32/freebsd32_util.h>
static struct syscall_helper_data aio32_syscalls[] = {
SYSCALL32_INIT_HELPER(freebsd32_aio_return),
SYSCALL32_INIT_HELPER(freebsd32_aio_suspend),
SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
SYSCALL32_INIT_HELPER(freebsd32_aio_error),
SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
SYSCALL32_INIT_HELPER(freebsd32_aio_read),
SYSCALL32_INIT_HELPER(freebsd32_aio_write),
SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
SYSCALL32_INIT_HELPER(freebsd32_lio_listio),
SYSCALL32_INIT_HELPER(freebsd32_oaio_read),
SYSCALL32_INIT_HELPER(freebsd32_oaio_write),
SYSCALL32_INIT_HELPER(freebsd32_olio_listio),
SYSCALL_INIT_LAST
};
#endif
DECLARE_MODULE(aio, aio_mod,
SI_SUB_VFS, SI_ORDER_ANY);
MODULE_VERSION(aio, 1);
/*
* Startup initialization
*/
static int
aio_onceonly(void)
{
int error;
/* XXX: should probably just use so->callback */
aio_swake = &aio_swake_cb;
exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
EVENTHANDLER_PRI_ANY);
exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
EVENTHANDLER_PRI_ANY);
kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
TAILQ_INIT(&aio_freeproc);
sema_init(&aio_newproc_sem, 0, "aio_new_proc");
mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
TAILQ_INIT(&aio_jobs);
aiod_unr = new_unrhdr(1, INT_MAX, NULL);
kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiocb_zone = uma_zcreate("AIOCB", sizeof(struct aiocblist), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
aiod_timeout = AIOD_TIMEOUT_DEFAULT;
aiod_lifetime = AIOD_LIFETIME_DEFAULT;
jobrefid = 1;
async_io_version = _POSIX_VERSION;
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, AIO_LISTIO_MAX);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, MAX_AIO_QUEUE);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, 0);
error = syscall_helper_register(aio_syscalls);
if (error)
return (error);
#ifdef COMPAT_FREEBSD32
error = syscall32_helper_register(aio32_syscalls);
if (error)
return (error);
#endif
return (0);
}
/*
* Callback for unload of AIO when used as a module.
*/
static int
aio_unload(void)
{
int error;
/*
* XXX: no unloads by default, it's too dangerous.
* perhaps we could do it if we locked out callers and then
* did an aio_proc_rundown() on each process.
*
* jhb: aio_proc_rundown() needs to run on curproc though,
* so I don't think that would fly.
*/
if (!unloadable)
return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
syscall32_helper_unregister(aio32_syscalls);
#endif
syscall_helper_unregister(aio_syscalls);
error = kqueue_del_filteropts(EVFILT_AIO);
if (error)
return error;
error = kqueue_del_filteropts(EVFILT_LIO);
if (error)
return error;
async_io_version = 0;
aio_swake = NULL;
taskqueue_free(taskqueue_aiod_bio);
delete_unrhdr(aiod_unr);
uma_zdestroy(kaio_zone);
uma_zdestroy(aiop_zone);
uma_zdestroy(aiocb_zone);
uma_zdestroy(aiol_zone);
uma_zdestroy(aiolio_zone);
EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
mtx_destroy(&aio_job_mtx);
mtx_destroy(&aio_sock_mtx);
sema_destroy(&aio_newproc_sem);
p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
return (0);
}
/*
* Init the per-process aioinfo structure. The aioinfo limits are set
* per-process for user limit (resource) management.
*/
void
aio_init_aioinfo(struct proc *p)
{
struct kaioinfo *ki;
ki = uma_zalloc(kaio_zone, M_WAITOK);
mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
ki->kaio_flags = 0;
ki->kaio_maxactive_count = max_aio_per_proc;
ki->kaio_active_count = 0;
ki->kaio_qallowed_count = max_aio_queue_per_proc;
ki->kaio_count = 0;
ki->kaio_ballowed_count = max_buf_aio;
ki->kaio_buffer_count = 0;
TAILQ_INIT(&ki->kaio_all);
TAILQ_INIT(&ki->kaio_done);
TAILQ_INIT(&ki->kaio_jobqueue);
TAILQ_INIT(&ki->kaio_bufqueue);
TAILQ_INIT(&ki->kaio_liojoblist);
TAILQ_INIT(&ki->kaio_sockqueue);
TAILQ_INIT(&ki->kaio_syncqueue);
TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
PROC_LOCK(p);
if (p->p_aioinfo == NULL) {
p->p_aioinfo = ki;
PROC_UNLOCK(p);
} else {
PROC_UNLOCK(p);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
}
while (num_aio_procs < MIN(target_aio_procs, max_aio_procs))
aio_newproc(NULL);
}
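/*
* Deliver the completion signal described by sigev to the process; used
* for SIGEV_SIGNAL and SIGEV_THREAD_ID notification.
*/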
static int
aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
{
struct thread *td;
int error;
error = sigev_findtd(p, sigev, &td);
if (error)
return (error);
if (!KSI_ONQ(ksi)) {
ksiginfo_set_sigev(ksi, sigev);
ksi->ksi_code = SI_ASYNCIO;
ksi->ksi_flags |= KSI_EXT | KSI_INS;
tdsendsignal(p, td, ksi->ksi_signo, ksi);
}
PROC_UNLOCK(p);
return (error);
}
/*
* Free a job entry. Wait for completion if it is currently active, but don't
* delay forever. If we delay, we return a flag that says that we have to
* restart the queue scan.
*/
static int
aio_free_entry(struct aiocblist *aiocbe)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct proc *p;
p = aiocbe->userproc;
MPASS(curproc == p);
ki = p->p_aioinfo;
MPASS(ki != NULL);
AIO_LOCK_ASSERT(ki, MA_OWNED);
MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
atomic_subtract_int(&num_queue_count, 1);
ki->kaio_count--;
MPASS(ki->kaio_count >= 0);
TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
lj = aiocbe->lio;
if (lj) {
lj->lioj_count--;
lj->lioj_finished_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
/* lio is going away, we need to destroy any knotes */
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
}
}
/* aiocbe is going away, we need to destroy any knotes */
knlist_delete(&aiocbe->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&aiocbe->ksi);
PROC_UNLOCK(p);
MPASS(aiocbe->bp == NULL);
aiocbe->jobstate = JOBST_NULL;
AIO_UNLOCK(ki);
/*
* The thread argument here is used to find the owning process
* and is also passed to fo_close() which may pass it to various
* places such as devsw close() routines. Because of that, we
* need a thread pointer from the process owning the job that is
* persistent and won't disappear out from under us or move to
* another process.
*
* Currently, all the callers of this function call it to remove
* an aiocblist from the current process' job list either via a
* syscall or due to the current process calling exit() or
* execve(). Thus, we know that p == curproc. We also know that
* curthread can't exit since we are curthread.
*
* Therefore, we use curthread as the thread to pass to
* knlist_delete(). This does mean that it is possible for the
* thread pointer at close time to differ from the thread pointer
* at open time, but this is already true of file descriptors in
* a multithreaded process.
*/
fdrop(aiocbe->fd_file, curthread);
crfree(aiocbe->cred);
uma_zfree(aiocb_zone, aiocbe);
AIO_LOCK(ki);
return (0);
}
static void
aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
{
aio_proc_rundown(arg, p);
}
/*
* Rundown the jobs for a given process.
*/
static void
aio_proc_rundown(void *arg, struct proc *p)
{
struct kaioinfo *ki;
struct aioliojob *lj;
struct aiocblist *cbe, *cbn;
struct file *fp;
struct socket *so;
int remove;
KASSERT(curthread->td_proc == p,
("%s: called on non-curproc", __func__));
ki = p->p_aioinfo;
if (ki == NULL)
return;
AIO_LOCK(ki);
ki->kaio_flags |= KAIO_RUNDOWN;
restart:
/*
* Try to cancel all pending requests. This code simulates
* aio_cancel on all pending I/O requests.
*/
TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
remove = 0;
mtx_lock(&aio_job_mtx);
if (cbe->jobstate == JOBST_JOBQGLOBAL) {
TAILQ_REMOVE(&aio_jobs, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSOCK) {
fp = cbe->fd_file;
MPASS(fp->f_type == DTYPE_SOCKET);
so = fp->f_data;
TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSYNC) {
TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
remove = 1;
}
mtx_unlock(&aio_job_mtx);
if (remove) {
cbe->jobstate = JOBST_JOBFINISHED;
cbe->uaiocb._aiocb_private.status = -1;
cbe->uaiocb._aiocb_private.error = ECANCELED;
TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
aio_bio_done_notify(p, cbe, DONE_QUEUE);
}
}
/* Wait for all running I/O to be finished */
if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
TAILQ_FIRST(&ki->kaio_jobqueue)) {
ki->kaio_flags |= KAIO_WAKEUP;
msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
goto restart;
}
/* Free all completed I/O requests. */
while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
aio_free_entry(cbe);
while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
uma_zfree(aiolio_zone, lj);
} else {
panic("LIO job not cleaned up: C:%d, FC:%d\n",
lj->lioj_count, lj->lioj_finished_count);
}
}
AIO_UNLOCK(ki);
taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
mtx_destroy(&ki->kaio_mtx);
uma_zfree(kaio_zone, ki);
p->p_aioinfo = NULL;
}
/*
* Select a job to run (called by an AIO daemon).
*/
static struct aiocblist *
aio_selectjob(struct aiothreadlist *aiop)
{
struct aiocblist *aiocbe;
struct kaioinfo *ki;
struct proc *userp;
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
if (ki->kaio_active_count < ki->kaio_maxactive_count) {
TAILQ_REMOVE(&aio_jobs, aiocbe, list);
/* Account for currently active jobs. */
ki->kaio_active_count++;
aiocbe->jobstate = JOBST_JOBRUNNING;
break;
}
}
return (aiocbe);
}
/*
* Move all data to a permanent storage device; this code
* simulates the fsync syscall.
*/
static int
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
struct mount *mp;
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_object != NULL) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_UNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* The AIO processing activity.  This is the code that handles the I/O
* request for the non-physio version of the operations.  The normal file
* operations are used, so this code should work for every type of file,
* including pipes, sockets, fifos, and regular files.
*
* XXX I don't think it works well for sockets, pipes, and fifos.
*/
static void
aio_process(struct aiocblist *aiocbe)
{
struct ucred *td_savedcred;
struct thread *td;
struct aiocb *cb;
struct file *fp;
struct socket *so;
struct uio auio;
struct iovec aiov;
int cnt;
int error;
int oublock_st, oublock_end;
int inblock_st, inblock_end;
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = aiocbe->cred;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
if (cb->aio_lio_opcode == LIO_SYNC) {
error = 0;
cnt = 0;
if (fp->f_vnode != NULL)
error = aio_fsync_vnode(td, fp->f_vnode);
cb->_aiocb_private.error = error;
cb->_aiocb_private.status = 0;
td->td_ucred = td_savedcred;
return;
}
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = cb->aio_offset;
auio.uio_resid = cb->aio_nbytes;
cnt = cb->aio_nbytes;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
inblock_st = td->td_ru.ru_inblock;
oublock_st = td->td_ru.ru_oublock;
/*
* aio_aqueue() acquires a reference to the file that is
* released in aio_free_entry().
*/
if (cb->aio_lio_opcode == LIO_READ) {
auio.uio_rw = UIO_READ;
if (auio.uio_resid == 0)
error = 0;
else
error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
} else {
if (fp->f_type == DTYPE_VNODE)
bwillwrite();
auio.uio_rw = UIO_WRITE;
error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
}
inblock_end = td->td_ru.ru_inblock;
oublock_end = td->td_ru.ru_oublock;
aiocbe->inputcharge = inblock_end - inblock_st;
aiocbe->outputcharge = oublock_end - oublock_st;
if ((error) && (auio.uio_resid != cnt)) {
if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
error = 0;
if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
int sigpipe = 1;
if (fp->f_type == DTYPE_SOCKET) {
so = fp->f_data;
if (so->so_options & SO_NOSIGPIPE)
sigpipe = 0;
}
if (sigpipe) {
PROC_LOCK(aiocbe->userproc);
- psignal(aiocbe->userproc, SIGPIPE);
+ kern_psignal(aiocbe->userproc, SIGPIPE);
PROC_UNLOCK(aiocbe->userproc);
}
}
}
cnt -= auio.uio_resid;
cb->_aiocb_private.error = error;
cb->_aiocb_private.status = cnt;
td->td_ucred = td_savedcred;
}
static void
aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
{
struct aioliojob *lj;
struct kaioinfo *ki;
struct aiocblist *scb, *scbn;
int lj_done;
ki = userp->p_aioinfo;
AIO_LOCK_ASSERT(ki, MA_OWNED);
lj = aiocbe->lio;
lj_done = 0;
if (lj) {
lj->lioj_finished_count++;
if (lj->lioj_count == lj->lioj_finished_count)
lj_done = 1;
}
if (type == DONE_QUEUE) {
aiocbe->jobflags |= AIOCBLIST_DONE;
} else {
aiocbe->jobflags |= AIOCBLIST_BUFDONE;
}
TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
aiocbe->jobstate = JOBST_JOBFINISHED;
if (ki->kaio_flags & KAIO_RUNDOWN)
goto notification_done;
if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
KNOTE_LOCKED(&aiocbe->klist, 1);
if (lj_done) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
notification_done:
if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
if (aiocbe->fd_file == scb->fd_file &&
aiocbe->seqno < scb->seqno) {
if (--scb->pending == 0) {
mtx_lock(&aio_job_mtx);
scb->jobstate = JOBST_JOBQGLOBAL;
TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
aio_kick_nowait(userp);
mtx_unlock(&aio_job_mtx);
}
}
}
}
if (ki->kaio_flags & KAIO_WAKEUP) {
ki->kaio_flags &= ~KAIO_WAKEUP;
wakeup(&userp->p_aioinfo);
}
}
/*
* The AIO daemon.  Most of the actual work is done in aio_process(),
* but the setup (and address space management) is done in this routine.
*/
static void
aio_daemon(void *_id)
{
struct aiocblist *aiocbe;
struct aiothreadlist *aiop;
struct kaioinfo *ki;
struct proc *curcp, *mycp, *userp;
struct vmspace *myvm, *tmpvm;
struct thread *td = curthread;
int id = (intptr_t)_id;
/*
* Local copies of curproc (mycp) and vmspace (myvm).
*/
mycp = td->td_proc;
myvm = mycp->p_vmspace;
KASSERT(mycp->p_textvp == NULL, ("kthread has a textvp"));
/*
* Allocate and ready the aio control info. There is one aiop structure
* per daemon.
*/
aiop = uma_zalloc(aiop_zone, M_WAITOK);
aiop->aiothread = td;
aiop->aiothreadflags = 0;
/* The daemon resides in its own pgrp. */
- setsid(td, NULL);
+ sys_setsid(td, NULL);
/*
* Wake up the parent process.  (The parent sleeps to keep from blasting away
* and creating too many daemons.)
*/
sema_post(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
for (;;) {
/*
* curcp is the current daemon process context.
* userp is the current user process context.
*/
curcp = mycp;
/*
* Take daemon off of free queue
*/
if (aiop->aiothreadflags & AIOP_FREE) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
}
/*
* Check for jobs.
*/
while ((aiocbe = aio_selectjob(aiop)) != NULL) {
mtx_unlock(&aio_job_mtx);
userp = aiocbe->userproc;
/*
* Connect to process address space for user program.
*/
if (userp != curcp) {
/*
* Save the current address space that we are
* connected to.
*/
tmpvm = mycp->p_vmspace;
/*
* Point to the new user address space and take a
* reference on it.
*/
mycp->p_vmspace = userp->p_vmspace;
atomic_add_int(&mycp->p_vmspace->vm_refcnt, 1);
/* Activate the new mapping. */
pmap_activate(FIRST_THREAD_IN_PROC(mycp));
/*
* If the old address space wasn't the daemon's
* own address space, then we need to remove the
* daemon's reference from the other process
* that it was acting on behalf of.
*/
if (tmpvm != myvm) {
vmspace_free(tmpvm);
}
curcp = userp;
}
ki = userp->p_aioinfo;
/* Do the I/O function. */
aio_process(aiocbe);
mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
ki->kaio_active_count--;
mtx_unlock(&aio_job_mtx);
AIO_LOCK(ki);
TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
AIO_UNLOCK(ki);
mtx_lock(&aio_job_mtx);
}
/*
* Disconnect from user address space.
*/
if (curcp != mycp) {
mtx_unlock(&aio_job_mtx);
/* Get the user address space to disconnect from. */
tmpvm = mycp->p_vmspace;
/* Get original address space for daemon. */
mycp->p_vmspace = myvm;
/* Activate the daemon's address space. */
pmap_activate(FIRST_THREAD_IN_PROC(mycp));
#ifdef DIAGNOSTIC
if (tmpvm == myvm) {
printf("AIOD: vmspace problem -- %d\n",
mycp->p_pid);
}
#endif
/* Remove our vmspace reference. */
vmspace_free(tmpvm);
curcp = mycp;
mtx_lock(&aio_job_mtx);
/*
* We have to restart to avoid a race; we only sleep if
* no job can be selected, which should only happen when
* curcp == mycp.
*/
continue;
}
mtx_assert(&aio_job_mtx, MA_OWNED);
TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
aiop->aiothreadflags |= AIOP_FREE;
/*
* If daemon is inactive for a long time, allow it to exit,
* thereby freeing resources.
*/
if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
aiod_lifetime)) {
if (TAILQ_EMPTY(&aio_jobs)) {
if ((aiop->aiothreadflags & AIOP_FREE) &&
(num_aio_procs > target_aio_procs)) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
num_aio_procs--;
mtx_unlock(&aio_job_mtx);
uma_zfree(aiop_zone, aiop);
free_unr(aiod_unr, id);
#ifdef DIAGNOSTIC
if (mycp->p_vmspace->vm_refcnt <= 1) {
printf("AIOD: bad vm refcnt for"
" exiting daemon: %d\n",
mycp->p_vmspace->vm_refcnt);
}
#endif
kproc_exit(0);
}
}
}
}
mtx_unlock(&aio_job_mtx);
panic("shouldn't be here\n");
}
/*
* Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
* AIO daemon modifies its environment itself.
*/
static int
aio_newproc(int *start)
{
int error;
struct proc *p;
int id;
id = alloc_unr(aiod_unr);
error = kproc_create(aio_daemon, (void *)(intptr_t)id, &p,
RFNOWAIT, 0, "aiod%d", id);
if (error == 0) {
/*
* Wait until daemon is started.
*/
sema_wait(&aio_newproc_sem);
mtx_lock(&aio_job_mtx);
num_aio_procs++;
if (start != NULL)
(*start)--;
mtx_unlock(&aio_job_mtx);
} else {
free_unr(aiod_unr, id);
}
return (error);
}
/*
* Try the high-performance, low-overhead physio method for eligible
* VCHR devices. This method doesn't use an aio helper thread, and
* thus has very low overhead.
*
* Assumes that the caller, aio_aqueue(), has incremented the file
* structure's reference count, preventing its deallocation for the
* duration of this call.
*/
static int
aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
{
struct aiocb *cb;
struct file *fp;
struct buf *bp;
struct vnode *vp;
struct kaioinfo *ki;
struct aioliojob *lj;
int error;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
if (fp->f_type != DTYPE_VNODE)
return (-1);
vp = fp->f_vnode;
/*
* If it's not a disk, we don't want to return a positive error.
* Doing so would keep the aio code from falling through to try
* the threaded path when the target is a regular file.
*/
if (!vn_isdisk(vp, &error)) {
if (error == ENOTBLK)
return (-1);
else
return (error);
}
if (vp->v_bufobj.bo_bsize == 0)
return (-1);
if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
return (-1);
if (cb->aio_nbytes >
MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
return (-1);
ki = p->p_aioinfo;
if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
return (-1);
/* Create and build a buffer header for a transfer. */
bp = (struct buf *)getpbuf(NULL);
BUF_KERNPROC(bp);
AIO_LOCK(ki);
ki->kaio_count++;
ki->kaio_buffer_count++;
lj = aiocbe->lio;
if (lj)
lj->lioj_count++;
AIO_UNLOCK(ki);
/*
* Get a copy of the kva from the physical buffer.
*/
error = 0;
bp->b_bcount = cb->aio_nbytes;
bp->b_bufsize = cb->aio_nbytes;
bp->b_iodone = aio_physwakeup;
bp->b_saveaddr = bp->b_data;
bp->b_data = (void *)(uintptr_t)cb->aio_buf;
bp->b_offset = cb->aio_offset;
bp->b_iooffset = cb->aio_offset;
bp->b_blkno = btodb(cb->aio_offset);
bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
/*
* Bring buffer into kernel space.
*/
if (vmapbuf(bp) < 0) {
error = EFAULT;
goto doerror;
}
AIO_LOCK(ki);
aiocbe->bp = bp;
bp->b_caller1 = (void *)aiocbe;
TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
aiocbe->jobstate = JOBST_JOBQBUF;
cb->_aiocb_private.status = cb->aio_nbytes;
AIO_UNLOCK(ki);
atomic_add_int(&num_queue_count, 1);
atomic_add_int(&num_buf_aio, 1);
bp->b_error = 0;
TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
/* Perform transfer. */
dev_strategy(vp->v_rdev, bp);
return (0);
doerror:
AIO_LOCK(ki);
ki->kaio_count--;
ki->kaio_buffer_count--;
if (lj)
lj->lioj_count--;
aiocbe->bp = NULL;
AIO_UNLOCK(ki);
relpbuf(bp, NULL);
return (error);
}
/*
* Wake up aio requests that may be serviceable now.
*/
static void
aio_swake_cb(struct socket *so, struct sockbuf *sb)
{
struct aiocblist *cb, *cbn;
int opcode;
SOCKBUF_LOCK_ASSERT(sb);
if (sb == &so->so_snd)
opcode = LIO_WRITE;
else
opcode = LIO_READ;
sb->sb_flags &= ~SB_AIO;
mtx_lock(&aio_job_mtx);
TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
if (opcode == cb->uaiocb.aio_lio_opcode) {
if (cb->jobstate != JOBST_JOBQSOCK)
panic("invalid queue value");
/* XXX
* We don't have an actual socket backend yet,
* so we simply move the requests to the generic
* file I/O backend.
*/
TAILQ_REMOVE(&so->so_aiojobq, cb, list);
TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
aio_kick_nowait(cb->userproc);
}
}
mtx_unlock(&aio_job_mtx);
}
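/*
* Convert an old-style osigevent into the current struct sigevent layout.
*/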
static int
convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
nsig->sigev_notify = osig->sigev_notify;
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
nsig->sigev_value.sival_ptr = osig->sigev_value.sival_ptr;
break;
default:
return (EINVAL);
}
return (0);
}
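/*
* Copy in an aiocb that uses the old osigevent layout and convert its
* sigevent to the current format.
*/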
static int
aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb *ojob;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, kjob, sizeof(struct oaiocb));
if (error)
return (error);
ojob = (struct oaiocb *)kjob;
return (convert_old_sigevent(&ojob->aio_sigevent, &kjob->aio_sigevent));
}
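/*
* Native-ABI aiocb_ops methods: the user control block is accessed
* directly with copyin(), fuword(), and suword().
*/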
static int
aiocb_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
return (copyin(ujob, kjob, sizeof(struct aiocb)));
}
static long
aiocb_fetch_status(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.status));
}
static long
aiocb_fetch_error(struct aiocb *ujob)
{
return (fuword(&ujob->_aiocb_private.error));
}
static int
aiocb_store_status(struct aiocb *ujob, long status)
{
return (suword(&ujob->_aiocb_private.status, status));
}
static int
aiocb_store_error(struct aiocb *ujob, long error)
{
return (suword(&ujob->_aiocb_private.error, error));
}
static int
aiocb_store_kernelinfo(struct aiocb *ujob, long jobref)
{
return (suword(&ujob->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb_ops = {
.copyin = aiocb_copyin,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
static struct aiocb_ops aiocb_ops_osigevent = {
.copyin = aiocb_copyin_old_sigevent,
.fetch_status = aiocb_fetch_status,
.fetch_error = aiocb_fetch_error,
.store_status = aiocb_store_status,
.store_error = aiocb_store_error,
.store_kernelinfo = aiocb_store_kernelinfo,
.store_aiocb = aiocb_store_aiocb,
};
/*
* Queue a new AIO request.  The choice between the threaded and the direct
* physio (VCHR) technique is made in this code.
*/
int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct file *fp;
struct socket *so;
struct aiocblist *aiocbe, *cb;
struct kaioinfo *ki;
struct kevent kev;
struct sockbuf *sb;
int opcode;
int error;
int fd, kqfd;
int jid;
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
ops->store_status(job, -1);
ops->store_error(job, 0);
ops->store_kernelinfo(job, -1);
if (num_queue_count >= max_queue_count ||
ki->kaio_count >= ki->kaio_qallowed_count) {
ops->store_error(job, EAGAIN);
return (EAGAIN);
}
aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
aiocbe->inputcharge = 0;
aiocbe->outputcharge = 0;
knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
error = ops->copyin(job, &aiocbe->uaiocb);
if (error) {
ops->store_error(job, error);
uma_zfree(aiocb_zone, aiocbe);
return (error);
}
if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
ops->store_error(job, EINVAL);
uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
uma_zfree(aiocb_zone, aiocbe);
return (EINVAL);
}
ksiginfo_init(&aiocbe->ksi);
/* Save userspace address of the job info. */
aiocbe->uuaiocb = job;
/* Get the opcode. */
if (type != LIO_NOP)
aiocbe->uaiocb.aio_lio_opcode = type;
opcode = aiocbe->uaiocb.aio_lio_opcode;
/*
* Validate the opcode and fetch the file object for the specified
* file descriptor.
*
* XXXRW: Moved the opcode validation up here so that we don't
* retrieve a file descriptor without knowing what the capability
* should be.
*/
fd = aiocbe->uaiocb.aio_fildes;
switch (opcode) {
case LIO_WRITE:
error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
break;
case LIO_READ:
error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
break;
case LIO_SYNC:
error = fget(td, fd, CAP_FSYNC, &fp);
break;
case LIO_NOP:
error = fget(td, fd, 0, &fp);
break;
default:
error = EINVAL;
}
if (error) {
uma_zfree(aiocb_zone, aiocbe);
ops->store_error(job, error);
return (error);
}
if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
error = EINVAL;
goto aqueue_fail;
}
if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
error = EINVAL;
goto aqueue_fail;
}
aiocbe->fd_file = fp;
mtx_lock(&aio_job_mtx);
jid = jobrefid++;
aiocbe->seqno = jobseqno++;
mtx_unlock(&aio_job_mtx);
error = ops->store_kernelinfo(job, jid);
if (error) {
error = EINVAL;
goto aqueue_fail;
}
aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
if (opcode == LIO_NOP) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
return (0);
}
if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
goto no_kqueue;
kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
kev.ident = (uintptr_t)aiocbe->uuaiocb;
kev.filter = EVFILT_AIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.data = (intptr_t)aiocbe;
kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
error = kqfd_register(kqfd, &kev, td, 1);
aqueue_fail:
if (error) {
fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
ops->store_error(job, error);
goto done;
}
no_kqueue:
ops->store_error(job, EINPROGRESS);
aiocbe->uaiocb._aiocb_private.error = EINPROGRESS;
aiocbe->userproc = p;
aiocbe->cred = crhold(td->td_ucred);
aiocbe->jobflags = 0;
aiocbe->lio = lj;
if (opcode == LIO_SYNC)
goto queueit;
if (fp->f_type == DTYPE_SOCKET) {
/*
* Alternate queueing for socket ops: Reach down into the
* descriptor to get the socket data. Then check to see if the
* socket is ready to be read or written (based on the requested
* operation).
*
* If it is not ready for I/O, then queue the aiocbe on the
* socket, and set the flags so we get a call when sbnotify()
* happens.
*
* Note if opcode is neither LIO_WRITE nor LIO_READ we lock
* and unlock the snd sockbuf for no reason.
*/
so = fp->f_data;
sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
SOCKBUF_LOCK(sb);
if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
LIO_WRITE) && (!sowriteable(so)))) {
sb->sb_flags |= SB_AIO;
mtx_lock(&aio_job_mtx);
TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
mtx_unlock(&aio_job_mtx);
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
aiocbe->jobstate = JOBST_JOBQSOCK;
ki->kaio_count++;
if (lj)
lj->lioj_count++;
AIO_UNLOCK(ki);
SOCKBUF_UNLOCK(sb);
atomic_add_int(&num_queue_count, 1);
error = 0;
goto done;
}
SOCKBUF_UNLOCK(sb);
}
if ((error = aio_qphysio(p, aiocbe)) == 0)
goto done;
#if 0
if (error > 0) {
aiocbe->uaiocb._aiocb_private.error = error;
ops->store_error(job, error);
goto done;
}
#endif
queueit:
/* No buffer for daemon I/O. */
aiocbe->bp = NULL;
atomic_add_int(&num_queue_count, 1);
AIO_LOCK(ki);
ki->kaio_count++;
if (lj)
lj->lioj_count++;
TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
if (opcode == LIO_SYNC) {
TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
if (cb->fd_file == aiocbe->fd_file &&
cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
cb->seqno < aiocbe->seqno) {
cb->jobflags |= AIOCBLIST_CHECKSYNC;
aiocbe->pending++;
}
}
TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
if (cb->fd_file == aiocbe->fd_file &&
cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
cb->seqno < aiocbe->seqno) {
cb->jobflags |= AIOCBLIST_CHECKSYNC;
aiocbe->pending++;
}
}
if (aiocbe->pending != 0) {
TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
aiocbe->jobstate = JOBST_JOBQSYNC;
AIO_UNLOCK(ki);
goto done;
}
}
mtx_lock(&aio_job_mtx);
TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
aiocbe->jobstate = JOBST_JOBQGLOBAL;
aio_kick_nowait(p);
mtx_unlock(&aio_job_mtx);
AIO_UNLOCK(ki);
error = 0;
done:
return (error);
}
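/*
* Wake up an idle AIO daemon or, if allowed, schedule the per-process task
* to create a new one.  Called with aio_job_mtx held.
*/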
static void
aio_kick_nowait(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aiothreadlist *aiop;
mtx_assert(&aio_job_mtx, MA_OWNED);
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
wakeup(aiop->aiothread);
} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
((ki->kaio_active_count + num_aio_resv_start) <
ki->kaio_maxactive_count)) {
taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
}
}
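/*
* Like aio_kick_nowait(), but may drop aio_job_mtx temporarily to create a
* new daemon itself; returns non-zero if no daemon could be woken or created.
*/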
static int
aio_kick(struct proc *userp)
{
struct kaioinfo *ki = userp->p_aioinfo;
struct aiothreadlist *aiop;
int error, ret = 0;
mtx_assert(&aio_job_mtx, MA_OWNED);
retryproc:
if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
TAILQ_REMOVE(&aio_freeproc, aiop, list);
aiop->aiothreadflags &= ~AIOP_FREE;
wakeup(aiop->aiothread);
} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
((ki->kaio_active_count + num_aio_resv_start) <
ki->kaio_maxactive_count)) {
num_aio_resv_start++;
mtx_unlock(&aio_job_mtx);
error = aio_newproc(&num_aio_resv_start);
mtx_lock(&aio_job_mtx);
if (error) {
num_aio_resv_start--;
goto retryproc;
}
} else {
ret = -1;
}
return (ret);
}
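/*
* Taskqueue handler that kicks the AIO daemons on behalf of a user process.
*/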
static void
aio_kick_helper(void *context, int pending)
{
struct proc *userp = context;
mtx_lock(&aio_job_mtx);
while (--pending >= 0) {
if (aio_kick(userp))
break;
}
mtx_unlock(&aio_job_mtx);
}
/*
* Support the aio_return system call; as a side effect, kernel resources
* are released.
*/
static int
kern_aio_return(struct thread *td, struct aiocb *uaiocb, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocblist *cb;
struct kaioinfo *ki;
int status, error;
ki = p->p_aioinfo;
if (ki == NULL)
return (EINVAL);
AIO_LOCK(ki);
TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
if (cb->uuaiocb == uaiocb)
break;
}
if (cb != NULL) {
MPASS(cb->jobstate == JOBST_JOBFINISHED);
status = cb->uaiocb._aiocb_private.status;
error = cb->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
td->td_ru.ru_oublock += cb->outputcharge;
cb->outputcharge = 0;
} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
td->td_ru.ru_inblock += cb->inputcharge;
cb->inputcharge = 0;
}
aio_free_entry(cb);
AIO_UNLOCK(ki);
ops->store_error(uaiocb, error);
ops->store_status(uaiocb, status);
} else {
error = EINVAL;
AIO_UNLOCK(ki);
}
return (error);
}
int
-aio_return(struct thread *td, struct aio_return_args *uap)
+sys_aio_return(struct thread *td, struct aio_return_args *uap)
{
return (kern_aio_return(td, uap->aiocbp, &aiocb_ops));
}
/*
* Allow a process to wake up when any of the I/O requests are completed.
*/
static int
kern_aio_suspend(struct thread *td, int njoblist, struct aiocb **ujoblist,
struct timespec *ts)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct aiocblist *cb, *cbfirst;
int error, i, timo;
timo = 0;
if (ts) {
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
ki = p->p_aioinfo;
if (ki == NULL)
return (EAGAIN);
if (njoblist == 0)
return (0);
AIO_LOCK(ki);
for (;;) {
cbfirst = NULL;
error = 0;
TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
for (i = 0; i < njoblist; i++) {
if (cb->uuaiocb == ujoblist[i]) {
if (cbfirst == NULL)
cbfirst = cb;
if (cb->jobstate == JOBST_JOBFINISHED)
goto RETURN;
}
}
}
/* All tasks were finished. */
if (cbfirst == NULL)
break;
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiospn", timo);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
RETURN:
AIO_UNLOCK(ki);
return (error);
}
int
-aio_suspend(struct thread *td, struct aio_suspend_args *uap)
+sys_aio_suspend(struct thread *td, struct aio_suspend_args *uap)
{
struct timespec ts, *tsp;
struct aiocb **ujoblist;
int error;
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts, sizeof(ts))) != 0)
return (error);
tsp = &ts;
} else
tsp = NULL;
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
error = copyin(uap->aiocbp, ujoblist, uap->nent * sizeof(ujoblist[0]));
if (error == 0)
error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
uma_zfree(aiol_zone, ujoblist);
return (error);
}
/*
* aio_cancel cancels any non-physio aio operations not currently in
* progress.
*/
int
-aio_cancel(struct thread *td, struct aio_cancel_args *uap)
+sys_aio_cancel(struct thread *td, struct aio_cancel_args *uap)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
struct aiocblist *cbe, *cbn;
struct file *fp;
struct socket *so;
int error;
int remove;
int cancelled = 0;
int notcancelled = 0;
struct vnode *vp;
/* Lookup file object. */
error = fget(td, uap->fd, 0, &fp);
if (error)
return (error);
ki = p->p_aioinfo;
if (ki == NULL)
goto done;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vn_isdisk(vp, &error)) {
fdrop(fp, td);
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
}
AIO_LOCK(ki);
TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
if ((uap->fd == cbe->uaiocb.aio_fildes) &&
((uap->aiocbp == NULL) ||
(uap->aiocbp == cbe->uuaiocb))) {
remove = 0;
mtx_lock(&aio_job_mtx);
if (cbe->jobstate == JOBST_JOBQGLOBAL) {
TAILQ_REMOVE(&aio_jobs, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSOCK) {
MPASS(fp->f_type == DTYPE_SOCKET);
so = fp->f_data;
TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
remove = 1;
} else if (cbe->jobstate == JOBST_JOBQSYNC) {
TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
remove = 1;
}
mtx_unlock(&aio_job_mtx);
if (remove) {
TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
cbe->uaiocb._aiocb_private.status = -1;
cbe->uaiocb._aiocb_private.error = ECANCELED;
aio_bio_done_notify(p, cbe, DONE_QUEUE);
cancelled++;
} else {
notcancelled++;
}
if (uap->aiocbp != NULL)
break;
}
}
AIO_UNLOCK(ki);
done:
fdrop(fp, td);
if (uap->aiocbp != NULL) {
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
}
if (notcancelled) {
td->td_retval[0] = AIO_NOTCANCELED;
return (0);
}
if (cancelled) {
td->td_retval[0] = AIO_CANCELED;
return (0);
}
td->td_retval[0] = AIO_ALLDONE;
return (0);
}
/*
* aio_error is implemented at the kernel level for compatibility purposes
* only. For a user mode async implementation, it would be best to do it in
* a userland subroutine.
*/
static int
kern_aio_error(struct thread *td, struct aiocb *aiocbp, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocblist *cb;
struct kaioinfo *ki;
int status;
ki = p->p_aioinfo;
if (ki == NULL) {
td->td_retval[0] = EINVAL;
return (0);
}
AIO_LOCK(ki);
TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
if (cb->uuaiocb == aiocbp) {
if (cb->jobstate == JOBST_JOBFINISHED)
td->td_retval[0] =
cb->uaiocb._aiocb_private.error;
else
td->td_retval[0] = EINPROGRESS;
AIO_UNLOCK(ki);
return (0);
}
}
AIO_UNLOCK(ki);
/*
* Hack for failure of aio_aqueue.
*/
status = ops->fetch_status(aiocbp);
if (status == -1) {
td->td_retval[0] = ops->fetch_error(aiocbp);
return (0);
}
td->td_retval[0] = EINVAL;
return (0);
}
int
-aio_error(struct thread *td, struct aio_error_args *uap)
+sys_aio_error(struct thread *td, struct aio_error_args *uap)
{
return (kern_aio_error(td, uap->aiocbp, &aiocb_ops));
}
/* syscall - asynchronous read from a file (REALTIME) */
int
-oaio_read(struct thread *td, struct oaio_read_args *uap)
+sys_oaio_read(struct thread *td, struct oaio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb_ops_osigevent));
}
int
-aio_read(struct thread *td, struct aio_read_args *uap)
+sys_aio_read(struct thread *td, struct aio_read_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, &aiocb_ops));
}
/* syscall - asynchronous write to a file (REALTIME) */
int
-oaio_write(struct thread *td, struct oaio_write_args *uap)
+sys_oaio_write(struct thread *td, struct oaio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb_ops_osigevent));
}
int
-aio_write(struct thread *td, struct aio_write_args *uap)
+sys_aio_write(struct thread *td, struct aio_write_args *uap)
{
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
}
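/*
* Common implementation for lio_listio() and olio_listio(): queue up to nent
* requests as one lio job and either wait for them (LIO_WAIT) or arrange
* signal/kevent notification (LIO_NOWAIT).
*/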
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
struct aiocb **acb_list, int nent, struct sigevent *sig,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct aiocb *iocb;
struct kaioinfo *ki;
struct aioliojob *lj;
struct kevent kev;
int error;
int nerror;
int i;
if ((mode != LIO_NOWAIT) && (mode != LIO_WAIT))
return (EINVAL);
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
lj = uma_zalloc(aiolio_zone, M_WAITOK);
lj->lioj_flags = 0;
lj->lioj_count = 0;
lj->lioj_finished_count = 0;
knlist_init_mtx(&lj->klist, AIO_MTX(ki));
ksiginfo_init(&lj->lioj_ksi);
/*
* Set up the signal.
*/
if (sig && (mode == LIO_NOWAIT)) {
bcopy(sig, &lj->lioj_signal, sizeof(lj->lioj_signal));
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
/* Assume only new style KEVENT */
kev.filter = EVFILT_LIO;
kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
kev.ident = (uintptr_t)uacb_list; /* something unique */
kev.data = (intptr_t)lj;
/* pass user defined sigval data */
kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
error = kqfd_register(
lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
if (error) {
uma_zfree(aiolio_zone, lj);
return (error);
}
} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
;
} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
lj->lioj_flags |= LIOJ_SIGNAL;
} else {
uma_zfree(aiolio_zone, lj);
return EINVAL;
}
}
AIO_LOCK(ki);
TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
/*
* Add an extra reference to the lio count so that the lio cannot be
* freed by other threads doing aio_waitcomplete() or aio_return(),
* and so that no event is sent until we have queued all of the
* requests.
*/
lj->lioj_count = 1;
AIO_UNLOCK(ki);
/*
* Get pointers to the list of I/O requests.
*/
nerror = 0;
for (i = 0; i < nent; i++) {
iocb = acb_list[i];
if (iocb != NULL) {
error = aio_aqueue(td, iocb, lj, LIO_NOP, ops);
if (error != 0)
nerror++;
}
}
error = 0;
AIO_LOCK(ki);
if (mode == LIO_WAIT) {
while (lj->lioj_count - 1 != lj->lioj_finished_count) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki),
PRIBIO | PCATCH, "aiospn", 0);
if (error == ERESTART)
error = EINTR;
if (error)
break;
}
} else {
if (lj->lioj_count - 1 == lj->lioj_finished_count) {
if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
lj->lioj_flags |= LIOJ_KEVENT_POSTED;
KNOTE_LOCKED(&lj->klist, 1);
}
if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
== LIOJ_SIGNAL
&& (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
aio_sendsig(p, &lj->lioj_signal,
&lj->lioj_ksi);
lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
}
}
}
lj->lioj_count--;
if (lj->lioj_count == 0) {
TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
knlist_delete(&lj->klist, curthread, 1);
PROC_LOCK(p);
sigqueue_take(&lj->lioj_ksi);
PROC_UNLOCK(p);
AIO_UNLOCK(ki);
uma_zfree(aiolio_zone, lj);
} else
AIO_UNLOCK(ki);
if (nerror)
return (EIO);
return (error);
}
/* syscall - list directed I/O (REALTIME) */
int
-olio_listio(struct thread *td, struct olio_listio_args *uap)
+sys_olio_listio(struct thread *td, struct olio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent osig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
/* syscall - list directed I/O (REALTIME) */
int
-lio_listio(struct thread *td, struct lio_listio_args *uap)
+sys_lio_listio(struct thread *td, struct lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
int error, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig, sizeof(sig));
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list, nent * sizeof(acb_list[0]));
if (error == 0)
error = kern_lio_listio(td, uap->mode, uap->acb_list, acb_list,
nent, sigp, &aiocb_ops);
free(acb_list, M_LIO);
return (error);
}
/*
* Called from the interrupt thread for physio.  We should return as fast
* as possible, so we schedule a biohelper task.
*/
static void
aio_physwakeup(struct buf *bp)
{
struct aiocblist *aiocbe;
aiocbe = (struct aiocblist *)bp->b_caller1;
taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
}
/*
* Task routine to perform the heavy work: completion handling, process
* wakeups, and signals.
*/
static void
biohelper(void *context, int pending)
{
struct aiocblist *aiocbe = context;
struct buf *bp;
struct proc *userp;
struct kaioinfo *ki;
int nblks;
bp = aiocbe->bp;
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
aiocbe->uaiocb._aiocb_private.error = 0;
if (bp->b_ioflags & BIO_ERROR)
aiocbe->uaiocb._aiocb_private.error = bp->b_error;
nblks = btodb(aiocbe->uaiocb.aio_nbytes);
if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
aiocbe->outputcharge += nblks;
else
aiocbe->inputcharge += nblks;
aiocbe->bp = NULL;
TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
ki->kaio_buffer_count--;
aio_bio_done_notify(userp, aiocbe, DONE_BUF);
AIO_UNLOCK(ki);
/* Release mapping into kernel space. */
vunmapbuf(bp);
relpbuf(bp, NULL);
atomic_subtract_int(&num_buf_aio, 1);
}
/* syscall - wait for the next completion of an aio request */
static int
kern_aio_waitcomplete(struct thread *td, struct aiocb **aiocbp,
struct timespec *ts, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct timeval atv;
struct kaioinfo *ki;
struct aiocblist *cb;
struct aiocb *uuaiocb;
int error, status, timo;
ops->store_aiocb(aiocbp, NULL);
timo = 0;
if (ts) {
if ((ts->tv_nsec < 0) || (ts->tv_nsec >= 1000000000))
return (EINVAL);
TIMESPEC_TO_TIMEVAL(&atv, ts);
if (itimerfix(&atv))
return (EINVAL);
timo = tvtohz(&atv);
}
if (p->p_aioinfo == NULL)
aio_init_aioinfo(p);
ki = p->p_aioinfo;
error = 0;
cb = NULL;
AIO_LOCK(ki);
while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
ki->kaio_flags |= KAIO_WAKEUP;
error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
"aiowc", timo);
if (timo && error == ERESTART)
error = EINTR;
if (error)
break;
}
if (cb != NULL) {
MPASS(cb->jobstate == JOBST_JOBFINISHED);
uuaiocb = cb->uuaiocb;
status = cb->uaiocb._aiocb_private.status;
error = cb->uaiocb._aiocb_private.error;
td->td_retval[0] = status;
if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
td->td_ru.ru_oublock += cb->outputcharge;
cb->outputcharge = 0;
} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
td->td_ru.ru_inblock += cb->inputcharge;
cb->inputcharge = 0;
}
aio_free_entry(cb);
AIO_UNLOCK(ki);
ops->store_aiocb(aiocbp, uuaiocb);
ops->store_error(uuaiocb, error);
ops->store_status(uuaiocb, status);
} else
AIO_UNLOCK(ki);
return (error);
}
int
-aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
+sys_aio_waitcomplete(struct thread *td, struct aio_waitcomplete_args *uap)
{
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts, sizeof(ts));
if (error)
return (error);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, uap->aiocbp, tsp, &aiocb_ops));
}
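A hedged userland sketch of the FreeBSD-specific aio_waitcomplete(2) call served above; the argument layout mirrors uap->aiocbp and uap->timeout, and the timeout value is illustrative.
#include <sys/types.h>
#include <aio.h>
#include <time.h>
/*
 * Reap whichever request finishes next: the kernel stores the user's
 * aiocb pointer through *iocbp and returns that request's result.
 */
static ssize_t
reap_next(struct aiocb **iocbp)
{
	struct timespec ts;

	ts.tv_sec = 5;		/* illustrative five-second timeout */
	ts.tv_nsec = 0;
	return (aio_waitcomplete(iocbp, &ts));
}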
static int
kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
struct kaioinfo *ki;
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
ki = p->p_aioinfo;
if (ki == NULL)
aio_init_aioinfo(p);
return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
}
int
-aio_fsync(struct thread *td, struct aio_fsync_args *uap)
+sys_aio_fsync(struct thread *td, struct aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, uap->aiocbp, &aiocb_ops));
}
/* kqueue attach function */
static int
filt_aioattach(struct knote *kn)
{
struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
/*
* The aiocbe pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_aio = aiocbe;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&aiocbe->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_aiodetach(struct knote *kn)
{
struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
if (!knlist_empty(&aiocbe->klist))
knlist_remove(&aiocbe->klist, kn, 0);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_aio(struct knote *kn, long hint)
{
struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
if (aiocbe->jobstate != JOBST_JOBFINISHED)
return (0);
kn->kn_flags |= EV_EOF;
return (1);
}
/* kqueue attach function */
static int
filt_lioattach(struct knote *kn)
{
struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
/*
* The aioliojob pointer must be validated before using it, so
* registration is restricted to the kernel; the user cannot
* set EV_FLAG1.
*/
if ((kn->kn_flags & EV_FLAG1) == 0)
return (EPERM);
kn->kn_ptr.p_lio = lj;
kn->kn_flags &= ~EV_FLAG1;
knlist_add(&lj->klist, kn, 0);
return (0);
}
/* kqueue detach function */
static void
filt_liodetach(struct knote *kn)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
if (!knlist_empty(&lj->klist))
knlist_remove(&lj->klist, kn, 0);
}
/* kqueue filter function */
/*ARGSUSED*/
static int
filt_lio(struct knote *kn, long hint)
{
struct aioliojob * lj = kn->kn_ptr.p_lio;
return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
}
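The filters above back the EVFILT_AIO knotes registered on a request's behalf when its sigevent asks for SIGEV_KEVENT (the same notify type convert_sigevent32() accepts). A hedged userland sketch, assuming the caller has only this one request outstanding on the kqueue:
#include <sys/types.h>
#include <sys/event.h>
#include <aio.h>
#include <string.h>
/* Queue one read and wait for its completion to be posted on kqueue 'kq'. */
static ssize_t
read_via_kqueue(int kq, int fd, char *buf, size_t len)
{
	struct aiocb cb;
	struct kevent ev;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_buf = buf;
	cb.aio_nbytes = len;
	/* Deliver completion as a kevent on kq rather than as a signal. */
	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
	cb.aio_sigevent.sigev_notify_kqueue = kq;
	cb.aio_sigevent.sigev_value.sival_ptr = &cb;

	if (aio_read(&cb) == -1)
		return (-1);
	/* The EVFILT_AIO knote registered by the kernel fires here. */
	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
		return (-1);
	return (aio_return(&cb));
}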
#ifdef COMPAT_FREEBSD32
struct __aiocb_private32 {
int32_t status;
int32_t error;
uint32_t kernelinfo;
};
typedef struct oaiocb32 {
int aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
struct osigevent32 aio_sigevent; /* Signal to deliver */
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
} oaiocb32_t;
typedef struct aiocb32 {
int32_t aio_fildes; /* File descriptor */
uint64_t aio_offset __packed; /* File offset for I/O */
uint32_t aio_buf; /* I/O buffer in process space */
uint32_t aio_nbytes; /* Number of bytes for I/O */
int __spare__[2];
uint32_t __spare2__;
int aio_lio_opcode; /* LIO opcode */
int aio_reqprio; /* Request priority -- ignored */
struct __aiocb_private32 _aiocb_private;
struct sigevent32 aio_sigevent; /* Signal to deliver */
} aiocb32_t;
static int
convert_old_sigevent32(struct osigevent32 *osig, struct sigevent *nsig)
{
/*
* Only SIGEV_NONE, SIGEV_SIGNAL, and SIGEV_KEVENT are
* supported by AIO with the old sigevent structure.
*/
CP(*osig, *nsig, sigev_notify);
switch (nsig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_SIGNAL:
nsig->sigev_signo = osig->__sigev_u.__sigev_signo;
break;
case SIGEV_KEVENT:
nsig->sigev_notify_kqueue =
osig->__sigev_u.__sigev_notify_kqueue;
PTRIN_CP(*osig, *nsig, sigev_value.sival_ptr);
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb32_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
{
struct oaiocb32 job32;
int error;
bzero(kjob, sizeof(struct aiocb));
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_old_sigevent32(&job32.aio_sigevent,
&kjob->aio_sigevent));
}
static int
convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
{
CP(*sig32, *sig, sigev_notify);
switch (sig->sigev_notify) {
case SIGEV_NONE:
break;
case SIGEV_THREAD_ID:
CP(*sig32, *sig, sigev_notify_thread_id);
/* FALLTHROUGH */
case SIGEV_SIGNAL:
CP(*sig32, *sig, sigev_signo);
break;
case SIGEV_KEVENT:
CP(*sig32, *sig, sigev_notify_kqueue);
PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
break;
default:
return (EINVAL);
}
return (0);
}
static int
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
struct aiocb32 job32;
int error;
error = copyin(ujob, &job32, sizeof(job32));
if (error)
return (error);
CP(job32, *kjob, aio_fildes);
CP(job32, *kjob, aio_offset);
PTRIN_CP(job32, *kjob, aio_buf);
CP(job32, *kjob, aio_nbytes);
CP(job32, *kjob, aio_lio_opcode);
CP(job32, *kjob, aio_reqprio);
CP(job32, *kjob, _aiocb_private.status);
CP(job32, *kjob, _aiocb_private.error);
PTRIN_CP(job32, *kjob, _aiocb_private.kernelinfo);
return (convert_sigevent32(&job32.aio_sigevent, &kjob->aio_sigevent));
}
static long
aiocb32_fetch_status(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.status));
}
static long
aiocb32_fetch_error(struct aiocb *ujob)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (fuword32(&ujob32->_aiocb_private.error));
}
static int
aiocb32_store_status(struct aiocb *ujob, long status)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.status, status));
}
static int
aiocb32_store_error(struct aiocb *ujob, long error)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.error, error));
}
static int
aiocb32_store_kernelinfo(struct aiocb *ujob, long jobref)
{
struct aiocb32 *ujob32;
ujob32 = (struct aiocb32 *)ujob;
return (suword32(&ujob32->_aiocb_private.kernelinfo, jobref));
}
static int
aiocb32_store_aiocb(struct aiocb **ujobp, struct aiocb *ujob)
{
return (suword32(ujobp, (long)ujob));
}
static struct aiocb_ops aiocb32_ops = {
.copyin = aiocb32_copyin,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
static struct aiocb_ops aiocb32_ops_osigevent = {
.copyin = aiocb32_copyin_old_sigevent,
.fetch_status = aiocb32_fetch_status,
.fetch_error = aiocb32_fetch_error,
.store_status = aiocb32_store_status,
.store_error = aiocb32_store_error,
.store_kernelinfo = aiocb32_store_kernelinfo,
.store_aiocb = aiocb32_store_aiocb,
};
int
freebsd32_aio_return(struct thread *td, struct freebsd32_aio_return_args *uap)
{
return (kern_aio_return(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
int
freebsd32_aio_suspend(struct thread *td, struct freebsd32_aio_suspend_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
struct aiocb **ujoblist;
uint32_t *ujoblist32;
int error, i;
if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->timeout) {
/* Get timespec struct. */
if ((error = copyin(uap->timeout, &ts32, sizeof(ts32))) != 0)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
ujoblist32 = (uint32_t *)ujoblist;
error = copyin(uap->aiocbp, ujoblist32, uap->nent *
sizeof(ujoblist32[0]));
if (error == 0) {
for (i = uap->nent - 1; i >= 0; i--)
ujoblist[i] = PTRIN(ujoblist32[i]);

error = kern_aio_suspend(td, uap->nent, ujoblist, tsp);
}
uma_zfree(aiol_zone, ujoblist);
return (error);
}
int
freebsd32_aio_cancel(struct thread *td, struct freebsd32_aio_cancel_args *uap)
{
- return (aio_cancel(td, (struct aio_cancel_args *)uap));
+ return (sys_aio_cancel(td, (struct aio_cancel_args *)uap));
}
int
freebsd32_aio_error(struct thread *td, struct freebsd32_aio_error_args *uap)
{
return (kern_aio_error(td, (struct aiocb *)uap->aiocbp, &aiocb32_ops));
}
int
freebsd32_oaio_read(struct thread *td, struct freebsd32_oaio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops_osigevent));
}
int
freebsd32_aio_read(struct thread *td, struct freebsd32_aio_read_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ,
&aiocb32_ops));
}
int
freebsd32_oaio_write(struct thread *td, struct freebsd32_oaio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops_osigevent));
}
int
freebsd32_aio_write(struct thread *td, struct freebsd32_aio_write_args *uap)
{
return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE,
&aiocb32_ops));
}
int
freebsd32_aio_waitcomplete(struct thread *td,
struct freebsd32_aio_waitcomplete_args *uap)
{
struct timespec32 ts32;
struct timespec ts, *tsp;
int error;
if (uap->timeout) {
/* Get timespec struct. */
error = copyin(uap->timeout, &ts32, sizeof(ts32));
if (error)
return (error);
CP(ts32, ts, tv_sec);
CP(ts32, ts, tv_nsec);
tsp = &ts;
} else
tsp = NULL;
return (kern_aio_waitcomplete(td, (struct aiocb **)uap->aiocbp, tsp,
&aiocb32_ops));
}
int
freebsd32_aio_fsync(struct thread *td, struct freebsd32_aio_fsync_args *uap)
{
return (kern_aio_fsync(td, uap->op, (struct aiocb *)uap->aiocbp,
&aiocb32_ops));
}
int
freebsd32_olio_listio(struct thread *td, struct freebsd32_olio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct osigevent32 osig;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &osig, sizeof(osig));
if (error)
return (error);
error = convert_old_sigevent32(&osig, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops_osigevent);
free(acb_list, M_LIO);
return (error);
}
int
freebsd32_lio_listio(struct thread *td, struct freebsd32_lio_listio_args *uap)
{
struct aiocb **acb_list;
struct sigevent *sigp, sig;
struct sigevent32 sig32;
uint32_t *acb_list32;
int error, i, nent;
if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
return (EINVAL);
nent = uap->nent;
if (nent < 0 || nent > AIO_LISTIO_MAX)
return (EINVAL);
if (uap->sig && (uap->mode == LIO_NOWAIT)) {
error = copyin(uap->sig, &sig32, sizeof(sig32));
if (error)
return (error);
error = convert_sigevent32(&sig32, &sig);
if (error)
return (error);
sigp = &sig;
} else
sigp = NULL;
acb_list32 = malloc(sizeof(uint32_t) * nent, M_LIO, M_WAITOK);
error = copyin(uap->acb_list, acb_list32, nent * sizeof(uint32_t));
if (error) {
free(acb_list32, M_LIO);
return (error);
}
acb_list = malloc(sizeof(struct aiocb *) * nent, M_LIO, M_WAITOK);
for (i = 0; i < nent; i++)
acb_list[i] = PTRIN(acb_list32[i]);
free(acb_list32, M_LIO);
error = kern_lio_listio(td, uap->mode,
(struct aiocb * const *)uap->acb_list, acb_list, nent, sigp,
&aiocb32_ops);
free(acb_list, M_LIO);
return (error);
}
#endif
Index: head/sys/kern/vfs_cache.c
===================================================================
--- head/sys/kern/vfs_cache.c (revision 225616)
+++ head/sys/kern/vfs_cache.c (revision 225617)
@@ -1,1248 +1,1248 @@
/*-
* Copyright (c) 1989, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Poul-Henning Kamp of the FreeBSD Project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/filedesc.h>
#include <sys/fnv_hash.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <vm/uma.h>
SDT_PROVIDER_DECLARE(vfs);
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *",
"struct char *", "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *");
SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int",
"struct vnode *", "struct char *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative,
"struct vnode *", "char *");
SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *",
"char *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *",
"char *");
/*
* This structure describes the elements in the cache of recent
* names looked up by namei.
*/
struct namecache {
LIST_ENTRY(namecache) nc_hash; /* hash chain */
LIST_ENTRY(namecache) nc_src; /* source vnode list */
TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */
struct vnode *nc_dvp; /* vnode of parent of name */
struct vnode *nc_vp; /* vnode the name refers to */
u_char nc_flag; /* flag bits */
u_char nc_nlen; /* length of name */
char nc_name[0]; /* segment name + nul */
};
/*
* Name caching works as follows:
*
* Names found by directory scans are retained in a cache
* for future reference. It is managed LRU, so frequently
* used names will hang around. Cache is indexed by hash value
* obtained from (vp, name) where vp refers to the directory
* containing name.
*
* If it is a "negative" entry (i.e. for a name that is known NOT to
* exist), the vnode pointer will be NULL.
*
* Upon reaching the last segment of a path, if the reference
* is for DELETE, or NOCACHE is set (rewrite), and the
* name is located in the cache, it will be dropped.
*/
/*
* Structures associated with name caching.
*/
#define NCHHASH(hash) \
(&nchashtbl[(hash) & nchash])
static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
static TAILQ_HEAD(, namecache) ncneg; /* LRU list of negative entries */
static u_long nchash; /* size of hash table */
SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
"Size of namecache hash table");
static u_long ncnegfactor = 16; /* ratio of negative entries */
SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
"Ratio of negative namecache entries");
static u_long numneg; /* number of negative entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
"Number of negative entries in namecache");
static u_long numcache; /* number of cache entries allocated */
SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
"Number of namecache entries");
static u_long numcachehv; /* number of cache entries with vnodes held */
SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
"Number of namecache entries with vnodes held");
static u_int ncsizefactor = 2;
SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
"Size factor for namecache");
struct nchstats nchstats; /* cache effectiveness statistics */
static struct rwlock cache_lock;
RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
#define CACHE_RLOCK() rw_rlock(&cache_lock)
#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
#define CACHE_WLOCK() rw_wlock(&cache_lock)
#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
/*
* UMA zones for the VFS cache.
*
* The small cache is used for entries with short names, which are the
* most common. The large cache is used for entries which are too big to
* fit in the small cache.
*/
static uma_zone_t cache_zone_small;
static uma_zone_t cache_zone_large;
#define CACHE_PATH_CUTOFF 35
#define CACHE_ZONE_SMALL (sizeof(struct namecache) + CACHE_PATH_CUTOFF \
+ 1)
#define CACHE_ZONE_LARGE (sizeof(struct namecache) + NAME_MAX + 1)
#define cache_alloc(len) uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
cache_zone_small : cache_zone_large, M_WAITOK)
#define cache_free(ncp) do { \
if (ncp != NULL) \
uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
cache_zone_small : cache_zone_large, (ncp)); \
} while (0)
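As an illustrative restatement of the indexing described above (an assumed helper, not one this file defines): a (directory vnode, name) pair maps to its hash chain through an FNV-1 hash of the component name folded with the dvp pointer, the same computation cache_lookup() and cache_enter() perform inline below.
static __inline struct nchashhead *
nc_bucket_example(struct vnode *dvp, const char *name, int namelen)
{
	uint32_t hash;

	/* Hash the component name, then fold in the directory vnode pointer. */
	hash = fnv_32_buf(name, namelen, FNV1_32_INIT);
	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
	return (NCHHASH(hash));
}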
static int doingcache = 1; /* 1 => enable the cache */
SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
"VFS namecache enabled");
/* Export size information to userland */
SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
sizeof(struct namecache), "sizeof(struct namecache)");
/*
* The new name cache statistics
*/
static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
"Name cache statistics");
#define STATNODE(mode, name, var, descr) \
SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, mode, var, 0, descr);
STATNODE(CTLFLAG_RD, numneg, &numneg, "Number of negative cache entries");
STATNODE(CTLFLAG_RD, numcache, &numcache, "Number of cache entries");
static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls,
"Number of cache lookups");
static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits,
"Number of '.' hits");
static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits,
"Number of '..' hits");
static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks,
"Number of checks in lookup");
static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss,
"Number of cache misses");
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap,
"Number of cache misses we do not want to cache");
static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps,
"Number of cache hits (positive) we do not want to cache");
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits,
"Number of cache hits (positive)");
static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps,
"Number of cache hits (negative) we do not want to cache");
static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits,
"Number of cache hits (negative)");
static u_long numupgrades; STATNODE(CTLFLAG_RD, numupgrades, &numupgrades,
"Number of updates of the cache after lookup (write lock + retry)");
SYSCTL_OPAQUE(_vfs_cache, OID_AUTO, nchstats, CTLFLAG_RD | CTLFLAG_MPSAFE,
&nchstats, sizeof(nchstats), "LU",
"VFS cache effectiveness statistics");
static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
u_int *buflen);
static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
char *buf, char **retbuf, u_int buflen);
static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
/*
* Flags in namecache.nc_flag
*/
#define NCF_WHITE 0x01
#define NCF_ISDOTDOT 0x02
#ifdef DIAGNOSTIC
/*
* Grab an atomic snapshot of the name cache hash chain lengths
*/
SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL, "hash table stats");
static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
int error;
struct nchashhead *ncpp;
struct namecache *ncp;
int n_nchash;
int count;
n_nchash = nchash + 1; /* nchash is max index, not count */
if (!req->oldptr)
return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
/* Scan hash tables for applicable entries */
for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
CACHE_RLOCK();
count = 0;
LIST_FOREACH(ncp, ncpp, nc_hash) {
count++;
}
CACHE_RUNLOCK();
error = SYSCTL_OUT(req, &count, sizeof(count));
if (error)
return (error);
}
return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
"nchash chain lengths");
static int
sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
{
int error;
struct nchashhead *ncpp;
struct namecache *ncp;
int n_nchash;
int count, maxlength, used, pct;
if (!req->oldptr)
return SYSCTL_OUT(req, 0, 4 * sizeof(int));
n_nchash = nchash + 1; /* nchash is max index, not count */
used = 0;
maxlength = 0;
/* Scan hash tables for applicable entries */
for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
count = 0;
CACHE_RLOCK();
LIST_FOREACH(ncp, ncpp, nc_hash) {
count++;
}
CACHE_RUNLOCK();
if (count)
used++;
if (maxlength < count)
maxlength = count;
}
n_nchash = nchash + 1;
pct = (used * 100 * 100) / n_nchash;
error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
if (error)
return (error);
error = SYSCTL_OUT(req, &used, sizeof(used));
if (error)
return (error);
error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
if (error)
return (error);
error = SYSCTL_OUT(req, &pct, sizeof(pct));
if (error)
return (error);
return (0);
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
"nchash chain lengths");
#endif
/*
* cache_zap():
*
* Removes a namecache entry from cache, whether it contains an actual
* pointer to a vnode or if it is just a negative cache entry.
*/
static void
cache_zap(ncp)
struct namecache *ncp;
{
struct vnode *vp;
rw_assert(&cache_lock, RA_WLOCKED);
CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
#ifdef KDTRACE_HOOKS
if (ncp->nc_vp != NULL) {
SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
ncp->nc_name, ncp->nc_vp, 0, 0);
} else {
SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
ncp->nc_name, 0, 0, 0);
}
#endif
vp = NULL;
LIST_REMOVE(ncp, nc_hash);
if (ncp->nc_flag & NCF_ISDOTDOT) {
if (ncp == ncp->nc_dvp->v_cache_dd)
ncp->nc_dvp->v_cache_dd = NULL;
} else {
LIST_REMOVE(ncp, nc_src);
if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
vp = ncp->nc_dvp;
numcachehv--;
}
}
if (ncp->nc_vp) {
TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
if (ncp == ncp->nc_vp->v_cache_dd)
ncp->nc_vp->v_cache_dd = NULL;
} else {
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
numneg--;
}
numcache--;
cache_free(ncp);
if (vp)
vdrop(vp);
}
/*
* Lookup an entry in the cache
*
* Lookup is called with dvp pointing to the directory to search,
* cnp pointing to the name of the entry being sought. If the lookup
* succeeds, the vnode is returned in *vpp, and a status of -1 is
* returned. If the lookup determines that the name does not exist
* (negative caching), a status of ENOENT is returned. If the lookup
* fails, a status of zero is returned. If the directory vnode is
* recycled out from under us due to a forced unmount, a status of
* ENOENT is returned.
*
* vpp is locked and ref'd on return. If we're looking up DOTDOT, dvp is
* unlocked. If we're looking up ".", an extra ref is taken, but the lock is
* not recursively acquired.
*/
int
cache_lookup(dvp, vpp, cnp)
struct vnode *dvp;
struct vnode **vpp;
struct componentname *cnp;
{
struct namecache *ncp;
uint32_t hash;
int error, ltype, wlocked;
if (!doingcache) {
cnp->cn_flags &= ~MAKEENTRY;
return (0);
}
retry:
CACHE_RLOCK();
wlocked = 0;
numcalls++;
error = 0;
retry_wlocked:
if (cnp->cn_nameptr[0] == '.') {
if (cnp->cn_namelen == 1) {
*vpp = dvp;
CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
dvp, cnp->cn_nameptr);
dothits++;
SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
*vpp, 0, 0);
goto success;
}
if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
dotdothits++;
if (dvp->v_cache_dd == NULL) {
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
"..", NULL, 0, 0);
goto unlock;
}
if ((cnp->cn_flags & MAKEENTRY) == 0) {
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
cache_zap(dvp->v_cache_dd);
dvp->v_cache_dd = NULL;
CACHE_WUNLOCK();
return (0);
}
if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT)
*vpp = dvp->v_cache_dd->nc_vp;
else
*vpp = dvp->v_cache_dd->nc_dvp;
/* Return failure if negative entry was found. */
if (*vpp == NULL) {
ncp = dvp->v_cache_dd;
goto negative_success;
}
CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
dvp, cnp->cn_nameptr, *vpp);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
*vpp, 0, 0);
goto success;
}
}
hash = fnv_32_buf(cnp->cn_nameptr, cnp->cn_namelen, FNV1_32_INIT);
hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
numchecks++;
if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
!bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
break;
}
/* We failed to find an entry */
if (ncp == NULL) {
SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
NULL, 0, 0);
if ((cnp->cn_flags & MAKEENTRY) == 0) {
nummisszap++;
} else {
nummiss++;
}
nchstats.ncs_miss++;
goto unlock;
}
/* We don't want to have an entry, so dump it */
if ((cnp->cn_flags & MAKEENTRY) == 0) {
numposzaps++;
nchstats.ncs_badhits++;
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
cache_zap(ncp);
CACHE_WUNLOCK();
return (0);
}
/* We found a "positive" match, return the vnode */
if (ncp->nc_vp) {
numposhits++;
nchstats.ncs_goodhits++;
*vpp = ncp->nc_vp;
CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
dvp, cnp->cn_nameptr, *vpp, ncp);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
*vpp, 0, 0);
goto success;
}
negative_success:
/* We found a negative match, and want to create it, so purge */
if (cnp->cn_nameiop == CREATE) {
numnegzaps++;
nchstats.ncs_badhits++;
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
cache_zap(ncp);
CACHE_WUNLOCK();
return (0);
}
if (!wlocked && !CACHE_UPGRADE_LOCK())
goto wlock;
numneghits++;
/*
* We found a "negative" match, so we shift it to the end of
* the "negative" cache entries queue to satisfy LRU. Also,
* check to see if the entry is a whiteout; indicate this to
* the componentname, if so.
*/
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
nchstats.ncs_neghits++;
if (ncp->nc_flag & NCF_WHITE)
cnp->cn_flags |= ISWHITEOUT;
SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, ncp->nc_name,
0, 0, 0);
CACHE_WUNLOCK();
return (ENOENT);
wlock:
/*
* We need to update the cache after our lookup, so upgrade to
* a write lock and retry the operation.
*/
CACHE_RUNLOCK();
CACHE_WLOCK();
numupgrades++;
wlocked = 1;
goto retry_wlocked;
success:
/*
* On success we return a locked and ref'd vnode as per the lookup
* protocol.
*/
if (dvp == *vpp) { /* lookup on "." */
VREF(*vpp);
if (wlocked)
CACHE_WUNLOCK();
else
CACHE_RUNLOCK();
/*
* When we lookup "." we still can be asked to lock it
* differently...
*/
ltype = cnp->cn_lkflags & LK_TYPE_MASK;
if (ltype != VOP_ISLOCKED(*vpp)) {
if (ltype == LK_EXCLUSIVE) {
vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
if ((*vpp)->v_iflag & VI_DOOMED) {
/* forced unmount */
vrele(*vpp);
*vpp = NULL;
return (ENOENT);
}
} else
vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
}
return (-1);
}
ltype = 0; /* silence gcc warning */
if (cnp->cn_flags & ISDOTDOT) {
ltype = VOP_ISLOCKED(dvp);
VOP_UNLOCK(dvp, 0);
}
VI_LOCK(*vpp);
if (wlocked)
CACHE_WUNLOCK();
else
CACHE_RUNLOCK();
error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
if (cnp->cn_flags & ISDOTDOT) {
vn_lock(dvp, ltype | LK_RETRY);
if (dvp->v_iflag & VI_DOOMED) {
if (error == 0)
vput(*vpp);
*vpp = NULL;
return (ENOENT);
}
}
if (error) {
*vpp = NULL;
goto retry;
}
if ((cnp->cn_flags & ISLASTCN) &&
(cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
}
return (-1);
unlock:
if (wlocked)
CACHE_WUNLOCK();
else
CACHE_RUNLOCK();
return (0);
}
/*
* Add an entry to the cache.
*/
void
cache_enter(dvp, vp, cnp)
struct vnode *dvp;
struct vnode *vp;
struct componentname *cnp;
{
struct namecache *ncp, *n2;
struct nchashhead *ncpp;
uint32_t hash;
int flag;
int hold;
int zap;
int len;
CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
("cache_enter: Adding a doomed vnode"));
VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
("cache_enter: Doomed vnode used as src"));
if (!doingcache)
return;
/*
* Avoid blowout in namecache entries.
*/
if (numcache >= desiredvnodes * ncsizefactor)
return;
flag = 0;
if (cnp->cn_nameptr[0] == '.') {
if (cnp->cn_namelen == 1)
return;
if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
CACHE_WLOCK();
/*
* If a dotdot entry already exists, just retarget it
* to the new parent vnode; otherwise continue with a new
* namecache entry allocation.
*/
if ((ncp = dvp->v_cache_dd) != NULL &&
ncp->nc_flag & NCF_ISDOTDOT) {
KASSERT(ncp->nc_dvp == dvp,
("wrong isdotdot parent"));
if (ncp->nc_vp != NULL)
TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
ncp, nc_dst);
else
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
if (vp != NULL)
TAILQ_INSERT_HEAD(&vp->v_cache_dst,
ncp, nc_dst);
else
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
ncp->nc_vp = vp;
CACHE_WUNLOCK();
return;
}
dvp->v_cache_dd = NULL;
SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
0, 0);
CACHE_WUNLOCK();
flag = NCF_ISDOTDOT;
}
}
hold = 0;
zap = 0;
/*
* Calculate the hash key and set up as much of the new
* namecache entry as possible before acquiring the lock.
*/
ncp = cache_alloc(cnp->cn_namelen);
ncp->nc_vp = vp;
ncp->nc_dvp = dvp;
ncp->nc_flag = flag;
len = ncp->nc_nlen = cnp->cn_namelen;
hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
CACHE_WLOCK();
/*
* See if this vnode or negative entry is already in the cache
* with this name. This can happen with concurrent lookups of
* the same path name.
*/
ncpp = NCHHASH(hash);
LIST_FOREACH(n2, ncpp, nc_hash) {
if (n2->nc_dvp == dvp &&
n2->nc_nlen == cnp->cn_namelen &&
!bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
CACHE_WUNLOCK();
cache_free(ncp);
return;
}
}
if (flag == NCF_ISDOTDOT) {
/*
* See if we are trying to add a .. entry, but some other lookup
* has populated v_cache_dd pointer already.
*/
if (dvp->v_cache_dd != NULL) {
CACHE_WUNLOCK();
cache_free(ncp);
return;
}
KASSERT(vp == NULL || vp->v_type == VDIR,
("wrong vnode type %p", vp));
dvp->v_cache_dd = ncp;
}
numcache++;
if (!vp) {
numneg++;
if (cnp->cn_flags & ISWHITEOUT)
ncp->nc_flag |= NCF_WHITE;
} else if (vp->v_type == VDIR) {
if (flag != NCF_ISDOTDOT) {
if ((n2 = vp->v_cache_dd) != NULL &&
(n2->nc_flag & NCF_ISDOTDOT) != 0)
cache_zap(n2);
vp->v_cache_dd = ncp;
}
} else {
vp->v_cache_dd = NULL;
}
/*
* Insert the new namecache entry into the appropriate chain
* within the cache entries table.
*/
LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
if (flag != NCF_ISDOTDOT) {
if (LIST_EMPTY(&dvp->v_cache_src)) {
hold = 1;
numcachehv++;
}
LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
}
/*
* If the entry is "negative", we place it into the
* "negative" cache queue, otherwise, we place it into the
* destination vnode's cache entries queue.
*/
if (vp) {
TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
SDT_PROBE(vfs, namecache, enter, done, dvp, ncp->nc_name, vp,
0, 0);
} else {
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
ncp->nc_name, 0, 0, 0);
}
if (numneg * ncnegfactor > numcache) {
ncp = TAILQ_FIRST(&ncneg);
zap = 1;
}
if (hold)
vhold(dvp);
if (zap)
cache_zap(ncp);
CACHE_WUNLOCK();
}
/*
* Name cache initialization, from vfs_init() when we are booting
*/
static void
nchinit(void *dummy __unused)
{
TAILQ_INIT(&ncneg);
cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
/*
* Invalidate all entries to a particular vnode.
*/
void
cache_purge(vp)
struct vnode *vp;
{
CTR1(KTR_VFS, "cache_purge(%p)", vp);
SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
CACHE_WLOCK();
while (!LIST_EMPTY(&vp->v_cache_src))
cache_zap(LIST_FIRST(&vp->v_cache_src));
while (!TAILQ_EMPTY(&vp->v_cache_dst))
cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
if (vp->v_cache_dd != NULL) {
KASSERT(vp->v_cache_dd->nc_flag & NCF_ISDOTDOT,
("lost dotdot link"));
cache_zap(vp->v_cache_dd);
}
KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
CACHE_WUNLOCK();
}
/*
* Invalidate all negative entries for a particular directory vnode.
*/
void
cache_purge_negative(vp)
struct vnode *vp;
{
struct namecache *cp, *ncp;
CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
CACHE_WLOCK();
LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
if (cp->nc_vp == NULL)
cache_zap(cp);
}
CACHE_WUNLOCK();
}
/*
* Flush all entries referencing a particular filesystem.
*/
void
cache_purgevfs(mp)
struct mount *mp;
{
struct nchashhead *ncpp;
struct namecache *ncp, *nnp;
/* Scan hash tables for applicable entries */
SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
CACHE_WLOCK();
for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
if (ncp->nc_dvp->v_mount == mp)
cache_zap(ncp);
}
}
CACHE_WUNLOCK();
}
/*
* Perform canonical checks and cache lookup and pass on to filesystem
* through the vop_cachedlookup only if needed.
*/
int
vfs_cache_lookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap;
{
struct vnode *dvp;
int error;
struct vnode **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
struct ucred *cred = cnp->cn_cred;
int flags = cnp->cn_flags;
struct thread *td = cnp->cn_thread;
*vpp = NULL;
dvp = ap->a_dvp;
if (dvp->v_type != VDIR)
return (ENOTDIR);
if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
return (EROFS);
error = VOP_ACCESS(dvp, VEXEC, cred, td);
if (error)
return (error);
error = cache_lookup(dvp, vpp, cnp);
if (error == 0)
return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
if (error == -1)
return (0);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct __getcwd_args {
u_char *buf;
u_int buflen;
};
#endif
/*
* XXX All of these sysctls would probably be more productive dead.
*/
static int disablecwd;
SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
"Disable the getcwd syscall");
/* Implementation of the getcwd syscall. */
int
-__getcwd(td, uap)
+sys___getcwd(td, uap)
struct thread *td;
struct __getcwd_args *uap;
{
return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
}
int
kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
{
char *bp, *tmpbuf;
struct filedesc *fdp;
struct vnode *cdir, *rdir;
int error, vfslocked;
if (disablecwd)
return (ENODEV);
if (buflen < 2)
return (EINVAL);
if (buflen > MAXPATHLEN)
buflen = MAXPATHLEN;
tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
fdp = td->td_proc->p_fd;
FILEDESC_SLOCK(fdp);
cdir = fdp->fd_cdir;
VREF(cdir);
rdir = fdp->fd_rdir;
VREF(rdir);
FILEDESC_SUNLOCK(fdp);
error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
VFS_UNLOCK_GIANT(vfslocked);
vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
vrele(cdir);
VFS_UNLOCK_GIANT(vfslocked);
if (!error) {
if (bufseg == UIO_SYSSPACE)
bcopy(bp, buf, strlen(bp) + 1);
else
error = copyout(bp, buf, strlen(bp) + 1);
#ifdef KTRACE
if (KTRPOINT(curthread, KTR_NAMEI))
ktrnamei(bp);
#endif
}
free(tmpbuf, M_TEMP);
return (error);
}
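A minimal userland sketch of the consumer side: the libc getcwd(3) wrapper typically issues this syscall first and only falls back to walking ".." entries when the name cache cannot produce the path. The buffer size is illustrative.
#include <limits.h>
#include <stdio.h>
#include <unistd.h>
int
main(void)
{
	char path[PATH_MAX];

	/* Fast path: resolved from the name cache via __getcwd(). */
	if (getcwd(path, sizeof(path)) != NULL)
		printf("cwd: %s\n", path);
	return (0);
}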
/*
* Thus begins the fullpath magic.
*/
#undef STATNODE
#define STATNODE(name, descr) \
static u_int name; \
SYSCTL_UINT(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr)
static int disablefullpath;
SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
"Disable the vn_fullpath function");
/* These count for kern___getcwd(), too. */
STATNODE(numfullpathcalls, "Number of fullpath search calls");
STATNODE(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
STATNODE(numfullpathfail2,
"Number of fullpath search errors (VOP_VPTOCNP failures)");
STATNODE(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
STATNODE(numfullpathfound, "Number of successful fullpath calls");
/*
* Retrieve the full filesystem path that corresponds to a vnode from the name
* cache (if available)
*/
int
vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
{
char *buf;
struct filedesc *fdp;
struct vnode *rdir;
int error, vfslocked;
if (disablefullpath)
return (ENODEV);
if (vn == NULL)
return (EINVAL);
buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
fdp = td->td_proc->p_fd;
FILEDESC_SLOCK(fdp);
rdir = fdp->fd_rdir;
VREF(rdir);
FILEDESC_SUNLOCK(fdp);
error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
VFS_UNLOCK_GIANT(vfslocked);
if (!error)
*freebuf = buf;
else
free(buf, M_TEMP);
return (error);
}
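A hedged in-kernel usage sketch of the retbuf/freebuf contract: on success retbuf points into the MAXPATHLEN buffer returned through freebuf, which the caller releases with free(9) and M_TEMP. The printing is illustrative.
static void
report_vnode_path(struct thread *td, struct vnode *vp)
{
	char *fullpath, *freepath;

	freepath = NULL;
	if (vn_fullpath(td, vp, &fullpath, &freepath) == 0)
		printf("vnode %p: %s\n", vp, fullpath);
	if (freepath != NULL)
		free(freepath, M_TEMP);
}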
/*
* This function is similar to vn_fullpath, but it attempts to lookup the
* pathname relative to the global root mount point. This is required for the
* auditing sub-system, as audited pathnames must be absolute, relative to the
* global root mount point.
*/
int
vn_fullpath_global(struct thread *td, struct vnode *vn,
char **retbuf, char **freebuf)
{
char *buf;
int error;
if (disablefullpath)
return (ENODEV);
if (vn == NULL)
return (EINVAL);
buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
if (!error)
*freebuf = buf;
else
free(buf, M_TEMP);
return (error);
}
int
vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
{
int error;
CACHE_RLOCK();
error = vn_vptocnp_locked(vp, cred, buf, buflen);
if (error == 0) {
/*
* vn_vptocnp_locked() dropped hold acquired by
* VOP_VPTOCNP immediately after locking the
* cache. Since we are going to drop the cache rlock,
* re-hold the result.
*/
vhold(*vp);
CACHE_RUNLOCK();
}
return (error);
}
static int
vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
u_int *buflen)
{
struct vnode *dvp;
struct namecache *ncp;
int error, vfslocked;
TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
break;
}
if (ncp != NULL) {
if (*buflen < ncp->nc_nlen) {
CACHE_RUNLOCK();
numfullpathfail4++;
error = ENOMEM;
SDT_PROBE(vfs, namecache, fullpath, return, error,
vp, NULL, 0, 0);
return (error);
}
*buflen -= ncp->nc_nlen;
memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
ncp->nc_name, vp, 0, 0);
*vp = ncp->nc_dvp;
return (0);
}
SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
vhold(*vp);
CACHE_RUNLOCK();
vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
vn_lock(*vp, LK_SHARED | LK_RETRY);
error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
VOP_UNLOCK(*vp, 0);
vdrop(*vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error) {
numfullpathfail2++;
SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
NULL, 0, 0);
return (error);
}
*vp = dvp;
CACHE_RLOCK();
if ((*vp)->v_iflag & VI_DOOMED) {
/* forced unmount */
CACHE_RUNLOCK();
vdrop(*vp);
error = ENOENT;
SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
NULL, 0, 0);
return (error);
}
vdrop(*vp);
return (0);
}
/*
* The magic behind kern___getcwd() and vn_fullpath().
*/
static int
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
char *buf, char **retbuf, u_int buflen)
{
int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
struct vnode *startvp = vp;
#endif
buflen--;
buf[buflen] = '\0';
error = 0;
slash_prefixed = 0;
SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
numfullpathcalls++;
CACHE_RLOCK();
if (vp->v_type != VDIR) {
error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
if (error)
return (error);
if (buflen == 0) {
CACHE_RUNLOCK();
return (ENOMEM);
}
buf[--buflen] = '/';
slash_prefixed = 1;
}
while (vp != rdir && vp != rootvnode) {
if (vp->v_vflag & VV_ROOT) {
if (vp->v_iflag & VI_DOOMED) { /* forced unmount */
CACHE_RUNLOCK();
error = ENOENT;
SDT_PROBE(vfs, namecache, fullpath, return,
error, vp, NULL, 0, 0);
break;
}
vp = vp->v_mount->mnt_vnodecovered;
continue;
}
if (vp->v_type != VDIR) {
CACHE_RUNLOCK();
numfullpathfail1++;
error = ENOTDIR;
SDT_PROBE(vfs, namecache, fullpath, return,
error, vp, NULL, 0, 0);
break;
}
error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
if (error)
break;
if (buflen == 0) {
CACHE_RUNLOCK();
error = ENOMEM;
SDT_PROBE(vfs, namecache, fullpath, return, error,
startvp, NULL, 0, 0);
break;
}
buf[--buflen] = '/';
slash_prefixed = 1;
}
if (error)
return (error);
if (!slash_prefixed) {
if (buflen == 0) {
CACHE_RUNLOCK();
numfullpathfail4++;
SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
startvp, NULL, 0, 0);
return (ENOMEM);
}
buf[--buflen] = '/';
}
numfullpathfound++;
CACHE_RUNLOCK();
SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
0, 0);
*retbuf = buf + buflen;
return (0);
}
int
vn_commname(struct vnode *vp, char *buf, u_int buflen)
{
struct namecache *ncp;
int l;
CACHE_RLOCK();
TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
break;
if (ncp == NULL) {
CACHE_RUNLOCK();
return (ENOENT);
}
l = min(ncp->nc_nlen, buflen - 1);
memcpy(buf, ncp->nc_name, l);
CACHE_RUNLOCK();
buf[l] = '\0';
return (0);
}
Index: head/sys/kern/vfs_extattr.c
===================================================================
--- head/sys/kern/vfs_extattr.c (revision 225616)
+++ head/sys/kern/vfs_extattr.c (revision 225617)
@@ -1,795 +1,795 @@
/*-
* Copyright (c) 1999-2001 Robert N. M. Watson
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/fcntl.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/limits.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/extattr.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
/*
* Syscall to push extended attribute configuration information into the VFS.
* Accepts a path, which it converts to a mountpoint, as well as a command
* (int cmd), and attribute name and misc data.
*
* Currently this is used only by UFS1 extended attributes.
*/
int
-extattrctl(td, uap)
+sys_extattrctl(td, uap)
struct thread *td;
struct extattrctl_args /* {
const char *path;
int cmd;
const char *filename;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct vnode *filename_vp;
struct nameidata nd;
struct mount *mp, *mp_writable;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, fnvfslocked, error;
AUDIT_ARG_CMD(uap->cmd);
AUDIT_ARG_VALUE(uap->attrnamespace);
/*
* uap->attrname is not always defined. We check again later when we
* invoke the VFS call so as to pass in NULL there if needed.
*/
if (uap->attrname != NULL) {
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
NULL);
if (error)
return (error);
}
AUDIT_ARG_TEXT(attrname);
vfslocked = fnvfslocked = 0;
mp = NULL;
filename_vp = NULL;
if (uap->filename != NULL) {
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE2,
UIO_USERSPACE, uap->filename, td);
error = namei(&nd);
if (error)
return (error);
fnvfslocked = NDHASGIANT(&nd);
filename_vp = nd.ni_vp;
NDFREE(&nd, NDF_NO_VP_RELE);
}
/* uap->path is always defined. */
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
goto out;
vfslocked = NDHASGIANT(&nd);
mp = nd.ni_vp->v_mount;
error = vfs_busy(mp, 0);
if (error) {
NDFREE(&nd, 0);
mp = NULL;
goto out;
}
VOP_UNLOCK(nd.ni_vp, 0);
error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
NDFREE(&nd, NDF_NO_VP_UNLOCK);
if (error)
goto out;
if (filename_vp != NULL) {
/*
* uap->filename is not always defined. If it is,
* grab a vnode lock, which VFS_EXTATTRCTL() will
* later release.
*/
error = vn_lock(filename_vp, LK_EXCLUSIVE);
if (error) {
vn_finished_write(mp_writable);
goto out;
}
}
error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
uap->attrname != NULL ? attrname : NULL);
vn_finished_write(mp_writable);
out:
if (mp != NULL)
vfs_unbusy(mp);
/*
* VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
* so vrele it if it is defined.
*/
if (filename_vp != NULL)
vrele(filename_vp);
VFS_UNLOCK_GIANT(fnvfslocked);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*-
* Set a named extended attribute on a file or directory
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* kernelspace string pointer "attrname", userspace buffer
* pointer "data", buffer length "nbytes", thread "td".
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
void *data, size_t nbytes, struct thread *td)
{
struct mount *mp;
struct uio auio;
struct iovec aiov;
ssize_t cnt;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
aiov.iov_base = data;
aiov.iov_len = nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
if (nbytes > INT_MAX) {
error = EINVAL;
goto done;
}
auio.uio_resid = nbytes;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
cnt = nbytes;
#ifdef MAC
error = mac_vnode_check_setextattr(td->td_ucred, vp, attrnamespace,
attrname);
if (error)
goto done;
#endif
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
td->td_ucred, td);
cnt -= auio.uio_resid;
td->td_retval[0] = cnt;
done:
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
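For reference, a minimal userland sketch of the extattr_set_file(2) entry point that reaches this helper; the namespace, attribute name, and value are illustrative only.
#include <sys/types.h>
#include <sys/extattr.h>
#include <string.h>
/* Attach a small user-namespace attribute to a file by path. */
static ssize_t
tag_file(const char *path)
{
	const char *value = "backed-up";

	/* Returns the number of bytes written, or -1 with errno set. */
	return (extattr_set_file(path, EXTATTR_NAMESPACE_USER,
	    "backup.state", value, strlen(value)));
}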
int
-extattr_set_fd(td, uap)
+sys_extattr_set_fd(td, uap)
struct thread *td;
struct extattr_set_fd_args /* {
int fd;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_SET, &fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
attrname, uap->data, uap->nbytes, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_set_file(td, uap)
+sys_extattr_set_file(td, uap)
struct thread *td;
struct extattr_set_file_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_set_link(td, uap)
+sys_extattr_set_link(td, uap)
struct thread *td;
struct extattr_set_link_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*-
* Get a named extended attribute on a file or directory
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* kernelspace string pointer "attrname", userspace buffer
* pointer "data", buffer length "nbytes", thread "td".
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
void *data, size_t nbytes, struct thread *td)
{
struct uio auio, *auiop;
struct iovec aiov;
ssize_t cnt;
size_t size, *sizep;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
/*
* Slightly unusual semantics: if the user provides a NULL data
* pointer, they don't want to receive the data, just the maximum
* read length.
*/
auiop = NULL;
sizep = NULL;
cnt = 0;
if (data != NULL) {
aiov.iov_base = data;
aiov.iov_len = nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
if (nbytes > INT_MAX) {
error = EINVAL;
goto done;
}
auio.uio_resid = nbytes;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auiop = &auio;
cnt = nbytes;
} else
sizep = &size;
#ifdef MAC
error = mac_vnode_check_getextattr(td->td_ucred, vp, attrnamespace,
attrname);
if (error)
goto done;
#endif
error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
td->td_ucred, td);
if (auiop != NULL) {
cnt -= auio.uio_resid;
td->td_retval[0] = cnt;
} else
td->td_retval[0] = size;
done:
VOP_UNLOCK(vp, 0);
return (error);
}
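A hedged userland sketch of the size-probe semantics noted above: passing a NULL data pointer to extattr_get_file(2) yields only the attribute's length, after which the caller can allocate and fetch the value. The helper name is illustrative.
#include <sys/types.h>
#include <sys/extattr.h>
#include <stdlib.h>
/* Probe the attribute's size with a NULL buffer, then fetch it. */
static void *
read_attr(const char *path, const char *name, ssize_t *lenp)
{
	ssize_t len;
	void *buf;

	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, NULL, 0);
	if (len < 0)
		return (NULL);
	buf = malloc(len);
	if (buf == NULL)
		return (NULL);
	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, name, buf, len);
	if (len < 0) {
		free(buf);
		return (NULL);
	}
	*lenp = len;
	return (buf);
}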
int
-extattr_get_fd(td, uap)
+sys_extattr_get_fd(td, uap)
struct thread *td;
struct extattr_get_fd_args /* {
int fd;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_GET, &fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
attrname, uap->data, uap->nbytes, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_get_file(td, uap)
+sys_extattr_get_file(td, uap)
struct thread *td;
struct extattr_get_file_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_get_link(td, uap)
+sys_extattr_get_link(td, uap)
struct thread *td;
struct extattr_get_link_args /* {
const char *path;
int attrnamespace;
const char *attrname;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* extattr_delete_vp(): Delete a named extended attribute on a file or
* directory
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* kernelspace string pointer "attrname", proc "p"
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
struct thread *td)
{
struct mount *mp;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_deleteextattr(td->td_ucred, vp, attrnamespace,
attrname);
if (error)
goto done;
#endif
error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
td);
if (error == EOPNOTSUPP)
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
td->td_ucred, td);
#ifdef MAC
done:
#endif
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
int
-extattr_delete_fd(td, uap)
+sys_extattr_delete_fd(td, uap)
struct thread *td;
struct extattr_delete_fd_args /* {
int fd;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_DELETE,
&fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
attrname, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_delete_file(td, uap)
+sys_extattr_delete_file(td, uap)
struct thread *td;
struct extattr_delete_file_args /* {
const char *path;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_delete_link(td, uap)
+sys_extattr_delete_link(td, uap)
struct thread *td;
struct extattr_delete_link_args /* {
const char *path;
int attrnamespace;
const char *attrname;
} */ *uap;
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
if (error)
return (error);
AUDIT_ARG_TEXT(attrname);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*-
* Retrieve a list of extended attributes on a file or directory.
*
* Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
* userspace buffer pointer "data", buffer length "nbytes",
* thread "td".
* Returns: 0 on success, an error number otherwise
* Locks: none
* References: vp must be a valid reference for the duration of the call
*/
static int
extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
size_t nbytes, struct thread *td)
{
struct uio auio, *auiop;
size_t size, *sizep;
struct iovec aiov;
ssize_t cnt;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
auiop = NULL;
sizep = NULL;
cnt = 0;
if (data != NULL) {
aiov.iov_base = data;
aiov.iov_len = nbytes;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
if (nbytes > INT_MAX) {
error = EINVAL;
goto done;
}
auio.uio_resid = nbytes;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auiop = &auio;
cnt = nbytes;
} else
sizep = &size;
#ifdef MAC
error = mac_vnode_check_listextattr(td->td_ucred, vp, attrnamespace);
if (error)
goto done;
#endif
error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
td->td_ucred, td);
if (auiop != NULL) {
cnt -= auio.uio_resid;
td->td_retval[0] = cnt;
} else
td->td_retval[0] = size;
done:
VOP_UNLOCK(vp, 0);
return (error);
}
int
-extattr_list_fd(td, uap)
+sys_extattr_list_fd(td, uap)
struct thread *td;
struct extattr_list_fd_args /* {
int fd;
int attrnamespace;
void *data;
size_t nbytes;
} */ *uap;
{
struct file *fp;
int vfslocked, error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_LIST, &fp);
if (error)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
uap->nbytes, td);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_list_file(td, uap)
+sys_extattr_list_file(td, uap)
struct thread *td;
struct extattr_list_file_args /* {
const char *path;
int attrnamespace;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
-extattr_list_link(td, uap)
+sys_extattr_list_link(td, uap)
struct thread *td;
struct extattr_list_link_args /* {
const char *path;
int attrnamespace;
void *data;
size_t nbytes;
} */ *uap;
{
struct nameidata nd;
int vfslocked, error;
AUDIT_ARG_VALUE(uap->attrnamespace);
NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
uap->nbytes, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
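/*
 * Userland sketch (illustration only, not part of this change): the
 * renamed handlers above keep the existing extattr(2) interface from
 * <sys/extattr.h>.  Passing a NULL "data" pointer exercises the "sizep"
 * path in extattr_get_vp()/extattr_list_vp() and returns only the size
 * needed.  The path and attribute name below are placeholders.
 */
#include <sys/types.h>
#include <sys/extattr.h>
#include <err.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	ssize_t nbytes;
	char *buf;

	/* First call: NULL buffer asks only for the attribute's size. */
	nbytes = extattr_get_file("/tmp/example", EXTATTR_NAMESPACE_USER,
	    "comment", NULL, 0);
	if (nbytes < 0)
		err(1, "extattr_get_file(size)");
	if ((buf = malloc(nbytes)) == NULL)
		err(1, "malloc");
	/* Second call: fetch the value into the sized buffer. */
	nbytes = extattr_get_file("/tmp/example", EXTATTR_NAMESPACE_USER,
	    "comment", buf, nbytes);
	if (nbytes < 0)
		err(1, "extattr_get_file");
	printf("user.comment is %zd bytes\n", nbytes);
	free(buf);
	return (0);
}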
Index: head/sys/kern/vfs_mount.c
===================================================================
--- head/sys/kern/vfs_mount.c (revision 225616)
+++ head/sys/kern/vfs_mount.c (revision 225617)
@@ -1,1958 +1,1958 @@
/*-
* Copyright (c) 1999-2004 Poul-Henning Kamp
* Copyright (c) 1999 Michael Smith
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/libkern.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/reboot.h>
#include <sys/sbuf.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <vm/uma.h>
#include <geom/geom.h>
#include <machine/stdarg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#define VFS_MOUNTARG_SIZE_MAX (1024 * 64)
static int vfs_domount(struct thread *td, const char *fstype,
char *fspath, int fsflags, struct vfsoptlist **optlist);
static void free_mntarg(struct mntarg *ma);
static int usermount = 0;
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
"Unprivileged users may mount and unmount file systems");
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
static uma_zone_t mount_zone;
/* List of mounted filesystems. */
struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
/* For any iteration/modification of mountlist */
struct mtx mountlist_mtx;
MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
/*
* Global opts, taken by all filesystems
*/
static const char *global_opts[] = {
"errmsg",
"fstype",
"fspath",
"ro",
"rw",
"nosuid",
"noexec",
NULL
};
static int
mount_init(void *mem, int size, int flags)
{
struct mount *mp;
mp = (struct mount *)mem;
mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
return (0);
}
static void
mount_fini(void *mem, int size)
{
struct mount *mp;
mp = (struct mount *)mem;
lockdestroy(&mp->mnt_explock);
mtx_destroy(&mp->mnt_mtx);
}
static void
vfs_mount_init(void *dummy __unused)
{
mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount), NULL,
NULL, mount_init, mount_fini, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
SYSINIT(vfs_mount, SI_SUB_VFS, SI_ORDER_ANY, vfs_mount_init, NULL);
/*
* ---------------------------------------------------------------------
* Functions for building and sanitizing the mount options
*/
/* Remove one mount option. */
static void
vfs_freeopt(struct vfsoptlist *opts, struct vfsopt *opt)
{
TAILQ_REMOVE(opts, opt, link);
free(opt->name, M_MOUNT);
if (opt->value != NULL)
free(opt->value, M_MOUNT);
free(opt, M_MOUNT);
}
/* Release all resources related to the mount options. */
void
vfs_freeopts(struct vfsoptlist *opts)
{
struct vfsopt *opt;
while (!TAILQ_EMPTY(opts)) {
opt = TAILQ_FIRST(opts);
vfs_freeopt(opts, opt);
}
free(opts, M_MOUNT);
}
void
vfs_deleteopt(struct vfsoptlist *opts, const char *name)
{
struct vfsopt *opt, *temp;
if (opts == NULL)
return;
TAILQ_FOREACH_SAFE(opt, opts, link, temp) {
if (strcmp(opt->name, name) == 0)
vfs_freeopt(opts, opt);
}
}
static int
vfs_isopt_ro(const char *opt)
{
if (strcmp(opt, "ro") == 0 || strcmp(opt, "rdonly") == 0 ||
strcmp(opt, "norw") == 0)
return (1);
return (0);
}
static int
vfs_isopt_rw(const char *opt)
{
if (strcmp(opt, "rw") == 0 || strcmp(opt, "noro") == 0)
return (1);
return (0);
}
/*
* Check if options are equal (with or without the "no" prefix).
*/
static int
vfs_equalopts(const char *opt1, const char *opt2)
{
char *p;
/* "opt" vs. "opt" or "noopt" vs. "noopt" */
if (strcmp(opt1, opt2) == 0)
return (1);
/* "noopt" vs. "opt" */
if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
return (1);
/* "opt" vs. "noopt" */
if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
return (1);
while ((p = strchr(opt1, '.')) != NULL &&
!strncmp(opt1, opt2, ++p - opt1)) {
opt2 += p - opt1;
opt1 = p;
/* "foo.noopt" vs. "foo.opt" */
if (strncmp(opt1, "no", 2) == 0 && strcmp(opt1 + 2, opt2) == 0)
return (1);
/* "foo.opt" vs. "foo.noopt" */
if (strncmp(opt2, "no", 2) == 0 && strcmp(opt1, opt2 + 2) == 0)
return (1);
}
/* "ro" / "rdonly" / "norw" / "rw" / "noro" */
if ((vfs_isopt_ro(opt1) || vfs_isopt_rw(opt1)) &&
(vfs_isopt_ro(opt2) || vfs_isopt_rw(opt2)))
return (1);
return (0);
}
/*
* If a mount option is specified several times
* (with or without the "no" prefix), only keep
* the last occurrence of it.
*/
static void
vfs_sanitizeopts(struct vfsoptlist *opts)
{
struct vfsopt *opt, *opt2, *tmp;
TAILQ_FOREACH_REVERSE(opt, opts, vfsoptlist, link) {
opt2 = TAILQ_PREV(opt, vfsoptlist, link);
while (opt2 != NULL) {
if (vfs_equalopts(opt->name, opt2->name)) {
tmp = TAILQ_PREV(opt2, vfsoptlist, link);
vfs_freeopt(opts, opt2);
opt2 = tmp;
} else {
opt2 = TAILQ_PREV(opt2, vfsoptlist, link);
}
}
}
}
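/*
 * Concrete pairs, for illustration of the two routines above: "noatime"
 * and "atime", "foo.noexec" and "foo.exec", and any mix of "ro"/"rdonly"/
 * "norw" with "rw"/"noro" all compare equal in vfs_equalopts(), so an
 * option list built from "ro,noatime,rw,atime" is sanitized down to the
 * last occurrence of each group, i.e. "rw,atime".
 */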
/*
* Build a linked list of mount options from a struct uio.
*/
int
vfs_buildopts(struct uio *auio, struct vfsoptlist **options)
{
struct vfsoptlist *opts;
struct vfsopt *opt;
size_t memused, namelen, optlen;
unsigned int i, iovcnt;
int error;
opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
TAILQ_INIT(opts);
memused = 0;
iovcnt = auio->uio_iovcnt;
for (i = 0; i < iovcnt; i += 2) {
namelen = auio->uio_iov[i].iov_len;
optlen = auio->uio_iov[i + 1].iov_len;
memused += sizeof(struct vfsopt) + optlen + namelen;
/*
* Avoid consuming too much memory, and guard against attempts
* to overflow memused.
*/
if (memused > VFS_MOUNTARG_SIZE_MAX ||
optlen > VFS_MOUNTARG_SIZE_MAX ||
namelen > VFS_MOUNTARG_SIZE_MAX) {
error = EINVAL;
goto bad;
}
opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
opt->name = malloc(namelen, M_MOUNT, M_WAITOK);
opt->value = NULL;
opt->len = 0;
opt->pos = i / 2;
opt->seen = 0;
/*
* Do this early, so jumps to "bad" will free the current
* option.
*/
TAILQ_INSERT_TAIL(opts, opt, link);
if (auio->uio_segflg == UIO_SYSSPACE) {
bcopy(auio->uio_iov[i].iov_base, opt->name, namelen);
} else {
error = copyin(auio->uio_iov[i].iov_base, opt->name,
namelen);
if (error)
goto bad;
}
/* Ensure names are null-terminated strings. */
if (namelen == 0 || opt->name[namelen - 1] != '\0') {
error = EINVAL;
goto bad;
}
if (optlen != 0) {
opt->len = optlen;
opt->value = malloc(optlen, M_MOUNT, M_WAITOK);
if (auio->uio_segflg == UIO_SYSSPACE) {
bcopy(auio->uio_iov[i + 1].iov_base, opt->value,
optlen);
} else {
error = copyin(auio->uio_iov[i + 1].iov_base,
opt->value, optlen);
if (error)
goto bad;
}
}
}
vfs_sanitizeopts(opts);
*options = opts;
return (0);
bad:
vfs_freeopts(opts);
return (error);
}
/*
* Merge the old mount options with the new ones passed
* in the MNT_UPDATE case.
*
* XXX: This function will keep a "nofoo" option in the new
* options. E.g, if the option's canonical name is "foo",
* "nofoo" ends up in the mount point's active options.
*/
static void
vfs_mergeopts(struct vfsoptlist *toopts, struct vfsoptlist *oldopts)
{
struct vfsopt *opt, *new;
TAILQ_FOREACH(opt, oldopts, link) {
new = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
new->name = strdup(opt->name, M_MOUNT);
if (opt->len != 0) {
new->value = malloc(opt->len, M_MOUNT, M_WAITOK);
bcopy(opt->value, new->value, opt->len);
} else
new->value = NULL;
new->len = opt->len;
new->seen = opt->seen;
TAILQ_INSERT_HEAD(toopts, new, link);
}
vfs_sanitizeopts(toopts);
}
/*
* Mount a filesystem.
*/
int
-nmount(td, uap)
+sys_nmount(td, uap)
struct thread *td;
struct nmount_args /* {
struct iovec *iovp;
unsigned int iovcnt;
int flags;
} */ *uap;
{
struct uio *auio;
int error;
u_int iovcnt;
AUDIT_ARG_FFLAGS(uap->flags);
CTR4(KTR_VFS, "%s: iovp %p with iovcnt %d and flags %d", __func__,
uap->iovp, uap->iovcnt, uap->flags);
/*
* Filter out MNT_ROOTFS. We do not want clients of nmount() in
* userspace to set this flag, but we must filter it out if we want
* MNT_UPDATE on the root file system to work.
* MNT_ROOTFS should only be set by the kernel when mounting its
* root file system.
*/
uap->flags &= ~MNT_ROOTFS;
iovcnt = uap->iovcnt;
/*
* Check that we have an even number of iovecs
* and that we have at least two options.
*/
if ((iovcnt & 1) || (iovcnt < 4)) {
CTR2(KTR_VFS, "%s: failed for invalid iovcnt %d", __func__,
uap->iovcnt);
return (EINVAL);
}
error = copyinuio(uap->iovp, iovcnt, &auio);
if (error) {
CTR2(KTR_VFS, "%s: failed for invalid uio op with %d errno",
__func__, error);
return (error);
}
error = vfs_donmount(td, uap->flags, auio);
free(auio, M_IOV);
return (error);
}
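/*
 * Userland sketch (illustration only, not part of this change): nmount(2)
 * consumes name/value iovec pairs, which is why the handler above insists
 * on an even iovcnt of at least four ("fstype" and "fspath" plus their
 * values).  The filesystem type, mount point and lower layer below are
 * placeholders.
 */
#include <sys/param.h>
#include <sys/uio.h>
#include <sys/mount.h>
#include <err.h>
#include <string.h>

static void
add_opt(struct iovec *iov, int *i, const char *name, const char *val)
{
	/* Names and values are NUL-terminated strings, NUL included. */
	iov[*i].iov_base = __DECONST(char *, name);
	iov[*i].iov_len = strlen(name) + 1;
	(*i)++;
	iov[*i].iov_base = __DECONST(char *, val);
	iov[*i].iov_len = strlen(val) + 1;
	(*i)++;
}

int
main(void)
{
	struct iovec iov[6];
	int i = 0;

	add_opt(iov, &i, "fstype", "nullfs");
	add_opt(iov, &i, "fspath", "/mnt");
	add_opt(iov, &i, "target", "/usr/src");
	if (nmount(iov, i, MNT_RDONLY) == -1)
		err(1, "nmount");
	return (0);
}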
/*
* ---------------------------------------------------------------------
* Various utility functions
*/
void
vfs_ref(struct mount *mp)
{
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
MNT_ILOCK(mp);
MNT_REF(mp);
MNT_IUNLOCK(mp);
}
void
vfs_rel(struct mount *mp)
{
CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
MNT_ILOCK(mp);
MNT_REL(mp);
MNT_IUNLOCK(mp);
}
/*
* Allocate and initialize the mount point struct.
*/
struct mount *
vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
struct ucred *cred)
{
struct mount *mp;
mp = uma_zalloc(mount_zone, M_WAITOK);
bzero(&mp->mnt_startzero,
__rangeof(struct mount, mnt_startzero, mnt_endzero));
TAILQ_INIT(&mp->mnt_nvnodelist);
mp->mnt_nvnodelistsize = 0;
mp->mnt_ref = 0;
(void) vfs_busy(mp, MBF_NOWAIT);
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_vfc = vfsp;
vfsp->vfc_refcount++; /* XXX Unlocked */
mp->mnt_stat.f_type = vfsp->vfc_typenum;
mp->mnt_gen++;
strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
mp->mnt_vnodecovered = vp;
mp->mnt_cred = crdup(cred);
mp->mnt_stat.f_owner = cred->cr_uid;
strlcpy(mp->mnt_stat.f_mntonname, fspath, MNAMELEN);
mp->mnt_iosize_max = DFLTPHYS;
#ifdef MAC
mac_mount_init(mp);
mac_mount_create(cred, mp);
#endif
arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
return (mp);
}
/*
* Destroy the mount struct previously allocated by vfs_mount_alloc().
*/
void
vfs_mount_destroy(struct mount *mp)
{
MNT_ILOCK(mp);
mp->mnt_kern_flag |= MNTK_REFEXPIRE;
if (mp->mnt_kern_flag & MNTK_MWAIT) {
mp->mnt_kern_flag &= ~MNTK_MWAIT;
wakeup(mp);
}
while (mp->mnt_ref)
msleep(mp, MNT_MTX(mp), PVFS, "mntref", 0);
KASSERT(mp->mnt_ref == 0,
("%s: invalid refcount in the drain path @ %s:%d", __func__,
__FILE__, __LINE__));
if (mp->mnt_writeopcount != 0)
panic("vfs_mount_destroy: nonzero writeopcount");
if (mp->mnt_secondary_writes != 0)
panic("vfs_mount_destroy: nonzero secondary_writes");
mp->mnt_vfc->vfc_refcount--;
if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
struct vnode *vp;
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
vprint("", vp);
panic("unmount: dangling vnode");
}
if (mp->mnt_nvnodelistsize != 0)
panic("vfs_mount_destroy: nonzero nvnodelistsize");
if (mp->mnt_lockref != 0)
panic("vfs_mount_destroy: nonzero lock refcount");
MNT_IUNLOCK(mp);
#ifdef MAC
mac_mount_destroy(mp);
#endif
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
crfree(mp->mnt_cred);
uma_zfree(mount_zone, mp);
}
int
vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
{
struct vfsoptlist *optlist;
struct vfsopt *opt, *tmp_opt;
char *fstype, *fspath, *errmsg;
int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
errmsg = fspath = NULL;
errmsg_len = fspathlen = 0;
errmsg_pos = -1;
error = vfs_buildopts(fsoptions, &optlist);
if (error)
return (error);
if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
/*
* We need these two options before the others,
* and they are mandatory for any filesystem.
* Ensure they are NUL terminated as well.
*/
fstypelen = 0;
error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
if (error || fstype[fstypelen - 1] != '\0') {
error = EINVAL;
if (errmsg != NULL)
strncpy(errmsg, "Invalid fstype", errmsg_len);
goto bail;
}
fspathlen = 0;
error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
if (error || fspath[fspathlen - 1] != '\0') {
error = EINVAL;
if (errmsg != NULL)
strncpy(errmsg, "Invalid fspath", errmsg_len);
goto bail;
}
/*
* We need to see if we have the "update" option
* before we call vfs_domount(), since vfs_domount() has special
* logic based on MNT_UPDATE. This is very important
* when we want to update the root filesystem.
*/
TAILQ_FOREACH_SAFE(opt, optlist, link, tmp_opt) {
if (strcmp(opt->name, "update") == 0) {
fsflags |= MNT_UPDATE;
vfs_freeopt(optlist, opt);
}
else if (strcmp(opt->name, "async") == 0)
fsflags |= MNT_ASYNC;
else if (strcmp(opt->name, "force") == 0) {
fsflags |= MNT_FORCE;
vfs_freeopt(optlist, opt);
}
else if (strcmp(opt->name, "reload") == 0) {
fsflags |= MNT_RELOAD;
vfs_freeopt(optlist, opt);
}
else if (strcmp(opt->name, "multilabel") == 0)
fsflags |= MNT_MULTILABEL;
else if (strcmp(opt->name, "noasync") == 0)
fsflags &= ~MNT_ASYNC;
else if (strcmp(opt->name, "noatime") == 0)
fsflags |= MNT_NOATIME;
else if (strcmp(opt->name, "atime") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoatime", M_MOUNT);
}
else if (strcmp(opt->name, "noclusterr") == 0)
fsflags |= MNT_NOCLUSTERR;
else if (strcmp(opt->name, "clusterr") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoclusterr", M_MOUNT);
}
else if (strcmp(opt->name, "noclusterw") == 0)
fsflags |= MNT_NOCLUSTERW;
else if (strcmp(opt->name, "clusterw") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoclusterw", M_MOUNT);
}
else if (strcmp(opt->name, "noexec") == 0)
fsflags |= MNT_NOEXEC;
else if (strcmp(opt->name, "exec") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonoexec", M_MOUNT);
}
else if (strcmp(opt->name, "nosuid") == 0)
fsflags |= MNT_NOSUID;
else if (strcmp(opt->name, "suid") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonosuid", M_MOUNT);
}
else if (strcmp(opt->name, "nosymfollow") == 0)
fsflags |= MNT_NOSYMFOLLOW;
else if (strcmp(opt->name, "symfollow") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("nonosymfollow", M_MOUNT);
}
else if (strcmp(opt->name, "noro") == 0)
fsflags &= ~MNT_RDONLY;
else if (strcmp(opt->name, "rw") == 0)
fsflags &= ~MNT_RDONLY;
else if (strcmp(opt->name, "ro") == 0)
fsflags |= MNT_RDONLY;
else if (strcmp(opt->name, "rdonly") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("ro", M_MOUNT);
fsflags |= MNT_RDONLY;
}
else if (strcmp(opt->name, "suiddir") == 0)
fsflags |= MNT_SUIDDIR;
else if (strcmp(opt->name, "sync") == 0)
fsflags |= MNT_SYNCHRONOUS;
else if (strcmp(opt->name, "union") == 0)
fsflags |= MNT_UNION;
}
/*
* Be ultra-paranoid about making sure the type and fspath
* variables will fit in our mp buffers, including the
* terminating NUL.
*/
if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
error = ENAMETOOLONG;
goto bail;
}
error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
bail:
/* copyout the errmsg */
if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
&& errmsg_len > 0 && errmsg != NULL) {
if (fsoptions->uio_segflg == UIO_SYSSPACE) {
bcopy(errmsg,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
} else {
copyout(errmsg,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
}
}
if (optlist != NULL)
vfs_freeopts(optlist);
return (error);
}
/*
* Old mount API.
*/
#ifndef _SYS_SYSPROTO_H_
struct mount_args {
char *type;
char *path;
int flags;
caddr_t data;
};
#endif
/* ARGSUSED */
int
-mount(td, uap)
+sys_mount(td, uap)
struct thread *td;
struct mount_args /* {
char *type;
char *path;
int flags;
caddr_t data;
} */ *uap;
{
char *fstype;
struct vfsconf *vfsp = NULL;
struct mntarg *ma = NULL;
int error;
AUDIT_ARG_FFLAGS(uap->flags);
/*
* Filter out MNT_ROOTFS. We do not want clients of mount() in
* userspace to set this flag, but we must filter it out if we want
* MNT_UPDATE on the root file system to work.
* MNT_ROOTFS should only be set by the kernel when mounting its
* root file system.
*/
uap->flags &= ~MNT_ROOTFS;
fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
if (error) {
free(fstype, M_TEMP);
return (error);
}
AUDIT_ARG_TEXT(fstype);
mtx_lock(&Giant);
vfsp = vfs_byname_kld(fstype, td, &error);
free(fstype, M_TEMP);
if (vfsp == NULL) {
mtx_unlock(&Giant);
return (ENOENT);
}
if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
mtx_unlock(&Giant);
return (EOPNOTSUPP);
}
ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
ma = mount_argb(ma, uap->flags & MNT_RDONLY, "noro");
ma = mount_argb(ma, !(uap->flags & MNT_NOSUID), "nosuid");
ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags);
mtx_unlock(&Giant);
return (error);
}
/*
* vfs_domount_first(): first file system mount (not update)
*/
static int
vfs_domount_first(
struct thread *td, /* Calling thread. */
struct vfsconf *vfsp, /* File system type. */
char *fspath, /* Mount path. */
struct vnode *vp, /* Vnode to be covered. */
int fsflags, /* Flags common to all filesystems. */
struct vfsoptlist **optlist /* Options local to the filesystem. */
)
{
struct vattr va;
struct mount *mp;
struct vnode *newdp;
int error;
mtx_assert(&Giant, MA_OWNED);
ASSERT_VOP_ELOCKED(vp, __func__);
KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
/*
* If the user is not root, ensure that they own the directory
* onto which we are attempting to mount.
*/
error = VOP_GETATTR(vp, &va, td->td_ucred);
if (error == 0 && va.va_uid != td->td_ucred->cr_uid)
error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN, 0);
if (error == 0)
error = vinvalbuf(vp, V_SAVE, 0, 0);
if (error == 0 && vp->v_type != VDIR)
error = ENOTDIR;
if (error == 0) {
VI_LOCK(vp);
if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL)
vp->v_iflag |= VI_MOUNT;
else
error = EBUSY;
VI_UNLOCK(vp);
}
if (error != 0) {
vput(vp);
return (error);
}
VOP_UNLOCK(vp, 0);
/* Allocate and initialize the filesystem. */
mp = vfs_mount_alloc(vp, vfsp, fspath, td->td_ucred);
/* XXXMAC: pass to vfs_mount_alloc? */
mp->mnt_optnew = *optlist;
/* Set the mount level flags. */
mp->mnt_flag = (fsflags & (MNT_UPDATEMASK | MNT_ROOTFS | MNT_RDONLY));
/*
* Mount the filesystem.
* XXX The final recipients of VFS_MOUNT just overwrite the ndp they
* get. No freeing of cn_pnbuf.
*/
error = VFS_MOUNT(mp);
if (error != 0) {
vfs_unbusy(mp);
vfs_mount_destroy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
vrele(vp);
return (error);
}
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
mp->mnt_opt = mp->mnt_optnew;
*optlist = NULL;
(void)VFS_STATFS(mp, &mp->mnt_stat);
/*
* Prevent external consumers of mount options from reading mnt_optnew.
*/
mp->mnt_optnew = NULL;
MNT_ILOCK(mp);
if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
else
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
cache_purge(vp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
vp->v_mountedhere = mp;
/* Place the new filesystem at the end of the mount list. */
mtx_lock(&mountlist_mtx);
TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
mtx_unlock(&mountlist_mtx);
vfs_event_signal(NULL, VQ_MOUNT, 0);
if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
panic("mount: lost mount");
VOP_UNLOCK(newdp, 0);
VOP_UNLOCK(vp, 0);
mountcheckdirs(vp, newdp);
vrele(newdp);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
vfs_unbusy(mp);
return (0);
}
/*
* vfs_domount_update(): update of mounted file system
*/
static int
vfs_domount_update(
struct thread *td, /* Calling thread. */
struct vnode *vp, /* Mount point vnode. */
int fsflags, /* Flags common to all filesystems. */
struct vfsoptlist **optlist /* Options local to the filesystem. */
)
{
struct oexport_args oexport;
struct export_args export;
struct mount *mp;
int error, export_error, flag;
mtx_assert(&Giant, MA_OWNED);
ASSERT_VOP_ELOCKED(vp, __func__);
KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
if ((vp->v_vflag & VV_ROOT) == 0) {
vput(vp);
return (EINVAL);
}
mp = vp->v_mount;
/*
* We only allow the filesystem to be reloaded if it
* is currently mounted read-only.
*/
flag = mp->mnt_flag;
if ((fsflags & MNT_RELOAD) != 0 && (flag & MNT_RDONLY) == 0) {
vput(vp);
return (EOPNOTSUPP); /* Needs translation */
}
/*
* Only privileged root, or (if MNT_USER is set) the user that
* did the original mount is permitted to update it.
*/
error = vfs_suser(mp, td);
if (error != 0) {
vput(vp);
return (error);
}
if (vfs_busy(mp, MBF_NOWAIT)) {
vput(vp);
return (EBUSY);
}
VI_LOCK(vp);
if ((vp->v_iflag & VI_MOUNT) != 0 || vp->v_mountedhere != NULL) {
VI_UNLOCK(vp);
vfs_unbusy(mp);
vput(vp);
return (EBUSY);
}
vp->v_iflag |= VI_MOUNT;
VI_UNLOCK(vp);
VOP_UNLOCK(vp, 0);
MNT_ILOCK(mp);
mp->mnt_flag &= ~MNT_UPDATEMASK;
mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
if ((mp->mnt_flag & MNT_ASYNC) == 0)
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
mp->mnt_optnew = *optlist;
vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
/*
* Mount the filesystem.
* XXX The final recipients of VFS_MOUNT just overwrite the ndp they
* get. No freeing of cn_pnbuf.
*/
error = VFS_MOUNT(mp);
export_error = 0;
if (error == 0) {
/* Process the export option. */
if (vfs_copyopt(mp->mnt_optnew, "export", &export,
sizeof(export)) == 0) {
export_error = vfs_export(mp, &export);
} else if (vfs_copyopt(mp->mnt_optnew, "export", &oexport,
sizeof(oexport)) == 0) {
export.ex_flags = oexport.ex_flags;
export.ex_root = oexport.ex_root;
export.ex_anon = oexport.ex_anon;
export.ex_addr = oexport.ex_addr;
export.ex_addrlen = oexport.ex_addrlen;
export.ex_mask = oexport.ex_mask;
export.ex_masklen = oexport.ex_masklen;
export.ex_indexfile = oexport.ex_indexfile;
export.ex_numsecflavors = 0;
export_error = vfs_export(mp, &export);
}
}
MNT_ILOCK(mp);
if (error == 0) {
mp->mnt_flag &= ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE |
MNT_SNAPSHOT);
} else {
/*
* If we fail, restore old mount flags. MNT_QUOTA is special,
* because it is not part of MNT_UPDATEMASK, but it could have
* changed in the meantime if quotactl(2) was called.
* All in all we want current value of MNT_QUOTA, not the old
* one.
*/
mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
}
if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
else
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
if (error != 0)
goto end;
if (mp->mnt_opt != NULL)
vfs_freeopts(mp->mnt_opt);
mp->mnt_opt = mp->mnt_optnew;
*optlist = NULL;
(void)VFS_STATFS(mp, &mp->mnt_stat);
/*
* Prevent external consumers of mount options from reading
* mnt_optnew.
*/
mp->mnt_optnew = NULL;
if ((mp->mnt_flag & MNT_RDONLY) == 0)
vfs_allocate_syncvnode(mp);
else
vfs_deallocate_syncvnode(mp);
end:
vfs_unbusy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
VI_UNLOCK(vp);
vrele(vp);
return (error != 0 ? error : export_error);
}
/*
* vfs_domount(): actually attempt a filesystem mount.
*/
static int
vfs_domount(
struct thread *td, /* Calling thread. */
const char *fstype, /* Filesystem type. */
char *fspath, /* Mount path. */
int fsflags, /* Flags common to all filesystems. */
struct vfsoptlist **optlist /* Options local to the filesystem. */
)
{
struct vfsconf *vfsp;
struct nameidata nd;
struct vnode *vp;
int error;
/*
* Be ultra-paranoid about making sure the type and fspath
* variables will fit in our mp buffers, including the
* terminating NUL.
*/
if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
return (ENAMETOOLONG);
if (jailed(td->td_ucred) || usermount == 0) {
if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
return (error);
}
/*
* Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
*/
if (fsflags & MNT_EXPORTED) {
error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
if (error)
return (error);
}
if (fsflags & MNT_SUIDDIR) {
error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
if (error)
return (error);
}
/*
* Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
*/
if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
fsflags |= MNT_NOSUID | MNT_USER;
}
/* Load KLDs before we lock the covered vnode to avoid reversals. */
vfsp = NULL;
if ((fsflags & MNT_UPDATE) == 0) {
/* Don't try to load KLDs if we're mounting the root. */
if (fsflags & MNT_ROOTFS)
vfsp = vfs_byname(fstype);
else
vfsp = vfs_byname_kld(fstype, td, &error);
if (vfsp == NULL)
return (ENODEV);
if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
return (EPERM);
}
/*
* Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
*/
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_SYSSPACE, fspath, td);
error = namei(&nd);
if (error != 0)
return (error);
if (!NDHASGIANT(&nd))
mtx_lock(&Giant);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if ((fsflags & MNT_UPDATE) == 0) {
error = vfs_domount_first(td, vfsp, fspath, vp, fsflags,
optlist);
} else {
error = vfs_domount_update(td, vp, fsflags, optlist);
}
mtx_unlock(&Giant);
ASSERT_VI_UNLOCKED(vp, __func__);
ASSERT_VOP_UNLOCKED(vp, __func__);
return (error);
}
/*
* Unmount a filesystem.
*
* Note: unmount takes a path to the vnode mounted on as argument, not
* the special file (as before).
*/
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
char *path;
int flags;
};
#endif
/* ARGSUSED */
int
-unmount(td, uap)
+sys_unmount(td, uap)
struct thread *td;
register struct unmount_args /* {
char *path;
int flags;
} */ *uap;
{
struct mount *mp;
char *pathbuf;
int error, id0, id1;
AUDIT_ARG_VALUE(uap->flags);
if (jailed(td->td_ucred) || usermount == 0) {
error = priv_check(td, PRIV_VFS_UNMOUNT);
if (error)
return (error);
}
pathbuf = malloc(MNAMELEN, M_TEMP, M_WAITOK);
error = copyinstr(uap->path, pathbuf, MNAMELEN, NULL);
if (error) {
free(pathbuf, M_TEMP);
return (error);
}
mtx_lock(&Giant);
if (uap->flags & MNT_BYFSID) {
AUDIT_ARG_TEXT(pathbuf);
/* Decode the filesystem ID. */
if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
mtx_unlock(&Giant);
free(pathbuf, M_TEMP);
return (EINVAL);
}
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == id0 &&
mp->mnt_stat.f_fsid.val[1] == id1)
break;
}
mtx_unlock(&mountlist_mtx);
} else {
AUDIT_ARG_UPATH1(td, pathbuf);
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
break;
}
mtx_unlock(&mountlist_mtx);
}
free(pathbuf, M_TEMP);
if (mp == NULL) {
/*
* Previously we returned ENOENT for a nonexistent path and
* EINVAL for a non-mountpoint. We cannot tell these apart
* now, so in the !MNT_BYFSID case return the more likely
* EINVAL for compatibility.
*/
mtx_unlock(&Giant);
return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
}
/*
* Don't allow unmounting the root filesystem.
*/
if (mp->mnt_flag & MNT_ROOTFS) {
mtx_unlock(&Giant);
return (EINVAL);
}
error = dounmount(mp, uap->flags, td);
mtx_unlock(&Giant);
return (error);
}
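/*
 * Userland sketch of the MNT_BYFSID path handled above (illustration
 * only, not part of this change): the path argument is the literal
 * string "FSID:val0:val1" built from the filesystem id, e.g. as
 * returned by statfs(2).  The mount point below is a placeholder.
 */
#include <sys/param.h>
#include <sys/mount.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct statfs sfs;
	char fsid[32];

	if (statfs("/mnt", &sfs) == -1)
		err(1, "statfs");
	snprintf(fsid, sizeof(fsid), "FSID:%d:%d",
	    (int)sfs.f_fsid.val[0], (int)sfs.f_fsid.val[1]);
	if (unmount(fsid, MNT_BYFSID) == -1)
		err(1, "unmount");
	return (0);
}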
/*
* Do the actual filesystem unmount.
*/
int
dounmount(mp, flags, td)
struct mount *mp;
int flags;
struct thread *td;
{
struct vnode *coveredvp, *fsrootvp;
int error;
int async_flag;
int mnt_gen_r;
mtx_assert(&Giant, MA_OWNED);
if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
mnt_gen_r = mp->mnt_gen;
VI_LOCK(coveredvp);
vholdl(coveredvp);
vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
vdrop(coveredvp);
/*
* Check for mp being unmounted while waiting for the
* covered vnode lock.
*/
if (coveredvp->v_mountedhere != mp ||
coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
VOP_UNLOCK(coveredvp, 0);
return (EBUSY);
}
}
/*
* Only privileged root, or (if MNT_USER is set) the user that did the
* original mount is permitted to unmount this filesystem.
*/
error = vfs_suser(mp, td);
if (error) {
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (error);
}
MNT_ILOCK(mp);
if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
MNT_IUNLOCK(mp);
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (EBUSY);
}
mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
/* Allow filesystems to detect that a forced unmount is in progress. */
if (flags & MNT_FORCE)
mp->mnt_kern_flag |= MNTK_UNMOUNTF;
error = 0;
if (mp->mnt_lockref) {
if ((flags & MNT_FORCE) == 0) {
mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
MNTK_UNMOUNTF);
if (mp->mnt_kern_flag & MNTK_MWAIT) {
mp->mnt_kern_flag &= ~MNTK_MWAIT;
wakeup(mp);
}
MNT_IUNLOCK(mp);
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (EBUSY);
}
mp->mnt_kern_flag |= MNTK_DRAINING;
error = msleep(&mp->mnt_lockref, MNT_MTX(mp), PVFS,
"mount drain", 0);
}
MNT_IUNLOCK(mp);
KASSERT(mp->mnt_lockref == 0,
("%s: invalid lock refcount in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
KASSERT(error == 0,
("%s: invalid return value for msleep in the drain path @ %s:%d",
__func__, __FILE__, __LINE__));
vn_start_write(NULL, &mp, V_WAIT);
if (mp->mnt_flag & MNT_EXPUBLIC)
vfs_setpublicfs(NULL, NULL, NULL);
vfs_msync(mp, MNT_WAIT);
MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
cache_purgevfs(mp); /* remove cache entries for this file sys */
vfs_deallocate_syncvnode(mp);
/*
* For forced unmounts, move process cdir/rdir refs on the fs root
* vnode to the covered vnode. For non-forced unmounts we want
* such references to cause an EBUSY error.
*/
if ((flags & MNT_FORCE) &&
VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
if (mp->mnt_vnodecovered != NULL)
mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
if (fsrootvp == rootvnode) {
vrele(rootvnode);
rootvnode = NULL;
}
vput(fsrootvp);
}
if (((mp->mnt_flag & MNT_RDONLY) ||
(error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
error = VFS_UNMOUNT(mp, flags);
vn_finished_write(mp);
/*
* If we failed to flush the dirty blocks for this mount point,
* undo all the cdir/rdir and rootvnode changes we made above.
* Unless we failed to do so because the device is reporting that
* it doesn't exist anymore.
*/
if (error && error != ENXIO) {
if ((flags & MNT_FORCE) &&
VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
if (mp->mnt_vnodecovered != NULL)
mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
if (rootvnode == NULL) {
rootvnode = fsrootvp;
vref(rootvnode);
}
vput(fsrootvp);
}
MNT_ILOCK(mp);
mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
if ((mp->mnt_flag & MNT_RDONLY) == 0) {
MNT_IUNLOCK(mp);
vfs_allocate_syncvnode(mp);
MNT_ILOCK(mp);
}
mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
mp->mnt_flag |= async_flag;
if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
if (mp->mnt_kern_flag & MNTK_MWAIT) {
mp->mnt_kern_flag &= ~MNTK_MWAIT;
wakeup(mp);
}
MNT_IUNLOCK(mp);
if (coveredvp)
VOP_UNLOCK(coveredvp, 0);
return (error);
}
mtx_lock(&mountlist_mtx);
TAILQ_REMOVE(&mountlist, mp, mnt_list);
mtx_unlock(&mountlist_mtx);
if (coveredvp != NULL) {
coveredvp->v_mountedhere = NULL;
vput(coveredvp);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
vfs_mount_destroy(mp);
return (0);
}
/*
* Report errors during filesystem mounting.
*/
void
vfs_mount_error(struct mount *mp, const char *fmt, ...)
{
struct vfsoptlist *moptlist = mp->mnt_optnew;
va_list ap;
int error, len;
char *errmsg;
error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
if (error || errmsg == NULL || len <= 0)
return;
va_start(ap, fmt);
vsnprintf(errmsg, (size_t)len, fmt, ap);
va_end(ap);
}
void
vfs_opterror(struct vfsoptlist *opts, const char *fmt, ...)
{
va_list ap;
int error, len;
char *errmsg;
error = vfs_getopt(opts, "errmsg", (void **)&errmsg, &len);
if (error || errmsg == NULL || len <= 0)
return;
va_start(ap, fmt);
vsnprintf(errmsg, (size_t)len, fmt, ap);
va_end(ap);
}
/*
* ---------------------------------------------------------------------
* Functions for querying mount options/arguments from filesystems.
*/
/*
* Check that no unknown options are given
*/
int
vfs_filteropt(struct vfsoptlist *opts, const char **legal)
{
struct vfsopt *opt;
char errmsg[255];
const char **t, *p, *q;
int ret = 0;
TAILQ_FOREACH(opt, opts, link) {
p = opt->name;
q = NULL;
if (p[0] == 'n' && p[1] == 'o')
q = p + 2;
for(t = global_opts; *t != NULL; t++) {
if (strcmp(*t, p) == 0)
break;
if (q != NULL) {
if (strcmp(*t, q) == 0)
break;
}
}
if (*t != NULL)
continue;
for(t = legal; *t != NULL; t++) {
if (strcmp(*t, p) == 0)
break;
if (q != NULL) {
if (strcmp(*t, q) == 0)
break;
}
}
if (*t != NULL)
continue;
snprintf(errmsg, sizeof(errmsg),
"mount option <%s> is unknown", p);
ret = EINVAL;
}
if (ret != 0) {
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(opt->name, "errmsg") == 0) {
strncpy((char *)opt->value, errmsg, opt->len);
break;
}
}
if (opt == NULL)
printf("%s\n", errmsg);
}
return (ret);
}
/*
* Get a mount option by its name.
*
* Return 0 if the option was found, ENOENT otherwise.
* If len is non-NULL it will be filled with the length
* of the option. If buf is non-NULL, it will be filled
* with the address of the option.
*/
int
vfs_getopt(opts, name, buf, len)
struct vfsoptlist *opts;
const char *name;
void **buf;
int *len;
{
struct vfsopt *opt;
KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
if (len != NULL)
*len = opt->len;
if (buf != NULL)
*buf = opt->value;
return (0);
}
}
return (ENOENT);
}
int
vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
{
struct vfsopt *opt;
if (opts == NULL)
return (-1);
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
return (opt->pos);
}
}
return (-1);
}
char *
vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
{
struct vfsopt *opt;
*error = 0;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->len == 0 ||
((char *)opt->value)[opt->len - 1] != '\0') {
*error = EINVAL;
return (NULL);
}
return (opt->value);
}
*error = ENOENT;
return (NULL);
}
int
vfs_flagopt(struct vfsoptlist *opts, const char *name, uint64_t *w,
uint64_t val)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
if (w != NULL)
*w |= val;
return (1);
}
}
if (w != NULL)
*w &= ~val;
return (0);
}
int
vfs_scanopt(struct vfsoptlist *opts, const char *name, const char *fmt, ...)
{
va_list ap;
struct vfsopt *opt;
int ret;
KASSERT(opts != NULL, ("vfs_getopt: caller passed 'opts' as NULL"));
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->len == 0 || opt->value == NULL)
return (0);
if (((char *)opt->value)[opt->len - 1] != '\0')
return (0);
va_start(ap, fmt);
ret = vsscanf(opt->value, fmt, ap);
va_end(ap);
return (ret);
}
return (0);
}
int
vfs_setopt(struct vfsoptlist *opts, const char *name, void *value, int len)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->value == NULL)
opt->len = len;
else {
if (opt->len != len)
return (EINVAL);
bcopy(value, opt->value, len);
}
return (0);
}
return (ENOENT);
}
int
vfs_setopt_part(struct vfsoptlist *opts, const char *name, void *value, int len)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->value == NULL)
opt->len = len;
else {
if (opt->len < len)
return (EINVAL);
opt->len = len;
bcopy(value, opt->value, len);
}
return (0);
}
return (ENOENT);
}
int
vfs_setopts(struct vfsoptlist *opts, const char *name, const char *value)
{
struct vfsopt *opt;
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) != 0)
continue;
opt->seen = 1;
if (opt->value == NULL)
opt->len = strlen(value) + 1;
else if (strlcpy(opt->value, value, opt->len) >= opt->len)
return (EINVAL);
return (0);
}
return (ENOENT);
}
/*
* Find and copy a mount option.
*
* The size of the buffer has to be specified
* in len; if it is not the same length as the
* mount option, EINVAL is returned.
* Returns ENOENT if the option is not found.
*/
int
vfs_copyopt(opts, name, dest, len)
struct vfsoptlist *opts;
const char *name;
void *dest;
int len;
{
struct vfsopt *opt;
KASSERT(opts != NULL, ("vfs_copyopt: caller passed 'opts' as NULL"));
TAILQ_FOREACH(opt, opts, link) {
if (strcmp(name, opt->name) == 0) {
opt->seen = 1;
if (len != opt->len)
return (EINVAL);
bcopy(opt->value, dest, opt->len);
return (0);
}
}
return (ENOENT);
}
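/*
 * Sketch of how a filesystem's mount routine might drive the query
 * helpers above (hypothetical "myfs"; the option table, option names and
 * flag are placeholders added for illustration):
 *
 *	static const char *myfs_opts[] = { "from", "export", "nodatasync",
 *	    NULL };
 *
 *	static int
 *	myfs_mount(struct mount *mp)
 *	{
 *		uint64_t flags = 0;
 *		char *from;
 *		int error;
 *
 *		if (vfs_filteropt(mp->mnt_optnew, myfs_opts))
 *			return (EINVAL);
 *		from = vfs_getopts(mp->mnt_optnew, "from", &error);
 *		if (error != 0)
 *			return (error);
 *		if (vfs_flagopt(mp->mnt_optnew, "nodatasync", &flags,
 *		    MYFS_NODATASYNC))
 *			printf("myfs: data sync disabled\n");
 *		...
 *	}
 *
 * vfs_filteropt() rejects anything outside the table plus global_opts,
 * vfs_getopts() returns a NUL-terminated value or sets the error, and
 * vfs_flagopt() both reports presence and updates the flag word.
 */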
/*
* This is a helper function for filesystems to traverse their
* vnodes. See MNT_VNODE_FOREACH() in sys/mount.h
*/
struct vnode *
__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
if (should_yield()) {
MNT_IUNLOCK(mp);
kern_yield(PRI_UNCHANGED);
MNT_ILOCK(mp);
}
vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
/* Check if we are done */
if (vp == NULL) {
__mnt_vnode_markerfree(mvp, mp);
return (NULL);
}
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
return (vp);
}
struct vnode *
__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
{
struct vnode *vp;
mtx_assert(MNT_MTX(mp), MA_OWNED);
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
/* Check if we are done */
if (vp == NULL) {
*mvp = NULL;
return (NULL);
}
MNT_REF(mp);
MNT_IUNLOCK(mp);
*mvp = (struct vnode *) malloc(sizeof(struct vnode),
M_VNODE_MARKER,
M_WAITOK | M_ZERO);
MNT_ILOCK(mp);
(*mvp)->v_type = VMARKER;
vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
while (vp != NULL && vp->v_type == VMARKER)
vp = TAILQ_NEXT(vp, v_nmntvnodes);
/* Check if we are done */
if (vp == NULL) {
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
MNT_ILOCK(mp);
*mvp = NULL;
MNT_REL(mp);
return (NULL);
}
(*mvp)->v_mount = mp;
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
return (vp);
}
void
__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
{
if (*mvp == NULL)
return;
mtx_assert(MNT_MTX(mp), MA_OWNED);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
MNT_IUNLOCK(mp);
free(*mvp, M_VNODE_MARKER);
MNT_ILOCK(mp);
*mvp = NULL;
MNT_REL(mp);
}
int
__vfs_statfs(struct mount *mp, struct statfs *sbp)
{
int error;
error = mp->mnt_op->vfs_statfs(mp, &mp->mnt_stat);
if (sbp != &mp->mnt_stat)
*sbp = mp->mnt_stat;
return (error);
}
void
vfs_mountedfrom(struct mount *mp, const char *from)
{
bzero(mp->mnt_stat.f_mntfromname, sizeof mp->mnt_stat.f_mntfromname);
strlcpy(mp->mnt_stat.f_mntfromname, from,
sizeof mp->mnt_stat.f_mntfromname);
}
/*
* ---------------------------------------------------------------------
* This is the api for building mount args and mounting filesystems from
* inside the kernel.
*
* The API works by accumulation of individual args. First error is
* latched.
*
* XXX: should be documented in new manpage kernel_mount(9)
*/
/* A memory allocation which must be freed when we are done */
struct mntaarg {
SLIST_ENTRY(mntaarg) next;
};
/* The header for the mount arguments */
struct mntarg {
struct iovec *v;
int len;
int error;
SLIST_HEAD(, mntaarg) list;
};
/*
* Add a boolean argument.
*
* flag is the boolean value.
* name must start with "no".
*/
struct mntarg *
mount_argb(struct mntarg *ma, int flag, const char *name)
{
KASSERT(name[0] == 'n' && name[1] == 'o',
("mount_argb(...,%s): name must start with 'no'", name));
return (mount_arg(ma, name + (flag ? 2 : 0), NULL, 0));
}
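/*
 * Example, for illustration: mount_argb(ma, fsflags & MNT_RDONLY, "noro")
 * appends the option "ro" when the flag is set and "noro" when it is
 * clear, which is how sys_mount() above translates the classic flag bits
 * into nmount-style options.
 */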
/*
* Add an argument printf style
*/
struct mntarg *
mount_argf(struct mntarg *ma, const char *name, const char *fmt, ...)
{
va_list ap;
struct mntaarg *maa;
struct sbuf *sb;
int len;
if (ma == NULL) {
ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INIT(&ma->list);
}
if (ma->error)
return (ma);
ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
M_MOUNT, M_WAITOK);
ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
ma->v[ma->len].iov_len = strlen(name) + 1;
ma->len++;
sb = sbuf_new_auto();
va_start(ap, fmt);
sbuf_vprintf(sb, fmt, ap);
va_end(ap);
sbuf_finish(sb);
len = sbuf_len(sb) + 1;
maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INSERT_HEAD(&ma->list, maa, next);
bcopy(sbuf_data(sb), maa + 1, len);
sbuf_delete(sb);
ma->v[ma->len].iov_base = maa + 1;
ma->v[ma->len].iov_len = len;
ma->len++;
return (ma);
}
/*
* Add an argument which is a userland string.
*/
struct mntarg *
mount_argsu(struct mntarg *ma, const char *name, const void *val, int len)
{
struct mntaarg *maa;
char *tbuf;
if (val == NULL)
return (ma);
if (ma == NULL) {
ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INIT(&ma->list);
}
if (ma->error)
return (ma);
maa = malloc(sizeof *maa + len, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INSERT_HEAD(&ma->list, maa, next);
tbuf = (void *)(maa + 1);
ma->error = copyinstr(val, tbuf, len, NULL);
return (mount_arg(ma, name, tbuf, -1));
}
/*
* Plain argument.
*
* If length is -1, treat value as a C string.
*/
struct mntarg *
mount_arg(struct mntarg *ma, const char *name, const void *val, int len)
{
if (ma == NULL) {
ma = malloc(sizeof *ma, M_MOUNT, M_WAITOK | M_ZERO);
SLIST_INIT(&ma->list);
}
if (ma->error)
return (ma);
ma->v = realloc(ma->v, sizeof *ma->v * (ma->len + 2),
M_MOUNT, M_WAITOK);
ma->v[ma->len].iov_base = (void *)(uintptr_t)name;
ma->v[ma->len].iov_len = strlen(name) + 1;
ma->len++;
ma->v[ma->len].iov_base = (void *)(uintptr_t)val;
if (len < 0)
ma->v[ma->len].iov_len = strlen(val) + 1;
else
ma->v[ma->len].iov_len = len;
ma->len++;
return (ma);
}
/*
* Free a mntarg structure
*/
static void
free_mntarg(struct mntarg *ma)
{
struct mntaarg *maa;
while (!SLIST_EMPTY(&ma->list)) {
maa = SLIST_FIRST(&ma->list);
SLIST_REMOVE_HEAD(&ma->list, next);
free(maa, M_MOUNT);
}
free(ma->v, M_MOUNT);
free(ma, M_MOUNT);
}
/*
* Mount a filesystem
*/
int
kernel_mount(struct mntarg *ma, int flags)
{
struct uio auio;
int error;
KASSERT(ma != NULL, ("kernel_mount NULL ma"));
KASSERT(ma->v != NULL, ("kernel_mount NULL ma->v"));
KASSERT(!(ma->len & 1), ("kernel_mount odd ma->len (%d)", ma->len));
auio.uio_iov = ma->v;
auio.uio_iovcnt = ma->len;
auio.uio_segflg = UIO_SYSSPACE;
error = ma->error;
if (!error)
error = vfs_donmount(curthread, flags, &auio);
free_mntarg(ma);
return (error);
}
/*
* A printflike function to mount a filesystem.
*/
int
kernel_vmount(int flags, ...)
{
struct mntarg *ma = NULL;
va_list ap;
const char *cp;
const void *vp;
int error;
va_start(ap, flags);
for (;;) {
cp = va_arg(ap, const char *);
if (cp == NULL)
break;
vp = va_arg(ap, const void *);
ma = mount_arg(ma, cp, vp, (vp != NULL ? -1 : 0));
}
va_end(ap);
error = kernel_mount(ma, flags);
return (error);
}
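/*
 * In-kernel usage sketch (illustration only; the type, path and device
 * are placeholders): the accumulation API lets kernel code mount a
 * filesystem without building iovecs by hand, e.g.
 *
 *	error = kernel_vmount(MNT_RDONLY,
 *	    "fstype", "cd9660",
 *	    "fspath", "/dist",
 *	    "from", "/dev/cd0",
 *	    NULL);
 *
 * The variable arguments are name/value string pairs terminated by NULL;
 * the first error latched in the mntarg chain is what kernel_mount()
 * eventually returns.
 */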
void
vfs_oexport_conv(const struct oexport_args *oexp, struct export_args *exp)
{
bcopy(oexp, exp, sizeof(*oexp));
exp->ex_numsecflavors = 0;
}
Index: head/sys/kern/vfs_syscalls.c
===================================================================
--- head/sys/kern/vfs_syscalls.c (revision 225616)
+++ head/sys/kern/vfs_syscalls.c (revision 225617)
@@ -1,4847 +1,4847 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_capsicum.h"
#include "opt_compat.h"
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/capability.h>
#include <sys/disk.h>
#include <sys/sysent.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sx.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/dirent.h>
#include <sys/jail.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <machine/stdarg.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/uma.h>
SDT_PROVIDER_DEFINE(vfs);
SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
static int setfflags(struct thread *td, struct vnode *, int);
static int setutimes(struct thread *td, struct vnode *,
const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
struct thread *td);
/*
* The module initialization routine for POSIX asynchronous I/O will
* set this to the version of AIO that it implements. (Zero means
* that it is not implemented.) This value is used here by pathconf()
* and in kern_descrip.c by fpathconf().
*/
int async_io_version;
#ifdef DEBUG
static int syncprt = 0;
SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif
/*
* Sync each mounted filesystem.
*/
#ifndef _SYS_SYSPROTO_H_
struct sync_args {
int dummy;
};
#endif
/* ARGSUSED */
int
-sync(td, uap)
+sys_sync(td, uap)
struct thread *td;
struct sync_args *uap;
{
struct mount *mp, *nmp;
int vfslocked;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
vfslocked = VFS_LOCK_GIANT(mp);
if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
MNT_ILOCK(mp);
mp->mnt_noasync++;
mp->mnt_kern_flag &= ~MNTK_ASYNC;
MNT_IUNLOCK(mp);
vfs_msync(mp, MNT_NOWAIT);
VFS_SYNC(mp, MNT_NOWAIT);
MNT_ILOCK(mp);
mp->mnt_noasync--;
if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
mp->mnt_noasync == 0)
mp->mnt_kern_flag |= MNTK_ASYNC;
MNT_IUNLOCK(mp);
vn_finished_write(mp);
}
VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
return (0);
}
/*
* Change filesystem quotas.
*/
#ifndef _SYS_SYSPROTO_H_
struct quotactl_args {
char *path;
int cmd;
int uid;
caddr_t arg;
};
#endif
int
-quotactl(td, uap)
+sys_quotactl(td, uap)
struct thread *td;
register struct quotactl_args /* {
char *path;
int cmd;
int uid;
caddr_t arg;
} */ *uap;
{
struct mount *mp;
int vfslocked;
int error;
struct nameidata nd;
AUDIT_ARG_CMD(uap->cmd);
AUDIT_ARG_UID(uap->uid);
if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
return (EPERM);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
mp = nd.ni_vp->v_mount;
vfs_ref(mp);
vput(nd.ni_vp);
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Used by statfs conversion routines to scale the block size up if
* necessary so that all of the block counts are <= 'max_size'. Note
* that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
* value of 'n'.
*/
void
statfs_scale_blocks(struct statfs *sf, long max_size)
{
uint64_t count;
int shift;
KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
/*
* Attempt to scale the block counts to give a more accurate
* overview to userland of the ratio of free space to used
* space. To do this, find the largest block count and compute
* a divisor that lets it fit into a signed integer <= max_size.
*/
if (sf->f_bavail < 0)
count = -sf->f_bavail;
else
count = sf->f_bavail;
count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
if (count <= max_size)
return;
count >>= flsl(max_size);
shift = 0;
while (count > 0) {
shift++;
count >>= 1;
}
sf->f_bsize <<= shift;
sf->f_blocks >>= shift;
sf->f_bfree >>= shift;
sf->f_bavail >>= shift;
}
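/*
 * Worked example of the scaling above (illustrative): with
 * max_size == 0x7fffffff and a largest block count of 2^40,
 * "count >>= flsl(max_size)" leaves 2^9, so the loop yields
 * shift == 10; f_bsize is then multiplied by 1024 and each block
 * count divided by 1024, giving a maximum count of 2^30 <= max_size.
 */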
/*
* Get filesystem statistics.
*/
#ifndef _SYS_SYSPROTO_H_
struct statfs_args {
char *path;
struct statfs *buf;
};
#endif
int
-statfs(td, uap)
+sys_statfs(td, uap)
struct thread *td;
register struct statfs_args /* {
char *path;
struct statfs *buf;
} */ *uap;
{
struct statfs sf;
int error;
error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
if (error == 0)
error = copyout(&sf, uap->buf, sizeof(sf));
return (error);
}
int
kern_statfs(struct thread *td, char *path, enum uio_seg pathseg,
struct statfs *buf)
{
struct mount *mp;
struct statfs *sp, sb;
int vfslocked;
int error;
struct nameidata nd;
NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
mp = nd.ni_vp->v_mount;
vfs_ref(mp);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#ifdef MAC
error = mac_mount_check_stat(td->td_ucred, mp);
if (error)
goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
*/
sp = &mp->mnt_stat;
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp);
if (error)
goto out;
if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
*buf = *sp;
out:
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Get filesystem statistics.
*/
#ifndef _SYS_SYSPROTO_H_
struct fstatfs_args {
int fd;
struct statfs *buf;
};
#endif
int
-fstatfs(td, uap)
+sys_fstatfs(td, uap)
struct thread *td;
register struct fstatfs_args /* {
int fd;
struct statfs *buf;
} */ *uap;
{
struct statfs sf;
int error;
error = kern_fstatfs(td, uap->fd, &sf);
if (error == 0)
error = copyout(&sf, uap->buf, sizeof(sf));
return (error);
}
int
kern_fstatfs(struct thread *td, int fd, struct statfs *buf)
{
struct file *fp;
struct mount *mp;
struct statfs *sp, sb;
int vfslocked;
struct vnode *vp;
int error;
AUDIT_ARG_FD(fd);
error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
if (error)
return (error);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
#ifdef AUDIT
AUDIT_ARG_VNODE1(vp);
#endif
mp = vp->v_mount;
if (mp)
vfs_ref(mp);
VOP_UNLOCK(vp, 0);
fdrop(fp, td);
if (mp == NULL) {
error = EBADF;
goto out;
}
error = vfs_busy(mp, 0);
vfs_rel(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#ifdef MAC
error = mac_mount_check_stat(td->td_ucred, mp);
if (error)
goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
*/
sp = &mp->mnt_stat;
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp);
if (error)
goto out;
if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
*buf = *sp;
out:
if (mp)
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Get statistics on all filesystems.
*/
#ifndef _SYS_SYSPROTO_H_
struct getfsstat_args {
struct statfs *buf;
long bufsize;
int flags;
};
#endif
int
-getfsstat(td, uap)
+sys_getfsstat(td, uap)
struct thread *td;
register struct getfsstat_args /* {
struct statfs *buf;
long bufsize;
int flags;
} */ *uap;
{
return (kern_getfsstat(td, &uap->buf, uap->bufsize, UIO_USERSPACE,
uap->flags));
}
/*
* If (bufsize > 0 && bufseg == UIO_SYSSPACE), the caller is
* responsible for freeing the memory that will be allocated in '*buf'.
*/
int
kern_getfsstat(struct thread *td, struct statfs **buf, size_t bufsize,
enum uio_seg bufseg, int flags)
{
struct mount *mp, *nmp;
struct statfs *sfsp, *sp, sb;
size_t count, maxcount;
int vfslocked;
int error;
maxcount = bufsize / sizeof(struct statfs);
if (bufsize == 0)
sfsp = NULL;
else if (bufseg == UIO_USERSPACE)
sfsp = *buf;
else /* if (bufseg == UIO_SYSSPACE) */ {
count = 0;
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
count++;
}
mtx_unlock(&mountlist_mtx);
if (maxcount > count)
maxcount = count;
sfsp = *buf = malloc(maxcount * sizeof(struct statfs), M_TEMP,
M_WAITOK);
}
count = 0;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
if (prison_canseemount(td->td_ucred, mp) != 0) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
#ifdef MAC
if (mac_mount_check_stat(td->td_ucred, mp) != 0) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
#endif
if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
vfslocked = VFS_LOCK_GIANT(mp);
if (sfsp && count < maxcount) {
sp = &mp->mnt_stat;
/*
* Set these in case the underlying filesystem
* fails to do so.
*/
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
/*
* If MNT_NOWAIT or MNT_LAZY is specified, do not
* refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
* overrides MNT_WAIT.
*/
if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
(flags & MNT_WAIT)) &&
(error = VFS_STATFS(mp, sp))) {
VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
continue;
}
if (priv_check(td, PRIV_VFS_GENERATION)) {
bcopy(sp, &sb, sizeof(sb));
sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
prison_enforce_statfs(td->td_ucred, mp, &sb);
sp = &sb;
}
if (bufseg == UIO_SYSSPACE)
bcopy(sp, sfsp, sizeof(*sp));
else /* if (bufseg == UIO_USERSPACE) */ {
error = copyout(sp, sfsp, sizeof(*sp));
if (error) {
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
}
sfsp++;
}
VFS_UNLOCK_GIANT(vfslocked);
count++;
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
if (sfsp && count > maxcount)
td->td_retval[0] = maxcount;
else
td->td_retval[0] = count;
return (0);
}
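/*
 * On return td->td_retval[0] holds, in effect, the number of mounted
 * filesystems visible to the caller, clamped to the number of statfs
 * structures that fit in the supplied buffer when one was given; a
 * caller can therefore pass bufsize == 0 to learn how many mounts
 * exist before allocating.
 */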
#ifdef COMPAT_FREEBSD4
/*
* Get old format filesystem statistics.
*/
static void cvtstatfs(struct statfs *, struct ostatfs *);
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_statfs_args {
char *path;
struct ostatfs *buf;
};
#endif
int
freebsd4_statfs(td, uap)
struct thread *td;
struct freebsd4_statfs_args /* {
char *path;
struct ostatfs *buf;
} */ *uap;
{
struct ostatfs osb;
struct statfs sf;
int error;
error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
if (error)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
}
/*
* Get filesystem statistics.
*/
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_fstatfs_args {
int fd;
struct ostatfs *buf;
};
#endif
int
freebsd4_fstatfs(td, uap)
struct thread *td;
struct freebsd4_fstatfs_args /* {
int fd;
struct ostatfs *buf;
} */ *uap;
{
struct ostatfs osb;
struct statfs sf;
int error;
error = kern_fstatfs(td, uap->fd, &sf);
if (error)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
}
/*
* Get statistics on all filesystems.
*/
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_getfsstat_args {
struct ostatfs *buf;
long bufsize;
int flags;
};
#endif
int
freebsd4_getfsstat(td, uap)
struct thread *td;
register struct freebsd4_getfsstat_args /* {
struct ostatfs *buf;
long bufsize;
int flags;
} */ *uap;
{
struct statfs *buf, *sp;
struct ostatfs osb;
size_t count, size;
int error;
count = uap->bufsize / sizeof(struct ostatfs);
size = count * sizeof(struct statfs);
error = kern_getfsstat(td, &buf, size, UIO_SYSSPACE, uap->flags);
if (size > 0) {
count = td->td_retval[0];
sp = buf;
while (count > 0 && error == 0) {
cvtstatfs(sp, &osb);
error = copyout(&osb, uap->buf, sizeof(osb));
sp++;
uap->buf++;
count--;
}
free(buf, M_TEMP);
}
return (error);
}
/*
* Implement fstatfs() for (NFS) file handles.
*/
#ifndef _SYS_SYSPROTO_H_
struct freebsd4_fhstatfs_args {
struct fhandle *u_fhp;
struct ostatfs *buf;
};
#endif
int
freebsd4_fhstatfs(td, uap)
struct thread *td;
struct freebsd4_fhstatfs_args /* {
struct fhandle *u_fhp;
struct ostatfs *buf;
} */ *uap;
{
struct ostatfs osb;
struct statfs sf;
fhandle_t fh;
int error;
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
if (error)
return (error);
error = kern_fhstatfs(td, fh, &sf);
if (error)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
}
/*
* Convert a new format statfs structure to an old format statfs structure.
*/
static void
cvtstatfs(nsp, osp)
struct statfs *nsp;
struct ostatfs *osp;
{
statfs_scale_blocks(nsp, LONG_MAX);
bzero(osp, sizeof(*osp));
osp->f_bsize = nsp->f_bsize;
osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
osp->f_blocks = nsp->f_blocks;
osp->f_bfree = nsp->f_bfree;
osp->f_bavail = nsp->f_bavail;
osp->f_files = MIN(nsp->f_files, LONG_MAX);
osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
osp->f_owner = nsp->f_owner;
osp->f_type = nsp->f_type;
osp->f_flags = nsp->f_flags;
osp->f_syncwrites = MIN(nsp->f_syncwrites, LONG_MAX);
osp->f_asyncwrites = MIN(nsp->f_asyncwrites, LONG_MAX);
osp->f_syncreads = MIN(nsp->f_syncreads, LONG_MAX);
osp->f_asyncreads = MIN(nsp->f_asyncreads, LONG_MAX);
strlcpy(osp->f_fstypename, nsp->f_fstypename,
MIN(MFSNAMELEN, OMFSNAMELEN));
strlcpy(osp->f_mntonname, nsp->f_mntonname,
MIN(MNAMELEN, OMNAMELEN));
strlcpy(osp->f_mntfromname, nsp->f_mntfromname,
MIN(MNAMELEN, OMNAMELEN));
osp->f_fsid = nsp->f_fsid;
}
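/*
 * The conversion clamps 64-bit counters that no longer fit the old
 * long-sized fields and relies on statfs_scale_blocks() above to
 * pre-scale the block counts so they stay within LONG_MAX.
 */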
#endif /* COMPAT_FREEBSD4 */
/*
* Change current working directory to a given file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchdir_args {
int fd;
};
#endif
int
-fchdir(td, uap)
+sys_fchdir(td, uap)
struct thread *td;
struct fchdir_args /* {
int fd;
} */ *uap;
{
register struct filedesc *fdp = td->td_proc->p_fd;
struct vnode *vp, *tdp, *vpold;
struct mount *mp;
struct file *fp;
int vfslocked;
int error;
AUDIT_ARG_FD(uap->fd);
if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
return (error);
vp = fp->f_vnode;
VREF(vp);
fdrop(fp, td);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
error = change_dir(vp, td);
while (!error && (mp = vp->v_mountedhere) != NULL) {
int tvfslocked;
if (vfs_busy(mp, 0))
continue;
tvfslocked = VFS_LOCK_GIANT(mp);
error = VFS_ROOT(mp, LK_SHARED, &tdp);
vfs_unbusy(mp);
if (error) {
VFS_UNLOCK_GIANT(tvfslocked);
break;
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
vp = tdp;
vfslocked = tvfslocked;
}
if (error) {
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
FILEDESC_XLOCK(fdp);
vpold = fdp->fd_cdir;
fdp->fd_cdir = vp;
FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
vrele(vpold);
VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
/*
* Change current working directory (``.'').
*/
#ifndef _SYS_SYSPROTO_H_
struct chdir_args {
char *path;
};
#endif
int
-chdir(td, uap)
+sys_chdir(td, uap)
struct thread *td;
struct chdir_args /* {
char *path;
} */ *uap;
{
return (kern_chdir(td, uap->path, UIO_USERSPACE));
}
int
kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
{
register struct filedesc *fdp = td->td_proc->p_fd;
int error;
struct nameidata nd;
struct vnode *vp;
int vfslocked;
NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
MPSAFE, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
if ((error = change_dir(nd.ni_vp, td)) != 0) {
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
VOP_UNLOCK(nd.ni_vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
FILEDESC_XLOCK(fdp);
vp = fdp->fd_cdir;
fdp->fd_cdir = nd.ni_vp;
FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
/*
* Helper function for the raised chroot(2) security setting: refuse the
* operation if any file descriptors are open directories.
*/
static int
chroot_refuse_vdir_fds(fdp)
struct filedesc *fdp;
{
struct vnode *vp;
struct file *fp;
int fd;
FILEDESC_LOCK_ASSERT(fdp);
for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
fp = fget_locked(fdp, fd);
if (fp == NULL)
continue;
if (fp->f_type == DTYPE_VNODE) {
vp = fp->f_vnode;
if (vp->v_type == VDIR)
return (EPERM);
}
}
return (0);
}
/*
* This sysctl determines if we will allow a process to chroot(2) if it
* has a directory open:
* 0: disallowed for all processes.
* 1: allowed for processes that were not already chroot(2)'ed.
* 2: allowed for all processes.
*/
static int chroot_allow_open_directories = 1;
SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
&chroot_allow_open_directories, 0, "");
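/*
 * The knob is runtime-tunable, e.g.
 * "sysctl kern.chroot_allow_open_directories=0" tightens the check
 * for every process.
 */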
/*
* Change notion of root (``/'') directory.
*/
#ifndef _SYS_SYSPROTO_H_
struct chroot_args {
char *path;
};
#endif
int
-chroot(td, uap)
+sys_chroot(td, uap)
struct thread *td;
struct chroot_args /* {
char *path;
} */ *uap;
{
int error;
struct nameidata nd;
int vfslocked;
error = priv_check(td, PRIV_VFS_CHROOT);
if (error)
return (error);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
goto error;
vfslocked = NDHASGIANT(&nd);
if ((error = change_dir(nd.ni_vp, td)) != 0)
goto e_vunlock;
#ifdef MAC
if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
goto e_vunlock;
#endif
VOP_UNLOCK(nd.ni_vp, 0);
error = change_root(nd.ni_vp, td);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
e_vunlock:
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
error:
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
/*
* Common routine for chroot and chdir. Callers must provide a locked vnode
* instance.
*/
int
change_dir(vp, td)
struct vnode *vp;
struct thread *td;
{
int error;
ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
if (vp->v_type != VDIR)
return (ENOTDIR);
#ifdef MAC
error = mac_vnode_check_chdir(td->td_ucred, vp);
if (error)
return (error);
#endif
error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
return (error);
}
/*
* Common routine for kern_chroot() and jail_attach(). The caller is
* responsible for invoking priv_check() and mac_vnode_check_chroot() to
* authorize this operation.
*/
int
change_root(vp, td)
struct vnode *vp;
struct thread *td;
{
struct filedesc *fdp;
struct vnode *oldvp;
int vfslocked;
int error;
VFS_ASSERT_GIANT(vp->v_mount);
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
if (chroot_allow_open_directories == 0 ||
(chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
error = chroot_refuse_vdir_fds(fdp);
if (error) {
FILEDESC_XUNLOCK(fdp);
return (error);
}
}
oldvp = fdp->fd_rdir;
fdp->fd_rdir = vp;
VREF(fdp->fd_rdir);
if (!fdp->fd_jdir) {
fdp->fd_jdir = vp;
VREF(fdp->fd_jdir);
}
FILEDESC_XUNLOCK(fdp);
vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
vrele(oldvp);
VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
static __inline cap_rights_t
flags_to_rights(int flags)
{
cap_rights_t rights = 0;
switch ((flags & O_ACCMODE)) {
case O_RDONLY:
rights |= CAP_READ;
break;
case O_RDWR:
rights |= CAP_READ;
/* fall through */
case O_WRONLY:
rights |= CAP_WRITE;
break;
case O_EXEC:
rights |= CAP_FEXECVE;
break;
}
if (flags & O_CREAT)
rights |= CAP_CREATE;
if (flags & O_TRUNC)
rights |= CAP_FTRUNCATE;
if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
rights |= CAP_FLOCK;
return (rights);
}
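/*
 * Example mapping (follows directly from the switch above): an
 * open(2) with O_RDWR | O_CREAT | O_TRUNC requests
 * CAP_READ | CAP_WRITE | CAP_CREATE | CAP_FTRUNCATE.
 */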
/*
* Check permissions, allocate an open file structure, and call the device
* open routine if any.
*/
#ifndef _SYS_SYSPROTO_H_
struct open_args {
char *path;
int flags;
int mode;
};
#endif
int
-open(td, uap)
+sys_open(td, uap)
struct thread *td;
register struct open_args /* {
char *path;
int flags;
int mode;
} */ *uap;
{
return (kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct openat_args {
int fd;
char *path;
int flag;
int mode;
};
#endif
int
-openat(struct thread *td, struct openat_args *uap)
+sys_openat(struct thread *td, struct openat_args *uap)
{
return (kern_openat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
uap->mode));
}
int
kern_open(struct thread *td, char *path, enum uio_seg pathseg, int flags,
int mode)
{
return (kern_openat(td, AT_FDCWD, path, pathseg, flags, mode));
}
int
kern_openat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int flags, int mode)
{
struct proc *p = td->td_proc;
struct filedesc *fdp = p->p_fd;
struct file *fp;
struct vnode *vp;
int cmode;
struct file *nfp;
int type, indx = -1, error, error_open;
struct flock lf;
struct nameidata nd;
int vfslocked;
cap_rights_t rights_needed = CAP_LOOKUP;
AUDIT_ARG_FFLAGS(flags);
AUDIT_ARG_MODE(mode);
/* XXX: audit dirfd */
rights_needed |= flags_to_rights(flags);
/*
* Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
* may be specified.
*/
if (flags & O_EXEC) {
if (flags & O_ACCMODE)
return (EINVAL);
} else if ((flags & O_ACCMODE) == O_ACCMODE)
return (EINVAL);
else
flags = FFLAGS(flags);
/*
* Allocate an open file structure, but don't install a descriptor yet.
*/
error = falloc_noinstall(td, &nfp);
if (error)
return (error);
/* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
fp = nfp;
/* Set the flags early so the finit in devfs can pick them up. */
fp->f_flag = flags & FMASK;
cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
path, fd, rights_needed, td);
td->td_dupfd = -1; /* XXX check for fdopen */
error = vn_open(&nd, &flags, cmode, fp);
if (error) {
/*
* If vn_open() replaced the method vector, something wondrous
* happened deep below and we just pass it up, pretending we know
* what we are doing.
*/
if (error == ENXIO && fp->f_ops != &badfileops)
goto success;
/*
* handle special fdopen() case. bleh. dupfdopen() is
* responsible for dropping the old contents of ofiles[indx]
* if it succeeds.
*
* Don't do this for relative (capability) lookups; we don't
* understand exactly what would happen, and we don't think
* that it ever should.
*/
if ((nd.ni_strictrelative == 0) &&
(error == ENODEV || error == ENXIO) &&
(td->td_dupfd >= 0)) {
/* XXX from fdopen */
error_open = error;
if ((error = finstall(td, fp, &indx, flags)) != 0)
goto bad_unlocked;
if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
flags, error_open)) == 0)
goto success;
}
/*
* Clean up the descriptor, but only if another thread hadn't
* replaced or closed it.
*/
if (indx != -1)
fdclose(fdp, fp, indx, td);
fdrop(fp, td);
if (error == ERESTART)
error = EINTR;
return (error);
}
td->td_dupfd = 0;
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
/*
* Store the vnode, for any f_type. Typically, the vnode use
* count is decremented by direct call to vn_closefile() for
* files that switched type in the cdevsw fdopen() method.
*/
fp->f_vnode = vp;
/*
* If the file wasn't claimed by devfs, bind it to the normal
* vnode operations here.
*/
if (fp->f_ops == &badfileops) {
KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
fp->f_seqcount = 1;
finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
}
VOP_UNLOCK(vp, 0);
if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
if (flags & O_EXLOCK)
lf.l_type = F_WRLCK;
else
lf.l_type = F_RDLCK;
type = F_FLOCK;
if ((flags & FNONBLOCK) == 0)
type |= F_WAIT;
if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
type)) != 0)
goto bad;
atomic_set_int(&fp->f_flag, FHASLOCK);
}
if (flags & O_TRUNC) {
error = fo_truncate(fp, 0, td->td_ucred, td);
if (error)
goto bad;
}
VFS_UNLOCK_GIANT(vfslocked);
success:
/*
* If we haven't already installed the FD (for dupfdopen), do so now.
*/
if (indx == -1) {
#ifdef CAPABILITIES
if (nd.ni_strictrelative == 1) {
/*
* We are doing a strict relative lookup; wrap the
* result in a capability.
*/
if ((error = kern_capwrap(td, fp, nd.ni_baserights,
&indx)) != 0)
goto bad_unlocked;
} else
#endif
if ((error = finstall(td, fp, &indx, flags)) != 0)
goto bad_unlocked;
}
/*
* Release our private reference, leaving the one associated with
* the descriptor table intact.
*/
fdrop(fp, td);
td->td_retval[0] = indx;
return (0);
bad:
VFS_UNLOCK_GIANT(vfslocked);
bad_unlocked:
if (indx != -1)
fdclose(fdp, fp, indx, td);
fdrop(fp, td);
td->td_retval[0] = -1;
return (error);
}
#ifdef COMPAT_43
/*
* Create a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct ocreat_args {
char *path;
int mode;
};
#endif
int
ocreat(td, uap)
struct thread *td;
register struct ocreat_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_open(td, uap->path, UIO_USERSPACE,
O_WRONLY | O_CREAT | O_TRUNC, uap->mode));
}
#endif /* COMPAT_43 */
/*
* Create a special file.
*/
#ifndef _SYS_SYSPROTO_H_
struct mknod_args {
char *path;
int mode;
int dev;
};
#endif
int
-mknod(td, uap)
+sys_mknod(td, uap)
struct thread *td;
register struct mknod_args /* {
char *path;
int mode;
int dev;
} */ *uap;
{
return (kern_mknod(td, uap->path, UIO_USERSPACE, uap->mode, uap->dev));
}
#ifndef _SYS_SYSPROTO_H_
struct mknodat_args {
int fd;
char *path;
mode_t mode;
dev_t dev;
};
#endif
int
-mknodat(struct thread *td, struct mknodat_args *uap)
+sys_mknodat(struct thread *td, struct mknodat_args *uap)
{
return (kern_mknodat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode,
uap->dev));
}
int
kern_mknod(struct thread *td, char *path, enum uio_seg pathseg, int mode,
int dev)
{
return (kern_mknodat(td, AT_FDCWD, path, pathseg, mode, dev));
}
int
kern_mknodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int mode, int dev)
{
struct vnode *vp;
struct mount *mp;
struct vattr vattr;
int error;
int whiteout = 0;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_MODE(mode);
AUDIT_ARG_DEV(dev);
switch (mode & S_IFMT) {
case S_IFCHR:
case S_IFBLK:
error = priv_check(td, PRIV_VFS_MKNOD_DEV);
break;
case S_IFMT:
error = priv_check(td, PRIV_VFS_MKNOD_BAD);
break;
case S_IFWHT:
error = priv_check(td, PRIV_VFS_MKNOD_WHT);
break;
case S_IFIFO:
if (dev == 0)
return (kern_mkfifoat(td, fd, path, pathseg, mode));
/* FALLTHROUGH */
default:
error = EINVAL;
break;
}
if (error)
return (error);
restart:
bwillwrite();
NDINIT_ATRIGHTS(&nd, CREATE,
LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
CAP_MKFIFO, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
} else {
VATTR_NULL(&vattr);
vattr.va_mode = (mode & ALLPERMS) &
~td->td_proc->p_fd->fd_cmask;
vattr.va_rdev = dev;
whiteout = 0;
switch (mode & S_IFMT) {
case S_IFMT: /* used by badsect to flag bad sectors */
vattr.va_type = VBAD;
break;
case S_IFCHR:
vattr.va_type = VCHR;
break;
case S_IFBLK:
vattr.va_type = VBLK;
break;
case S_IFWHT:
whiteout = 1;
break;
default:
panic("kern_mknod: invalid mode");
}
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
#ifdef MAC
if (error == 0 && !whiteout)
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
&nd.ni_cnd, &vattr);
#endif
if (!error) {
if (whiteout)
error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
else {
error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
&nd.ni_cnd, &vattr);
if (error == 0)
vput(nd.ni_vp);
}
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Create a named pipe.
*/
#ifndef _SYS_SYSPROTO_H_
struct mkfifo_args {
char *path;
int mode;
};
#endif
int
-mkfifo(td, uap)
+sys_mkfifo(td, uap)
struct thread *td;
register struct mkfifo_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_mkfifo(td, uap->path, UIO_USERSPACE, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct mkfifoat_args {
int fd;
char *path;
mode_t mode;
};
#endif
int
-mkfifoat(struct thread *td, struct mkfifoat_args *uap)
+sys_mkfifoat(struct thread *td, struct mkfifoat_args *uap)
{
return (kern_mkfifoat(td, uap->fd, uap->path, UIO_USERSPACE,
uap->mode));
}
int
kern_mkfifo(struct thread *td, char *path, enum uio_seg pathseg, int mode)
{
return (kern_mkfifoat(td, AT_FDCWD, path, pathseg, mode));
}
int
kern_mkfifoat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int mode)
{
struct mount *mp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_MODE(mode);
restart:
bwillwrite();
NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
pathseg, path, fd, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_type = VFIFO;
vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
#ifdef MAC
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
if (error)
goto out;
#endif
error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
if (error == 0)
vput(nd.ni_vp);
#ifdef MAC
out:
#endif
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
/*
* Make a hard file link.
*/
#ifndef _SYS_SYSPROTO_H_
struct link_args {
char *path;
char *link;
};
#endif
int
-link(td, uap)
+sys_link(td, uap)
struct thread *td;
register struct link_args /* {
char *path;
char *link;
} */ *uap;
{
return (kern_link(td, uap->path, uap->link, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct linkat_args {
int fd1;
char *path1;
int fd2;
char *path2;
int flag;
};
#endif
int
-linkat(struct thread *td, struct linkat_args *uap)
+sys_linkat(struct thread *td, struct linkat_args *uap)
{
int flag;
flag = uap->flag;
if (flag & ~AT_SYMLINK_FOLLOW)
return (EINVAL);
return (kern_linkat(td, uap->fd1, uap->fd2, uap->path1, uap->path2,
UIO_USERSPACE, (flag & AT_SYMLINK_FOLLOW) ? FOLLOW : NOFOLLOW));
}
int hardlink_check_uid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
&hardlink_check_uid, 0,
"Unprivileged processes cannot create hard links to files owned by other "
"users");
static int hardlink_check_gid = 0;
SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_gid, CTLFLAG_RW,
&hardlink_check_gid, 0,
"Unprivileged processes cannot create hard links to files owned by other "
"groups");
static int
can_hardlink(struct vnode *vp, struct ucred *cred)
{
struct vattr va;
int error;
if (!hardlink_check_uid && !hardlink_check_gid)
return (0);
error = VOP_GETATTR(vp, &va, cred);
if (error != 0)
return (error);
if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
if (error)
return (error);
}
if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
if (error)
return (error);
}
return (0);
}
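/*
 * With security.bsd.hardlink_check_uid (or _gid) set to non-zero, an
 * unprivileged process attempting to hard-link a file owned by a
 * different uid (or a gid it is not a member of) fails unless
 * priv_check_cred() grants PRIV_VFS_LINK.
 */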
int
kern_link(struct thread *td, char *path, char *link, enum uio_seg segflg)
{
return (kern_linkat(td, AT_FDCWD, AT_FDCWD, path, link, segflg, FOLLOW));
}
int
kern_linkat(struct thread *td, int fd1, int fd2, char *path1, char *path2,
enum uio_seg segflg, int follow)
{
struct vnode *vp;
struct mount *mp;
struct nameidata nd;
int vfslocked;
int lvfslocked;
int error;
bwillwrite();
NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
fd1, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if (vp->v_type == VDIR) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EPERM); /* POSIX */
}
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
segflg, path2, fd2, td);
if ((error = namei(&nd)) == 0) {
lvfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULL) {
if (nd.ni_dvp == nd.ni_vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
error = EEXIST;
} else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
== 0) {
error = can_hardlink(vp, td->td_ucred);
if (error == 0)
#ifdef MAC
error = mac_vnode_check_link(td->td_ucred,
nd.ni_dvp, vp, &nd.ni_cnd);
if (error == 0)
#endif
error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
VOP_UNLOCK(vp, 0);
vput(nd.ni_dvp);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
VFS_UNLOCK_GIANT(lvfslocked);
}
vrele(vp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Make a symbolic link.
*/
#ifndef _SYS_SYSPROTO_H_
struct symlink_args {
char *path;
char *link;
};
#endif
int
-symlink(td, uap)
+sys_symlink(td, uap)
struct thread *td;
register struct symlink_args /* {
char *path;
char *link;
} */ *uap;
{
return (kern_symlink(td, uap->path, uap->link, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct symlinkat_args {
char *path;
int fd;
char *path2;
};
#endif
int
-symlinkat(struct thread *td, struct symlinkat_args *uap)
+sys_symlinkat(struct thread *td, struct symlinkat_args *uap)
{
return (kern_symlinkat(td, uap->path1, uap->fd, uap->path2,
UIO_USERSPACE));
}
int
kern_symlink(struct thread *td, char *path, char *link, enum uio_seg segflg)
{
return (kern_symlinkat(td, path, AT_FDCWD, link, segflg));
}
int
kern_symlinkat(struct thread *td, char *path1, int fd, char *path2,
enum uio_seg segflg)
{
struct mount *mp;
struct vattr vattr;
char *syspath;
int error;
struct nameidata nd;
int vfslocked;
if (segflg == UIO_SYSSPACE) {
syspath = path1;
} else {
syspath = uma_zalloc(namei_zone, M_WAITOK);
if ((error = copyinstr(path1, syspath, MAXPATHLEN, NULL)) != 0)
goto out;
}
AUDIT_ARG_TEXT(syspath);
restart:
bwillwrite();
NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
segflg, path2, fd, td);
if ((error = namei(&nd)) != 0)
goto out;
vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
error = EEXIST;
goto out;
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
goto out;
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
vattr.va_type = VLNK;
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
if (error)
goto out2;
#endif
error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
if (error == 0)
vput(nd.ni_vp);
#ifdef MAC
out2:
#endif
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
out:
if (segflg != UIO_SYSSPACE)
uma_zfree(namei_zone, syspath);
return (error);
}
/*
* Delete a whiteout from the filesystem.
*/
int
-undelete(td, uap)
+sys_undelete(td, uap)
struct thread *td;
register struct undelete_args /* {
char *path;
} */ *uap;
{
int error;
struct mount *mp;
struct nameidata nd;
int vfslocked;
restart:
bwillwrite();
NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
if (nd.ni_vp)
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Delete a name from the filesystem.
*/
#ifndef _SYS_SYSPROTO_H_
struct unlink_args {
char *path;
};
#endif
int
-unlink(td, uap)
+sys_unlink(td, uap)
struct thread *td;
struct unlink_args /* {
char *path;
} */ *uap;
{
return (kern_unlink(td, uap->path, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct unlinkat_args {
int fd;
char *path;
int flag;
};
#endif
int
-unlinkat(struct thread *td, struct unlinkat_args *uap)
+sys_unlinkat(struct thread *td, struct unlinkat_args *uap)
{
int flag = uap->flag;
int fd = uap->fd;
char *path = uap->path;
if (flag & ~AT_REMOVEDIR)
return (EINVAL);
if (flag & AT_REMOVEDIR)
return (kern_rmdirat(td, fd, path, UIO_USERSPACE));
else
return (kern_unlinkat(td, fd, path, UIO_USERSPACE, 0));
}
int
kern_unlink(struct thread *td, char *path, enum uio_seg pathseg)
{
return (kern_unlinkat(td, AT_FDCWD, path, pathseg, 0));
}
int
kern_unlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
ino_t oldinum)
{
struct mount *mp;
struct vnode *vp;
int error;
struct nameidata nd;
struct stat sb;
int vfslocked;
restart:
bwillwrite();
NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
pathseg, path, fd, td);
if ((error = namei(&nd)) != 0)
return (error == EINVAL ? EPERM : error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp->v_type == VDIR && oldinum == 0) {
error = EPERM; /* POSIX */
} else if (oldinum != 0 &&
((error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td)) == 0) &&
sb.st_ino != oldinum) {
error = EIDRM; /* Identifier removed */
} else {
/*
* The root of a mounted filesystem cannot be deleted.
*
* XXX: can this only be a VDIR case?
*/
if (vp->v_vflag & VV_ROOT)
error = EBUSY;
}
if (error == 0) {
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (vp == nd.ni_dvp)
vrele(vp);
else
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp,
V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
#ifdef MAC
error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
if (error)
goto out;
#endif
error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
#ifdef MAC
out:
#endif
vn_finished_write(mp);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (vp == nd.ni_dvp)
vrele(vp);
else
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Reposition read/write file offset.
*/
#ifndef _SYS_SYSPROTO_H_
struct lseek_args {
int fd;
int pad;
off_t offset;
int whence;
};
#endif
int
-lseek(td, uap)
+sys_lseek(td, uap)
struct thread *td;
register struct lseek_args /* {
int fd;
int pad;
off_t offset;
int whence;
} */ *uap;
{
struct ucred *cred = td->td_ucred;
struct file *fp;
struct vnode *vp;
struct vattr vattr;
off_t offset, size;
int error, noneg;
int vfslocked;
AUDIT_ARG_FD(uap->fd);
if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
return (error);
if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
fdrop(fp, td);
return (ESPIPE);
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
noneg = (vp->v_type != VCHR);
offset = uap->offset;
switch (uap->whence) {
case L_INCR:
if (noneg &&
(fp->f_offset < 0 ||
(offset > 0 && fp->f_offset > OFF_MAX - offset))) {
error = EOVERFLOW;
break;
}
offset += fp->f_offset;
break;
case L_XTND:
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
if (error)
break;
/*
* If the file references a disk device, then fetch
* the media size and use that to determine the ending
* offset.
*/
if (vattr.va_size == 0 && vp->v_type == VCHR &&
fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
vattr.va_size = size;
if (noneg &&
(vattr.va_size > OFF_MAX ||
(offset > 0 && vattr.va_size > OFF_MAX - offset))) {
error = EOVERFLOW;
break;
}
offset += vattr.va_size;
break;
case L_SET:
break;
case SEEK_DATA:
error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
break;
case SEEK_HOLE:
error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
break;
default:
error = EINVAL;
}
if (error == 0 && noneg && offset < 0)
error = EINVAL;
if (error != 0)
goto drop;
fp->f_offset = offset;
*(off_t *)(td->td_retval) = fp->f_offset;
drop:
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#if defined(COMPAT_43)
/*
* Reposition read/write file offset.
*/
#ifndef _SYS_SYSPROTO_H_
struct olseek_args {
int fd;
long offset;
int whence;
};
#endif
int
olseek(td, uap)
struct thread *td;
register struct olseek_args /* {
int fd;
long offset;
int whence;
} */ *uap;
{
struct lseek_args /* {
int fd;
int pad;
off_t offset;
int whence;
} */ nuap;
nuap.fd = uap->fd;
nuap.offset = uap->offset;
nuap.whence = uap->whence;
- return (lseek(td, &nuap));
+ return (sys_lseek(td, &nuap));
}
#endif /* COMPAT_43 */
/* Version with the 'pad' argument */
int
freebsd6_lseek(td, uap)
struct thread *td;
register struct freebsd6_lseek_args *uap;
{
struct lseek_args ouap;
ouap.fd = uap->fd;
ouap.offset = uap->offset;
ouap.whence = uap->whence;
- return (lseek(td, &ouap));
+ return (sys_lseek(td, &ouap));
}
/*
* Check access permissions using passed credentials.
*/
static int
vn_access(vp, user_flags, cred, td)
struct vnode *vp;
int user_flags;
struct ucred *cred;
struct thread *td;
{
int error;
accmode_t accmode;
/* Flags == 0 means only check for existence. */
error = 0;
if (user_flags) {
accmode = 0;
if (user_flags & R_OK)
accmode |= VREAD;
if (user_flags & W_OK)
accmode |= VWRITE;
if (user_flags & X_OK)
accmode |= VEXEC;
#ifdef MAC
error = mac_vnode_check_access(cred, vp, accmode);
if (error)
return (error);
#endif
if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
error = VOP_ACCESS(vp, accmode, cred, td);
}
return (error);
}
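/*
 * The caller-supplied access(2) flags map onto vnode access bits as
 * R_OK -> VREAD, W_OK -> VWRITE and X_OK -> VEXEC; user_flags == 0
 * (F_OK) only checks that the path resolves.
 */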
/*
* Check access permissions using "real" credentials.
*/
#ifndef _SYS_SYSPROTO_H_
struct access_args {
char *path;
int flags;
};
#endif
int
-access(td, uap)
+sys_access(td, uap)
struct thread *td;
register struct access_args /* {
char *path;
int flags;
} */ *uap;
{
return (kern_access(td, uap->path, UIO_USERSPACE, uap->flags));
}
#ifndef _SYS_SYSPROTO_H_
struct faccessat_args {
int dirfd;
char *path;
int mode;
int flag;
}
#endif
int
-faccessat(struct thread *td, struct faccessat_args *uap)
+sys_faccessat(struct thread *td, struct faccessat_args *uap)
{
if (uap->flag & ~AT_EACCESS)
return (EINVAL);
return (kern_accessat(td, uap->fd, uap->path, UIO_USERSPACE, uap->flag,
uap->mode));
}
int
kern_access(struct thread *td, char *path, enum uio_seg pathseg, int mode)
{
return (kern_accessat(td, AT_FDCWD, path, pathseg, 0, mode));
}
int
kern_accessat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int flags, int mode)
{
struct ucred *cred, *tmpcred;
struct vnode *vp;
struct nameidata nd;
int vfslocked;
int error;
/*
* Create and modify a temporary credential instead of one that
* is potentially shared.
*/
if (!(flags & AT_EACCESS)) {
cred = td->td_ucred;
tmpcred = crdup(cred);
tmpcred->cr_uid = cred->cr_ruid;
tmpcred->cr_groups[0] = cred->cr_rgid;
td->td_ucred = tmpcred;
} else
cred = tmpcred = td->td_ucred;
AUDIT_ARG_VALUE(mode);
NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
if ((error = namei(&nd)) != 0)
goto out1;
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
error = vn_access(vp, mode, tmpcred, td);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
out1:
if (!(flags & AT_EACCESS)) {
td->td_ucred = cred;
crfree(tmpcred);
}
return (error);
}
/*
* Check access permissions using "effective" credentials.
*/
#ifndef _SYS_SYSPROTO_H_
struct eaccess_args {
char *path;
int flags;
};
#endif
int
-eaccess(td, uap)
+sys_eaccess(td, uap)
struct thread *td;
register struct eaccess_args /* {
char *path;
int flags;
} */ *uap;
{
return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags));
}
int
kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags)
{
return (kern_accessat(td, AT_FDCWD, path, pathseg, AT_EACCESS, flags));
}
#if defined(COMPAT_43)
/*
* Get file status; this version follows links.
*/
#ifndef _SYS_SYSPROTO_H_
struct ostat_args {
char *path;
struct ostat *ub;
};
#endif
int
ostat(td, uap)
struct thread *td;
register struct ostat_args /* {
char *path;
struct ostat *ub;
} */ *uap;
{
struct stat sb;
struct ostat osb;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtstat(&sb, &osb);
error = copyout(&osb, uap->ub, sizeof (osb));
return (error);
}
/*
* Get file status; this version does not follow links.
*/
#ifndef _SYS_SYSPROTO_H_
struct olstat_args {
char *path;
struct ostat *ub;
};
#endif
int
olstat(td, uap)
struct thread *td;
register struct olstat_args /* {
char *path;
struct ostat *ub;
} */ *uap;
{
struct stat sb;
struct ostat osb;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtstat(&sb, &osb);
error = copyout(&osb, uap->ub, sizeof (osb));
return (error);
}
/*
* Convert from an old to a new stat structure.
*/
void
cvtstat(st, ost)
struct stat *st;
struct ostat *ost;
{
ost->st_dev = st->st_dev;
ost->st_ino = st->st_ino;
ost->st_mode = st->st_mode;
ost->st_nlink = st->st_nlink;
ost->st_uid = st->st_uid;
ost->st_gid = st->st_gid;
ost->st_rdev = st->st_rdev;
if (st->st_size < (quad_t)1 << 32)
ost->st_size = st->st_size;
else
ost->st_size = -2;
ost->st_atim = st->st_atim;
ost->st_mtim = st->st_mtim;
ost->st_ctim = st->st_ctim;
ost->st_blksize = st->st_blksize;
ost->st_blocks = st->st_blocks;
ost->st_flags = st->st_flags;
ost->st_gen = st->st_gen;
}
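/*
 * Note that the old stat structure uses 32-bit file sizes; a size
 * that does not fit is reported as -2 rather than being truncated.
 */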
#endif /* COMPAT_43 */
/*
* Get file status; this version follows links.
*/
#ifndef _SYS_SYSPROTO_H_
struct stat_args {
char *path;
struct stat *ub;
};
#endif
int
-stat(td, uap)
+sys_stat(td, uap)
struct thread *td;
register struct stat_args /* {
char *path;
struct stat *ub;
} */ *uap;
{
struct stat sb;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error == 0)
error = copyout(&sb, uap->ub, sizeof (sb));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct fstatat_args {
int fd;
char *path;
struct stat *buf;
int flag;
}
#endif
int
-fstatat(struct thread *td, struct fstatat_args *uap)
+sys_fstatat(struct thread *td, struct fstatat_args *uap)
{
struct stat sb;
int error;
error = kern_statat(td, uap->flag, uap->fd, uap->path,
UIO_USERSPACE, &sb);
if (error == 0)
error = copyout(&sb, uap->buf, sizeof (sb));
return (error);
}
int
kern_stat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
{
return (kern_statat(td, 0, AT_FDCWD, path, pathseg, sbp));
}
int
kern_statat(struct thread *td, int flag, int fd, char *path,
enum uio_seg pathseg, struct stat *sbp)
{
return (kern_statat_vnhook(td, flag, fd, path, pathseg, sbp, NULL));
}
int
kern_statat_vnhook(struct thread *td, int flag, int fd, char *path,
enum uio_seg pathseg, struct stat *sbp,
void (*hook)(struct vnode *vp, struct stat *sbp))
{
struct nameidata nd;
struct stat sb;
int error, vfslocked;
if (flag & ~AT_SYMLINK_NOFOLLOW)
return (EINVAL);
NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
path, fd, CAP_FSTAT, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
if (!error) {
SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
if (S_ISREG(sb.st_mode))
SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
if (__predict_false(hook != NULL))
hook(nd.ni_vp, &sb);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
*sbp = sb;
#ifdef KTRACE
if (KTRPOINT(td, KTR_STRUCT))
ktrstat(&sb);
#endif
return (0);
}
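/*
 * On success the SDT probes declared at the top of this file fire
 * with the looked-up path and st_mode, and any caller-supplied hook
 * is invoked while the vnode is still held.
 */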
/*
* Get file status; this version does not follow links.
*/
#ifndef _SYS_SYSPROTO_H_
struct lstat_args {
char *path;
struct stat *ub;
};
#endif
int
-lstat(td, uap)
+sys_lstat(td, uap)
struct thread *td;
register struct lstat_args /* {
char *path;
struct stat *ub;
} */ *uap;
{
struct stat sb;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error == 0)
error = copyout(&sb, uap->ub, sizeof (sb));
return (error);
}
int
kern_lstat(struct thread *td, char *path, enum uio_seg pathseg, struct stat *sbp)
{
return (kern_statat(td, AT_SYMLINK_NOFOLLOW, AT_FDCWD, path, pathseg,
sbp));
}
/*
* Implementation of the NetBSD [l]stat() functions.
*/
void
cvtnstat(sb, nsb)
struct stat *sb;
struct nstat *nsb;
{
bzero(nsb, sizeof *nsb);
nsb->st_dev = sb->st_dev;
nsb->st_ino = sb->st_ino;
nsb->st_mode = sb->st_mode;
nsb->st_nlink = sb->st_nlink;
nsb->st_uid = sb->st_uid;
nsb->st_gid = sb->st_gid;
nsb->st_rdev = sb->st_rdev;
nsb->st_atim = sb->st_atim;
nsb->st_mtim = sb->st_mtim;
nsb->st_ctim = sb->st_ctim;
nsb->st_size = sb->st_size;
nsb->st_blocks = sb->st_blocks;
nsb->st_blksize = sb->st_blksize;
nsb->st_flags = sb->st_flags;
nsb->st_gen = sb->st_gen;
nsb->st_birthtim = sb->st_birthtim;
}
#ifndef _SYS_SYSPROTO_H_
struct nstat_args {
char *path;
struct nstat *ub;
};
#endif
int
-nstat(td, uap)
+sys_nstat(td, uap)
struct thread *td;
register struct nstat_args /* {
char *path;
struct nstat *ub;
} */ *uap;
{
struct stat sb;
struct nstat nsb;
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtnstat(&sb, &nsb);
error = copyout(&nsb, uap->ub, sizeof (nsb));
return (error);
}
/*
* NetBSD lstat. Get file status; this version does not follow links.
*/
#ifndef _SYS_SYSPROTO_H_
struct lstat_args {
char *path;
struct stat *ub;
};
#endif
int
-nlstat(td, uap)
+sys_nlstat(td, uap)
struct thread *td;
register struct nlstat_args /* {
char *path;
struct nstat *ub;
} */ *uap;
{
struct stat sb;
struct nstat nsb;
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
if (error)
return (error);
cvtnstat(&sb, &nsb);
error = copyout(&nsb, uap->ub, sizeof (nsb));
return (error);
}
/*
* Get configurable pathname variables.
*/
#ifndef _SYS_SYSPROTO_H_
struct pathconf_args {
char *path;
int name;
};
#endif
int
-pathconf(td, uap)
+sys_pathconf(td, uap)
struct thread *td;
register struct pathconf_args /* {
char *path;
int name;
} */ *uap;
{
return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, FOLLOW));
}
#ifndef _SYS_SYSPROTO_H_
struct lpathconf_args {
char *path;
int name;
};
#endif
int
-lpathconf(td, uap)
+sys_lpathconf(td, uap)
struct thread *td;
register struct lpathconf_args /* {
char *path;
int name;
} */ *uap;
{
return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
}
int
kern_pathconf(struct thread *td, char *path, enum uio_seg pathseg, int name,
u_long flags)
{
struct nameidata nd;
int error, vfslocked;
NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
flags, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
/* If asynchronous I/O is available, it works for all files. */
if (name == _PC_ASYNC_IO)
td->td_retval[0] = async_io_version;
else
error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
vput(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Return target name of a symbolic link.
*/
#ifndef _SYS_SYSPROTO_H_
struct readlink_args {
char *path;
char *buf;
size_t count;
};
#endif
int
-readlink(td, uap)
+sys_readlink(td, uap)
struct thread *td;
register struct readlink_args /* {
char *path;
char *buf;
size_t count;
} */ *uap;
{
return (kern_readlink(td, uap->path, UIO_USERSPACE, uap->buf,
UIO_USERSPACE, uap->count));
}
#ifndef _SYS_SYSPROTO_H_
struct readlinkat_args {
int fd;
char *path;
char *buf;
size_t bufsize;
};
#endif
int
-readlinkat(struct thread *td, struct readlinkat_args *uap)
+sys_readlinkat(struct thread *td, struct readlinkat_args *uap)
{
return (kern_readlinkat(td, uap->fd, uap->path, UIO_USERSPACE,
uap->buf, UIO_USERSPACE, uap->bufsize));
}
int
kern_readlink(struct thread *td, char *path, enum uio_seg pathseg, char *buf,
enum uio_seg bufseg, size_t count)
{
return (kern_readlinkat(td, AT_FDCWD, path, pathseg, buf, bufseg,
count));
}
int
kern_readlinkat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
char *buf, enum uio_seg bufseg, size_t count)
{
struct vnode *vp;
struct iovec aiov;
struct uio auio;
int error;
struct nameidata nd;
int vfslocked;
if (count > INT_MAX)
return (EINVAL);
NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, fd, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
#ifdef MAC
error = mac_vnode_check_readlink(td->td_ucred, vp);
if (error) {
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#endif
if (vp->v_type != VLNK)
error = EINVAL;
else {
aiov.iov_base = buf;
aiov.iov_len = count;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_segflg = bufseg;
auio.uio_td = td;
auio.uio_resid = count;
error = VOP_READLINK(vp, &auio, td->td_ucred);
}
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
td->td_retval[0] = count - auio.uio_resid;
return (error);
}
/*
* Common implementation code for chflags() and fchflags().
*/
static int
setfflags(td, vp, flags)
struct thread *td;
struct vnode *vp;
int flags;
{
int error;
struct mount *mp;
struct vattr vattr;
/*
* Prevent non-root users from setting flags on devices. When a
* device is reused, users could otherwise retain ownership of the
* device if they were allowed to set flags, and programs assume
* that chown(2) can't fail when done as root.
*/
if (vp->v_type == VCHR || vp->v_type == VBLK) {
error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
if (error)
return (error);
}
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VATTR_NULL(&vattr);
vattr.va_flags = flags;
#ifdef MAC
error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
if (error == 0)
#endif
error = VOP_SETATTR(vp, &vattr, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Change flags of a file given a path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct chflags_args {
char *path;
int flags;
};
#endif
int
-chflags(td, uap)
+sys_chflags(td, uap)
struct thread *td;
register struct chflags_args /* {
char *path;
int flags;
} */ *uap;
{
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_FFLAGS(uap->flags);
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
vfslocked = NDHASGIANT(&nd);
error = setfflags(td, nd.ni_vp, uap->flags);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Same as chflags() but doesn't follow symlinks.
*/
int
-lchflags(td, uap)
+sys_lchflags(td, uap)
struct thread *td;
register struct lchflags_args /* {
char *path;
int flags;
} */ *uap;
{
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_FFLAGS(uap->flags);
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfflags(td, nd.ni_vp, uap->flags);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Change flags of a file given a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchflags_args {
int fd;
int flags;
};
#endif
int
-fchflags(td, uap)
+sys_fchflags(td, uap)
struct thread *td;
register struct fchflags_args /* {
int fd;
int flags;
} */ *uap;
{
struct file *fp;
int vfslocked;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_FFLAGS(uap->flags);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
&fp)) != 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
#ifdef AUDIT
vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(fp->f_vnode);
VOP_UNLOCK(fp->f_vnode, 0);
#endif
error = setfflags(td, fp->f_vnode, uap->flags);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
/*
* Common implementation code for chmod(), lchmod() and fchmod().
*/
int
setfmode(td, cred, vp, mode)
struct thread *td;
struct ucred *cred;
struct vnode *vp;
int mode;
{
int error;
struct mount *mp;
struct vattr vattr;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VATTR_NULL(&vattr);
vattr.va_mode = mode & ALLPERMS;
#ifdef MAC
error = mac_vnode_check_setmode(cred, vp, vattr.va_mode);
if (error == 0)
#endif
error = VOP_SETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Change mode of a file given path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct chmod_args {
char *path;
int mode;
};
#endif
int
-chmod(td, uap)
+sys_chmod(td, uap)
struct thread *td;
register struct chmod_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_chmod(td, uap->path, UIO_USERSPACE, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct fchmodat_args {
int dirfd;
char *path;
mode_t mode;
int flag;
}
#endif
int
-fchmodat(struct thread *td, struct fchmodat_args *uap)
+sys_fchmodat(struct thread *td, struct fchmodat_args *uap)
{
int flag = uap->flag;
int fd = uap->fd;
char *path = uap->path;
mode_t mode = uap->mode;
if (flag & ~AT_SYMLINK_NOFOLLOW)
return (EINVAL);
return (kern_fchmodat(td, fd, path, UIO_USERSPACE, mode, flag));
}
int
kern_chmod(struct thread *td, char *path, enum uio_seg pathseg, int mode)
{
return (kern_fchmodat(td, AT_FDCWD, path, pathseg, mode, 0));
}
/*
* Change mode of a file given path name (don't follow links.)
*/
#ifndef _SYS_SYSPROTO_H_
struct lchmod_args {
char *path;
int mode;
};
#endif
int
-lchmod(td, uap)
+sys_lchmod(td, uap)
struct thread *td;
register struct lchmod_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_fchmodat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
uap->mode, AT_SYMLINK_NOFOLLOW));
}
int
kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
mode_t mode, int flag)
{
int error;
struct nameidata nd;
int vfslocked;
int follow;
AUDIT_ARG_MODE(mode);
follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
path, fd, CAP_FCHMOD, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Change mode of a file given a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchmod_args {
int fd;
int mode;
};
#endif
int
-fchmod(struct thread *td, struct fchmod_args *uap)
+sys_fchmod(struct thread *td, struct fchmod_args *uap)
{
struct file *fp;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_MODE(uap->mode);
error = fget(td, uap->fd, CAP_FCHMOD, &fp);
if (error != 0)
return (error);
error = fo_chmod(fp, uap->mode, td->td_ucred, td);
fdrop(fp, td);
return (error);
}
/*
* Common implementation for chown(), lchown(), and fchown()
*/
int
setfown(td, cred, vp, uid, gid)
struct thread *td;
struct ucred *cred;
struct vnode *vp;
uid_t uid;
gid_t gid;
{
int error;
struct mount *mp;
struct vattr vattr;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VATTR_NULL(&vattr);
vattr.va_uid = uid;
vattr.va_gid = gid;
#ifdef MAC
error = mac_vnode_check_setowner(cred, vp, vattr.va_uid,
vattr.va_gid);
if (error == 0)
#endif
error = VOP_SETATTR(vp, &vattr, cred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Set ownership given a path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct chown_args {
char *path;
int uid;
int gid;
};
#endif
int
-chown(td, uap)
+sys_chown(td, uap)
struct thread *td;
register struct chown_args /* {
char *path;
int uid;
int gid;
} */ *uap;
{
return (kern_chown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
}
#ifndef _SYS_SYSPROTO_H_
struct fchownat_args {
int fd;
const char * path;
uid_t uid;
gid_t gid;
int flag;
};
#endif
int
-fchownat(struct thread *td, struct fchownat_args *uap)
+sys_fchownat(struct thread *td, struct fchownat_args *uap)
{
int flag;
flag = uap->flag;
if (flag & ~AT_SYMLINK_NOFOLLOW)
return (EINVAL);
return (kern_fchownat(td, uap->fd, uap->path, UIO_USERSPACE, uap->uid,
uap->gid, uap->flag));
}
int
kern_chown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
int gid)
{
return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid, 0));
}
int
kern_fchownat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
int uid, int gid, int flag)
{
struct nameidata nd;
int error, vfslocked, follow;
AUDIT_ARG_OWNER(uid, gid);
follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
path, fd, CAP_FCHOWN, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Set ownership given a path name, do not cross symlinks.
*/
#ifndef _SYS_SYSPROTO_H_
struct lchown_args {
char *path;
int uid;
int gid;
};
#endif
int
-lchown(td, uap)
+sys_lchown(td, uap)
struct thread *td;
register struct lchown_args /* {
char *path;
int uid;
int gid;
} */ *uap;
{
return (kern_lchown(td, uap->path, UIO_USERSPACE, uap->uid, uap->gid));
}
int
kern_lchown(struct thread *td, char *path, enum uio_seg pathseg, int uid,
int gid)
{
return (kern_fchownat(td, AT_FDCWD, path, pathseg, uid, gid,
AT_SYMLINK_NOFOLLOW));
}
/*
* Set ownership given a file descriptor.
*/
#ifndef _SYS_SYSPROTO_H_
struct fchown_args {
int fd;
int uid;
int gid;
};
#endif
int
-fchown(td, uap)
+sys_fchown(td, uap)
struct thread *td;
register struct fchown_args /* {
int fd;
int uid;
int gid;
} */ *uap;
{
struct file *fp;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_OWNER(uap->uid, uap->gid);
error = fget(td, uap->fd, CAP_FCHOWN, &fp);
if (error != 0)
return (error);
error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
fdrop(fp, td);
return (error);
}
/*
* Common implementation code for utimes(), lutimes(), and futimes().
*/
static int
getutimes(usrtvp, tvpseg, tsp)
const struct timeval *usrtvp;
enum uio_seg tvpseg;
struct timespec *tsp;
{
struct timeval tv[2];
const struct timeval *tvp;
int error;
if (usrtvp == NULL) {
vfs_timestamp(&tsp[0]);
tsp[1] = tsp[0];
} else {
if (tvpseg == UIO_SYSSPACE) {
tvp = usrtvp;
} else {
if ((error = copyin(usrtvp, tv, sizeof(tv))) != 0)
return (error);
tvp = tv;
}
if (tvp[0].tv_usec < 0 || tvp[0].tv_usec >= 1000000 ||
tvp[1].tv_usec < 0 || tvp[1].tv_usec >= 1000000)
return (EINVAL);
TIMEVAL_TO_TIMESPEC(&tvp[0], &tsp[0]);
TIMEVAL_TO_TIMESPEC(&tvp[1], &tsp[1]);
}
return (0);
}
/*
* Common implementation code for utimes(), lutimes(), and futimes().
*/
static int
setutimes(td, vp, ts, numtimes, nullflag)
struct thread *td;
struct vnode *vp;
const struct timespec *ts;
int numtimes;
int nullflag;
{
int error, setbirthtime;
struct mount *mp;
struct vattr vattr;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
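/*
* If only atime/mtime were supplied (numtimes < 3) and the new
* modification time predates the file's current birthtime, pull the
* birthtime back as well so it never ends up later than the mtime.
*/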
setbirthtime = 0;
if (numtimes < 3 && !VOP_GETATTR(vp, &vattr, td->td_ucred) &&
timespeccmp(&ts[1], &vattr.va_birthtime, < ))
setbirthtime = 1;
VATTR_NULL(&vattr);
vattr.va_atime = ts[0];
vattr.va_mtime = ts[1];
if (setbirthtime)
vattr.va_birthtime = ts[1];
if (numtimes > 2)
vattr.va_birthtime = ts[2];
if (nullflag)
vattr.va_vaflags |= VA_UTIMES_NULL;
#ifdef MAC
error = mac_vnode_check_setutimes(td->td_ucred, vp, vattr.va_atime,
vattr.va_mtime);
#endif
if (error == 0)
error = VOP_SETATTR(vp, &vattr, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
return (error);
}
/*
* Set the access and modification times of a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct utimes_args {
char *path;
struct timeval *tptr;
};
#endif
int
-utimes(td, uap)
+sys_utimes(td, uap)
struct thread *td;
register struct utimes_args /* {
char *path;
struct timeval *tptr;
} */ *uap;
{
return (kern_utimes(td, uap->path, UIO_USERSPACE, uap->tptr,
UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct futimesat_args {
int fd;
const char * path;
const struct timeval * times;
};
#endif
int
-futimesat(struct thread *td, struct futimesat_args *uap)
+sys_futimesat(struct thread *td, struct futimesat_args *uap)
{
return (kern_utimesat(td, uap->fd, uap->path, UIO_USERSPACE,
uap->times, UIO_USERSPACE));
}
int
kern_utimes(struct thread *td, char *path, enum uio_seg pathseg,
struct timeval *tptr, enum uio_seg tptrseg)
{
return (kern_utimesat(td, AT_FDCWD, path, pathseg, tptr, tptrseg));
}
int
kern_utimesat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
struct timeval *tptr, enum uio_seg tptrseg)
{
struct nameidata nd;
struct timespec ts[2];
int error, vfslocked;
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
path, fd, CAP_FUTIMES, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Set the access and modification times of a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct lutimes_args {
char *path;
struct timeval *tptr;
};
#endif
int
-lutimes(td, uap)
+sys_lutimes(td, uap)
struct thread *td;
register struct lutimes_args /* {
char *path;
struct timeval *tptr;
} */ *uap;
{
return (kern_lutimes(td, uap->path, UIO_USERSPACE, uap->tptr,
UIO_USERSPACE));
}
int
kern_lutimes(struct thread *td, char *path, enum uio_seg pathseg,
struct timeval *tptr, enum uio_seg tptrseg)
{
struct timespec ts[2];
int error;
struct nameidata nd;
int vfslocked;
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
vrele(nd.ni_vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Set the access and modification times of a file.
*/
#ifndef _SYS_SYSPROTO_H_
struct futimes_args {
int fd;
struct timeval *tptr;
};
#endif
int
-futimes(td, uap)
+sys_futimes(td, uap)
struct thread *td;
register struct futimes_args /* {
int fd;
struct timeval *tptr;
} */ *uap;
{
return (kern_futimes(td, uap->fd, uap->tptr, UIO_USERSPACE));
}
int
kern_futimes(struct thread *td, int fd, struct timeval *tptr,
enum uio_seg tptrseg)
{
struct timespec ts[2];
struct file *fp;
int vfslocked;
int error;
AUDIT_ARG_FD(fd);
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
!= 0)
return (error);
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
#ifdef AUDIT
vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(fp->f_vnode);
VOP_UNLOCK(fp->f_vnode, 0);
#endif
error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
/*
* Truncate a file given its path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct truncate_args {
char *path;
int pad;
off_t length;
};
#endif
int
-truncate(td, uap)
+sys_truncate(td, uap)
struct thread *td;
register struct truncate_args /* {
char *path;
int pad;
off_t length;
} */ *uap;
{
return (kern_truncate(td, uap->path, UIO_USERSPACE, uap->length));
}
int
kern_truncate(struct thread *td, char *path, enum uio_seg pathseg, off_t length)
{
struct mount *mp;
struct vnode *vp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
if (length < 0)
return(EINVAL);
NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_type == VDIR)
error = EISDIR;
#ifdef MAC
else if ((error = mac_vnode_check_write(td->td_ucred, NOCRED, vp))) {
}
#endif
else if ((error = vn_writechk(vp)) == 0 &&
(error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td)) == 0) {
VATTR_NULL(&vattr);
vattr.va_size = length;
error = VOP_SETATTR(vp, &vattr, td->td_ucred);
}
vput(vp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#if defined(COMPAT_43)
/*
* Truncate a file given its path name.
*/
#ifndef _SYS_SYSPROTO_H_
struct otruncate_args {
char *path;
long length;
};
#endif
int
otruncate(td, uap)
struct thread *td;
register struct otruncate_args /* {
char *path;
long length;
} */ *uap;
{
struct truncate_args /* {
char *path;
int pad;
off_t length;
} */ nuap;
nuap.path = uap->path;
nuap.length = uap->length;
- return (truncate(td, &nuap));
+ return (sys_truncate(td, &nuap));
}
#endif /* COMPAT_43 */
/* Versions with the pad argument */
int
freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
{
struct truncate_args ouap;
ouap.path = uap->path;
ouap.length = uap->length;
- return (truncate(td, &ouap));
+ return (sys_truncate(td, &ouap));
}
int
freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
{
struct ftruncate_args ouap;
ouap.fd = uap->fd;
ouap.length = uap->length;
- return (ftruncate(td, &ouap));
+ return (sys_ftruncate(td, &ouap));
}
/*
* Sync an open file.
*/
#ifndef _SYS_SYSPROTO_H_
struct fsync_args {
int fd;
};
#endif
int
-fsync(td, uap)
+sys_fsync(td, uap)
struct thread *td;
struct fsync_args /* {
int fd;
} */ *uap;
{
struct vnode *vp;
struct mount *mp;
struct file *fp;
int vfslocked;
int error, lock_flags;
AUDIT_ARG_FD(uap->fd);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
&fp)) != 0)
return (error);
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
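/*
* Filesystems that advertise support for concurrent writes
* (MNT_SHARED_WRITES) can be fsync'ed under a shared vnode lock;
* everything else needs the exclusive lock.
*/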
if (MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
lock_flags = LK_SHARED;
} else {
lock_flags = LK_EXCLUSIVE;
}
vn_lock(vp, lock_flags | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
if (vp->v_object != NULL) {
VM_OBJECT_LOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
VM_OBJECT_UNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
/*
* Rename files. Source and destination must either both be directories, or
* both not be directories. If target is a directory, it must be empty.
*/
#ifndef _SYS_SYSPROTO_H_
struct rename_args {
char *from;
char *to;
};
#endif
int
-rename(td, uap)
+sys_rename(td, uap)
struct thread *td;
register struct rename_args /* {
char *from;
char *to;
} */ *uap;
{
return (kern_rename(td, uap->from, uap->to, UIO_USERSPACE));
}
#ifndef _SYS_SYSPROTO_H_
struct renameat_args {
int oldfd;
char *old;
int newfd;
char *new;
};
#endif
int
-renameat(struct thread *td, struct renameat_args *uap)
+sys_renameat(struct thread *td, struct renameat_args *uap)
{
return (kern_renameat(td, uap->oldfd, uap->old, uap->newfd, uap->new,
UIO_USERSPACE));
}
int
kern_rename(struct thread *td, char *from, char *to, enum uio_seg pathseg)
{
return (kern_renameat(td, AT_FDCWD, from, AT_FDCWD, to, pathseg));
}
int
kern_renameat(struct thread *td, int oldfd, char *old, int newfd, char *new,
enum uio_seg pathseg)
{
struct mount *mp = NULL;
struct vnode *tvp, *fvp, *tdvp;
struct nameidata fromnd, tond;
int tvfslocked;
int fvfslocked;
int error;
bwillwrite();
#ifdef MAC
NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#else
NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
#endif
if ((error = namei(&fromnd)) != 0)
return (error);
fvfslocked = NDHASGIANT(&fromnd);
tvfslocked = 0;
#ifdef MAC
error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
fromnd.ni_vp, &fromnd.ni_cnd);
VOP_UNLOCK(fromnd.ni_dvp, 0);
if (fromnd.ni_dvp != fromnd.ni_vp)
VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
fvp = fromnd.ni_vp;
if (error == 0)
error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
if (error != 0) {
NDFREE(&fromnd, NDF_ONLY_PNBUF);
vrele(fromnd.ni_dvp);
vrele(fvp);
goto out1;
}
NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
td);
if (fromnd.ni_vp->v_type == VDIR)
tond.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&tond)) != 0) {
/* Translate error code for rename("dir1", "dir2/."). */
if (error == EISDIR && fvp->v_type == VDIR)
error = EINVAL;
NDFREE(&fromnd, NDF_ONLY_PNBUF);
vrele(fromnd.ni_dvp);
vrele(fvp);
vn_finished_write(mp);
goto out1;
}
tvfslocked = NDHASGIANT(&tond);
tdvp = tond.ni_dvp;
tvp = tond.ni_vp;
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
error = ENOTDIR;
goto out;
} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
error = EISDIR;
goto out;
}
}
if (fvp == tdvp) {
error = EINVAL;
goto out;
}
/*
* If the source is the same as the destination (that is, if they
* are links to the same vnode), then there is nothing to do.
*/
if (fvp == tvp)
error = -1;
#ifdef MAC
else
error = mac_vnode_check_rename_to(td->td_ucred, tdvp,
tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
if (!error) {
error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
NDFREE(&fromnd, NDF_ONLY_PNBUF);
NDFREE(&tond, NDF_ONLY_PNBUF);
} else {
NDFREE(&fromnd, NDF_ONLY_PNBUF);
NDFREE(&tond, NDF_ONLY_PNBUF);
if (tvp)
vput(tvp);
if (tdvp == tvp)
vrele(tdvp);
else
vput(tdvp);
vrele(fromnd.ni_dvp);
vrele(fvp);
}
vrele(tond.ni_startdir);
vn_finished_write(mp);
out1:
if (fromnd.ni_startdir)
vrele(fromnd.ni_startdir);
VFS_UNLOCK_GIANT(fvfslocked);
VFS_UNLOCK_GIANT(tvfslocked);
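/*
* error == -1 is the internal marker set above for the case where
* source and target are links to the same vnode; report it as success.
*/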
if (error == -1)
return (0);
return (error);
}
/*
* Make a directory file.
*/
#ifndef _SYS_SYSPROTO_H_
struct mkdir_args {
char *path;
int mode;
};
#endif
int
-mkdir(td, uap)
+sys_mkdir(td, uap)
struct thread *td;
register struct mkdir_args /* {
char *path;
int mode;
} */ *uap;
{
return (kern_mkdir(td, uap->path, UIO_USERSPACE, uap->mode));
}
#ifndef _SYS_SYSPROTO_H_
struct mkdirat_args {
int fd;
char *path;
mode_t mode;
};
#endif
int
-mkdirat(struct thread *td, struct mkdirat_args *uap)
+sys_mkdirat(struct thread *td, struct mkdirat_args *uap)
{
return (kern_mkdirat(td, uap->fd, uap->path, UIO_USERSPACE, uap->mode));
}
int
kern_mkdir(struct thread *td, char *path, enum uio_seg segflg, int mode)
{
return (kern_mkdirat(td, AT_FDCWD, path, segflg, mode));
}
int
kern_mkdirat(struct thread *td, int fd, char *path, enum uio_seg segflg,
int mode)
{
struct mount *mp;
struct vnode *vp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
AUDIT_ARG_MODE(mode);
restart:
bwillwrite();
NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
nd.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
/*
* XXX namei called with LOCKPARENT but not LOCKLEAF has
* the strange behaviour of leaving the vnode unlocked
* if the target is the same vnode as the parent.
*/
if (vp == nd.ni_dvp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
VATTR_NULL(&vattr);
vattr.va_type = VDIR;
vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
#ifdef MAC
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
if (error)
goto out;
#endif
error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
#ifdef MAC
out:
#endif
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (!error)
vput(nd.ni_vp);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Remove a directory file.
*/
#ifndef _SYS_SYSPROTO_H_
struct rmdir_args {
char *path;
};
#endif
int
-rmdir(td, uap)
+sys_rmdir(td, uap)
struct thread *td;
struct rmdir_args /* {
char *path;
} */ *uap;
{
return (kern_rmdir(td, uap->path, UIO_USERSPACE));
}
int
kern_rmdir(struct thread *td, char *path, enum uio_seg pathseg)
{
return (kern_rmdirat(td, AT_FDCWD, path, pathseg));
}
int
kern_rmdirat(struct thread *td, int fd, char *path, enum uio_seg pathseg)
{
struct mount *mp;
struct vnode *vp;
int error;
struct nameidata nd;
int vfslocked;
restart:
bwillwrite();
NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp->v_type != VDIR) {
error = ENOTDIR;
goto out;
}
/*
* No rmdir "." please.
*/
if (nd.ni_dvp == vp) {
error = EINVAL;
goto out;
}
/*
* The root of a mounted filesystem cannot be deleted.
*/
if (vp->v_vflag & VV_ROOT) {
error = EBUSY;
goto out;
}
#ifdef MAC
error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
if (error)
goto out;
#endif
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
if (nd.ni_dvp == vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
vn_finished_write(mp);
out:
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
if (nd.ni_dvp == vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#ifdef COMPAT_43
/*
* Read a block of directory entries in a filesystem independent format.
*/
#ifndef _SYS_SYSPROTO_H_
struct ogetdirentries_args {
int fd;
char *buf;
u_int count;
long *basep;
};
#endif
int
ogetdirentries(struct thread *td, struct ogetdirentries_args *uap)
{
long loff;
int error;
error = kern_ogetdirentries(td, uap, &loff);
if (error == 0)
error = copyout(&loff, uap->basep, sizeof(long));
return (error);
}
int
kern_ogetdirentries(struct thread *td, struct ogetdirentries_args *uap,
long *ploff)
{
struct vnode *vp;
struct file *fp;
struct uio auio, kuio;
struct iovec aiov, kiov;
struct dirent *dp, *edp;
caddr_t dirbuf;
int error, eofflag, readcnt, vfslocked;
long loff;
/* XXX arbitrary sanity limit on `count'. */
if (uap->count > 64 * 1024)
return (EINVAL);
if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
&fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
unionread:
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (EINVAL);
}
aiov.iov_base = uap->buf;
aiov.iov_len = uap->count;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auio.uio_resid = uap->count;
vn_lock(vp, LK_SHARED | LK_RETRY);
loff = auio.uio_offset = fp->f_offset;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
#endif
# if (BYTE_ORDER != LITTLE_ENDIAN)
if (vp->v_mount->mnt_maxsymlinklen <= 0) {
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
NULL, NULL);
fp->f_offset = auio.uio_offset;
} else
# endif
{
kuio = auio;
kuio.uio_iov = &kiov;
kuio.uio_segflg = UIO_SYSSPACE;
kiov.iov_len = uap->count;
dirbuf = malloc(uap->count, M_TEMP, M_WAITOK);
kiov.iov_base = dirbuf;
error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
NULL, NULL);
fp->f_offset = kuio.uio_offset;
if (error == 0) {
readcnt = uap->count - kuio.uio_resid;
edp = (struct dirent *)&dirbuf[readcnt];
for (dp = (struct dirent *)dirbuf; dp < edp; ) {
# if (BYTE_ORDER == LITTLE_ENDIAN)
/*
* The expected low byte of
* dp->d_namlen is our dp->d_type.
* The high MBZ byte of dp->d_namlen
* is our dp->d_namlen.
*/
dp->d_type = dp->d_namlen;
dp->d_namlen = 0;
# else
/*
* The dp->d_type is the high byte
* of the expected dp->d_namlen,
* so must be zero'ed.
*/
dp->d_type = 0;
# endif
if (dp->d_reclen > 0) {
dp = (struct dirent *)
((char *)dp + dp->d_reclen);
} else {
error = EIO;
break;
}
}
if (dp >= edp)
error = uiomove(dirbuf, readcnt, &auio);
}
free(dirbuf, M_TEMP);
}
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
if (uap->count == auio.uio_resid &&
(vp->v_vflag & VV_ROOT) &&
(vp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = vp;
vp = vp->v_mount->mnt_vnodecovered;
VREF(vp);
fp->f_vnode = vp;
fp->f_data = vp;
fp->f_offset = 0;
vput(tvp);
VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
td->td_retval[0] = uap->count - auio.uio_resid;
if (error == 0)
*ploff = loff;
return (error);
}
#endif /* COMPAT_43 */
/*
* Read a block of directory entries in a filesystem independent format.
*/
#ifndef _SYS_SYSPROTO_H_
struct getdirentries_args {
int fd;
char *buf;
u_int count;
long *basep;
};
#endif
int
-getdirentries(td, uap)
+sys_getdirentries(td, uap)
struct thread *td;
register struct getdirentries_args /* {
int fd;
char *buf;
u_int count;
long *basep;
} */ *uap;
{
long base;
int error;
error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
if (error)
return (error);
if (uap->basep != NULL)
error = copyout(&base, uap->basep, sizeof(long));
return (error);
}
int
kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
long *basep)
{
struct vnode *vp;
struct file *fp;
struct uio auio;
struct iovec aiov;
int vfslocked;
long loff;
int error, eofflag;
AUDIT_ARG_FD(fd);
if (count > INT_MAX)
return (EINVAL);
if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
&fp)) != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
return (EBADF);
}
vp = fp->f_vnode;
unionread:
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
VFS_UNLOCK_GIANT(vfslocked);
error = EINVAL;
goto fail;
}
aiov.iov_base = buf;
aiov.iov_len = count;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_td = td;
auio.uio_resid = count;
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
loff = auio.uio_offset = fp->f_offset;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
if (error == 0)
#endif
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
NULL);
fp->f_offset = auio.uio_offset;
if (error) {
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
goto fail;
}
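/*
* If nothing was transferred and this is the root of a union mount,
* retry the read against the vnode covered by the mount so entries
* from the lower layer show through.
*/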
if (count == auio.uio_resid &&
(vp->v_vflag & VV_ROOT) &&
(vp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = vp;
vp = vp->v_mount->mnt_vnodecovered;
VREF(vp);
fp->f_vnode = vp;
fp->f_data = vp;
fp->f_offset = 0;
vput(tvp);
VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
}
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
*basep = loff;
td->td_retval[0] = count - auio.uio_resid;
fail:
fdrop(fp, td);
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct getdents_args {
int fd;
char *buf;
size_t count;
};
#endif
int
-getdents(td, uap)
+sys_getdents(td, uap)
struct thread *td;
register struct getdents_args /* {
int fd;
char *buf;
u_int count;
} */ *uap;
{
struct getdirentries_args ap;
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.count = uap->count;
ap.basep = NULL;
- return (getdirentries(td, &ap));
+ return (sys_getdirentries(td, &ap));
}
/*
* Set the mode mask for creation of filesystem nodes.
*/
#ifndef _SYS_SYSPROTO_H_
struct umask_args {
int newmask;
};
#endif
int
-umask(td, uap)
+sys_umask(td, uap)
struct thread *td;
struct umask_args /* {
int newmask;
} */ *uap;
{
register struct filedesc *fdp;
FILEDESC_XLOCK(td->td_proc->p_fd);
fdp = td->td_proc->p_fd;
td->td_retval[0] = fdp->fd_cmask;
fdp->fd_cmask = uap->newmask & ALLPERMS;
FILEDESC_XUNLOCK(td->td_proc->p_fd);
return (0);
}
/*
* Void all references to file by ripping underlying filesystem away from
* vnode.
*/
#ifndef _SYS_SYSPROTO_H_
struct revoke_args {
char *path;
};
#endif
int
-revoke(td, uap)
+sys_revoke(td, uap)
struct thread *td;
register struct revoke_args /* {
char *path;
} */ *uap;
{
struct vnode *vp;
struct vattr vattr;
int error;
struct nameidata nd;
int vfslocked;
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (vp->v_type != VCHR || vp->v_rdev == NULL) {
error = EINVAL;
goto out;
}
#ifdef MAC
error = mac_vnode_check_revoke(td->td_ucred, vp);
if (error)
goto out;
#endif
error = VOP_GETATTR(vp, &vattr, td->td_ucred);
if (error)
goto out;
if (td->td_ucred->cr_uid != vattr.va_uid) {
error = priv_check(td, PRIV_VFS_ADMIN);
if (error)
goto out;
}
if (vcount(vp) > 1)
VOP_REVOKE(vp, REVOKEALL);
out:
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Convert a user file descriptor to a kernel file entry and check that, if it
* is a capability, the correct rights are present. A reference on the file
* entry is held upon returning.
*/
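/*
* The reference is not released here; callers must fdrop() the file
* once they are done with its vnode.
*/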
int
getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
struct file **fpp)
{
struct file *fp;
#ifdef CAPABILITIES
struct file *fp_fromcap;
#endif
int error;
error = 0;
fp = NULL;
if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
return (EBADF);
#ifdef CAPABILITIES
/*
* If the file descriptor is for a capability, test rights and use the
* file descriptor referenced by the capability.
*/
error = cap_funwrap(fp, rights, &fp_fromcap);
if (error) {
fdrop(fp, curthread);
return (error);
}
if (fp != fp_fromcap) {
fhold(fp_fromcap);
fdrop(fp, curthread);
fp = fp_fromcap;
}
#endif /* CAPABILITIES */
if (fp->f_vnode == NULL) {
fdrop(fp, curthread);
return (EINVAL);
}
*fpp = fp;
return (0);
}
/*
* Get an (NFS) file handle.
*/
#ifndef _SYS_SYSPROTO_H_
struct lgetfh_args {
char *fname;
fhandle_t *fhp;
};
#endif
int
-lgetfh(td, uap)
+sys_lgetfh(td, uap)
struct thread *td;
register struct lgetfh_args *uap;
{
struct nameidata nd;
fhandle_t fh;
register struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_GETFH);
if (error)
return (error);
NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->fname, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = copyout(&fh, uap->fhp, sizeof (fh));
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct getfh_args {
char *fname;
fhandle_t *fhp;
};
#endif
int
-getfh(td, uap)
+sys_getfh(td, uap)
struct thread *td;
register struct getfh_args *uap;
{
struct nameidata nd;
fhandle_t fh;
register struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_GETFH);
if (error)
return (error);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->fname, td);
error = namei(&nd);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = copyout(&fh, uap->fhp, sizeof (fh));
return (error);
}
/*
* syscall for the rpc.lockd to use to translate a NFS file handle into an
* open descriptor.
*
* warning: do not remove the priv_check() call or this becomes one giant
* security hole.
*/
#ifndef _SYS_SYSPROTO_H_
struct fhopen_args {
const struct fhandle *u_fhp;
int flags;
};
#endif
int
-fhopen(td, uap)
+sys_fhopen(td, uap)
struct thread *td;
struct fhopen_args /* {
const struct fhandle *u_fhp;
int flags;
} */ *uap;
{
struct proc *p = td->td_proc;
struct mount *mp;
struct vnode *vp;
struct fhandle fhp;
struct vattr vat;
struct vattr *vap = &vat;
struct flock lf;
struct file *fp;
register struct filedesc *fdp = p->p_fd;
int fmode, error, type;
accmode_t accmode;
struct file *nfp;
int vfslocked;
int indx;
error = priv_check(td, PRIV_VFS_FHOPEN);
if (error)
return (error);
fmode = FFLAGS(uap->flags);
/* why not allow a non-read/write open for our lockd? */
if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
return (EINVAL);
error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
if (error)
return(error);
/* find the mount point */
mp = vfs_busyfs(&fhp.fh_fsid);
if (mp == NULL)
return (ESTALE);
vfslocked = VFS_LOCK_GIANT(mp);
/* now give me my vnode, it gets returned to me locked */
error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error)
goto out;
/*
* From now on we have to make sure not to forget about the vnode;
* any error that causes an abort must vput(vp), so just set
* error = err and 'goto bad;'.
*/
/*
* from vn_open
*/
if (vp->v_type == VLNK) {
error = EMLINK;
goto bad;
}
if (vp->v_type == VSOCK) {
error = EOPNOTSUPP;
goto bad;
}
if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
error = ENOTDIR;
goto bad;
}
accmode = 0;
if (fmode & (FWRITE | O_TRUNC)) {
if (vp->v_type == VDIR) {
error = EISDIR;
goto bad;
}
error = vn_writechk(vp);
if (error)
goto bad;
accmode |= VWRITE;
}
if (fmode & FREAD)
accmode |= VREAD;
if ((fmode & O_APPEND) && (fmode & FWRITE))
accmode |= VAPPEND;
#ifdef MAC
error = mac_vnode_check_open(td->td_ucred, vp, accmode);
if (error)
goto bad;
#endif
if (accmode) {
error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
if (error)
goto bad;
}
if (fmode & O_TRUNC) {
vfs_ref(mp);
VOP_UNLOCK(vp, 0); /* XXX */
if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
vrele(vp);
vfs_rel(mp);
goto out;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */
vfs_rel(mp);
#ifdef MAC
/*
* We don't yet have fp->f_cred, so use td->td_ucred, which
* should be right.
*/
error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
if (error == 0) {
#endif
VATTR_NULL(vap);
vap->va_size = 0;
error = VOP_SETATTR(vp, vap, td->td_ucred);
#ifdef MAC
}
#endif
vn_finished_write(mp);
if (error)
goto bad;
}
error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
if (error)
goto bad;
if (fmode & FWRITE)
vp->v_writecount++;
/*
* end of vn_open code
*/
if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
if (fmode & FWRITE)
vp->v_writecount--;
goto bad;
}
/* An extra reference on `nfp' has been held for us by falloc(). */
fp = nfp;
nfp->f_vnode = vp;
finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
if (fmode & (O_EXLOCK | O_SHLOCK)) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
if (fmode & O_EXLOCK)
lf.l_type = F_WRLCK;
else
lf.l_type = F_RDLCK;
type = F_FLOCK;
if ((fmode & FNONBLOCK) == 0)
type |= F_WAIT;
VOP_UNLOCK(vp, 0);
if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
type)) != 0) {
/*
* The lock request failed. Normally close the
* descriptor but handle the case where someone might
* have dup()d or close()d it when we weren't looking.
*/
fdclose(fdp, fp, indx, td);
/*
* release our private reference
*/
fdrop(fp, td);
goto out;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
atomic_set_int(&fp->f_flag, FHASLOCK);
}
VOP_UNLOCK(vp, 0);
fdrop(fp, td);
VFS_UNLOCK_GIANT(vfslocked);
td->td_retval[0] = indx;
return (0);
bad:
vput(vp);
out:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
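/*
* Rough userland sketch of how the file handle syscalls pair up
* (illustration only; error handling omitted):
*
*	fhandle_t fh;
*	if (getfh("/export/some/file", &fh) == 0) {
*		int fd = fhopen(&fh, O_RDWR);
*		...
*	}
*
* Only privileged processes such as rpc.lockd can do this, since both
* calls are guarded by priv_check() above.
*/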
/*
* Stat an (NFS) file handle.
*/
#ifndef _SYS_SYSPROTO_H_
struct fhstat_args {
struct fhandle *u_fhp;
struct stat *sb;
};
#endif
int
-fhstat(td, uap)
+sys_fhstat(td, uap)
struct thread *td;
register struct fhstat_args /* {
struct fhandle *u_fhp;
struct stat *sb;
} */ *uap;
{
struct stat sb;
fhandle_t fh;
struct mount *mp;
struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_FHSTAT);
if (error)
return (error);
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
if (error)
return (error);
if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
vfslocked = VFS_LOCK_GIANT(mp);
error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
if (error)
return (error);
error = copyout(&sb, uap->sb, sizeof(sb));
return (error);
}
/*
* Implement fstatfs() for (NFS) file handles.
*/
#ifndef _SYS_SYSPROTO_H_
struct fhstatfs_args {
struct fhandle *u_fhp;
struct statfs *buf;
};
#endif
int
-fhstatfs(td, uap)
+sys_fhstatfs(td, uap)
struct thread *td;
struct fhstatfs_args /* {
struct fhandle *u_fhp;
struct statfs *buf;
} */ *uap;
{
struct statfs sf;
fhandle_t fh;
int error;
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
if (error)
return (error);
error = kern_fhstatfs(td, fh, &sf);
if (error)
return (error);
return (copyout(&sf, uap->buf, sizeof(sf)));
}
int
kern_fhstatfs(struct thread *td, fhandle_t fh, struct statfs *buf)
{
struct statfs *sp;
struct mount *mp;
struct vnode *vp;
int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_FHSTATFS);
if (error)
return (error);
if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
vfslocked = VFS_LOCK_GIANT(mp);
error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
if (error) {
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
vput(vp);
error = prison_canseemount(td->td_ucred, mp);
if (error)
goto out;
#ifdef MAC
error = mac_mount_check_stat(td->td_ucred, mp);
if (error)
goto out;
#endif
/*
* Set these in case the underlying filesystem fails to do so.
*/
sp = &mp->mnt_stat;
sp->f_version = STATFS_VERSION;
sp->f_namemax = NAME_MAX;
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
error = VFS_STATFS(mp, sp);
if (error == 0)
*buf = *sp;
out:
vfs_unbusy(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
static int
kern_posix_fallocate(struct thread *td, int fd, off_t offset, off_t len)
{
struct file *fp;
struct mount *mp;
struct vnode *vp;
off_t olen, ooffset;
int error, vfslocked;
fp = NULL;
vfslocked = 0;
error = fget(td, fd, CAP_WRITE, &fp);
if (error != 0)
goto out;
switch (fp->f_type) {
case DTYPE_VNODE:
break;
case DTYPE_PIPE:
case DTYPE_FIFO:
error = ESPIPE;
goto out;
default:
error = ENODEV;
goto out;
}
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
goto out;
}
vp = fp->f_vnode;
if (vp->v_type != VREG) {
error = ENODEV;
goto out;
}
if (offset < 0 || len <= 0) {
error = EINVAL;
goto out;
}
/* Check for wrap. */
if (offset > OFF_MAX - len) {
error = EFBIG;
goto out;
}
/* Allocating blocks may take a long time, so iterate. */
for (;;) {
olen = len;
ooffset = offset;
bwillwrite();
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
mp = NULL;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0) {
VFS_UNLOCK_GIANT(vfslocked);
break;
}
error = vn_lock(vp, LK_EXCLUSIVE);
if (error != 0) {
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
break;
}
#ifdef MAC
error = mac_vnode_check_write(td->td_ucred, fp->f_cred, vp);
if (error == 0)
#endif
error = VOP_ALLOCATE(vp, &offset, &len);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
if (olen + ooffset != offset + len) {
panic("offset + len changed from %jx/%jx to %jx/%jx",
ooffset, olen, offset, len);
}
if (error != 0 || len == 0)
break;
KASSERT(olen > len, ("Iteration did not make progress?"));
maybe_yield();
}
out:
if (fp != NULL)
fdrop(fp, td);
return (error);
}
int
-posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
+sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
{
return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
}
Index: head/sys/kern/vfs_vnops.c
===================================================================
--- head/sys/kern/vfs_vnops.c (revision 225616)
+++ head/sys/kern/vfs_vnops.c (revision 225617)
@@ -1,1415 +1,1415 @@
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/filio.h>
#include <sys/resourcevar.h>
#include <sys/sx.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
static fo_rdwr_t vn_read;
static fo_rdwr_t vn_write;
static fo_truncate_t vn_truncate;
static fo_ioctl_t vn_ioctl;
static fo_poll_t vn_poll;
static fo_kqfilter_t vn_kqfilter;
static fo_stat_t vn_statfile;
static fo_close_t vn_closefile;
struct fileops vnops = {
.fo_read = vn_read,
.fo_write = vn_write,
.fo_truncate = vn_truncate,
.fo_ioctl = vn_ioctl,
.fo_poll = vn_poll,
.fo_kqfilter = vn_kqfilter,
.fo_stat = vn_statfile,
.fo_close = vn_closefile,
.fo_chmod = vn_chmod,
.fo_chown = vn_chown,
.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
int
vn_open(ndp, flagp, cmode, fp)
struct nameidata *ndp;
int *flagp, cmode;
struct file *fp;
{
struct thread *td = ndp->ni_cnd.cn_thread;
return (vn_open_cred(ndp, flagp, cmode, 0, td->td_ucred, fp));
}
/*
* Common code for vnode open operations.
* Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
*
* Note that this does NOT free nameidata for the successful case,
* due to the NDINIT being done elsewhere.
*/
int
vn_open_cred(struct nameidata *ndp, int *flagp, int cmode, u_int vn_open_flags,
struct ucred *cred, struct file *fp)
{
struct vnode *vp;
struct mount *mp;
struct thread *td = ndp->ni_cnd.cn_thread;
struct vattr vat;
struct vattr *vap = &vat;
int fmode, error;
accmode_t accmode;
int vfslocked, mpsafe;
mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
restart:
vfslocked = 0;
fmode = *flagp;
if (fmode & O_CREAT) {
ndp->ni_cnd.cn_nameiop = CREATE;
ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
MPSAFE;
if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
ndp->ni_cnd.cn_flags |= FOLLOW;
if (!(vn_open_flags & VN_OPEN_NOAUDIT))
ndp->ni_cnd.cn_flags |= AUDITVNODE1;
bwillwrite();
if ((error = namei(ndp)) != 0)
return (error);
vfslocked = NDHASGIANT(ndp);
if (!mpsafe)
ndp->ni_cnd.cn_flags &= ~MPSAFE;
if (ndp->ni_vp == NULL) {
VATTR_NULL(vap);
vap->va_type = VREG;
vap->va_mode = cmode;
if (fmode & O_EXCL)
vap->va_vaflags |= VA_EXCLUSIVE;
if (vn_start_write(ndp->ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(ndp->ni_dvp);
VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp,
V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
}
#ifdef MAC
error = mac_vnode_check_create(cred, ndp->ni_dvp,
&ndp->ni_cnd, vap);
if (error == 0)
#endif
error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
&ndp->ni_cnd, vap);
vput(ndp->ni_dvp);
vn_finished_write(mp);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
NDFREE(ndp, NDF_ONLY_PNBUF);
return (error);
}
fmode &= ~O_TRUNC;
vp = ndp->ni_vp;
} else {
if (ndp->ni_dvp == ndp->ni_vp)
vrele(ndp->ni_dvp);
else
vput(ndp->ni_dvp);
ndp->ni_dvp = NULL;
vp = ndp->ni_vp;
if (fmode & O_EXCL) {
error = EEXIST;
goto bad;
}
fmode &= ~O_CREAT;
}
} else {
ndp->ni_cnd.cn_nameiop = LOOKUP;
ndp->ni_cnd.cn_flags = ISOPEN |
((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
LOCKLEAF | MPSAFE;
if (!(fmode & FWRITE))
ndp->ni_cnd.cn_flags |= LOCKSHARED;
if (!(vn_open_flags & VN_OPEN_NOAUDIT))
ndp->ni_cnd.cn_flags |= AUDITVNODE1;
if ((error = namei(ndp)) != 0)
return (error);
if (!mpsafe)
ndp->ni_cnd.cn_flags &= ~MPSAFE;
vfslocked = NDHASGIANT(ndp);
vp = ndp->ni_vp;
}
if (vp->v_type == VLNK) {
error = EMLINK;
goto bad;
}
if (vp->v_type == VSOCK) {
error = EOPNOTSUPP;
goto bad;
}
if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
error = ENOTDIR;
goto bad;
}
accmode = 0;
if (fmode & (FWRITE | O_TRUNC)) {
if (vp->v_type == VDIR) {
error = EISDIR;
goto bad;
}
accmode |= VWRITE;
}
if (fmode & FREAD)
accmode |= VREAD;
if (fmode & FEXEC)
accmode |= VEXEC;
if ((fmode & O_APPEND) && (fmode & FWRITE))
accmode |= VAPPEND;
#ifdef MAC
error = mac_vnode_check_open(cred, vp, accmode);
if (error)
goto bad;
#endif
if ((fmode & O_CREAT) == 0) {
if (accmode & VWRITE) {
error = vn_writechk(vp);
if (error)
goto bad;
}
if (accmode) {
error = VOP_ACCESS(vp, accmode, cred, td);
if (error)
goto bad;
}
}
if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
goto bad;
if (fmode & FWRITE)
vp->v_writecount++;
*flagp = fmode;
ASSERT_VOP_LOCKED(vp, "vn_open_cred");
if (!mpsafe)
VFS_UNLOCK_GIANT(vfslocked);
return (0);
bad:
NDFREE(ndp, NDF_ONLY_PNBUF);
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
*flagp = fmode;
ndp->ni_vp = NULL;
return (error);
}
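/*
* Typical caller pattern, roughly as used on the open(2) path (audit,
* capability and MPSAFE details omitted):
*
*	NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td);
*	error = vn_open(&nd, &flags, cmode, fp);
*	...
*	NDFREE(&nd, NDF_ONLY_PNBUF);
*
* On success the vnode is returned locked and the nameidata path
* buffer is still owned by the caller, as noted above.
*/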
/*
* Check for write permissions on the specified vnode.
* Prototype text segments cannot be written.
*/
int
vn_writechk(vp)
register struct vnode *vp;
{
ASSERT_VOP_LOCKED(vp, "vn_writechk");
/*
* If there's shared text associated with
* the vnode, try to free it up once. If
* we fail, we can't allow writing.
*/
if (vp->v_vflag & VV_TEXT)
return (ETXTBSY);
return (0);
}
/*
* Vnode close call
*/
int
vn_close(vp, flags, file_cred, td)
register struct vnode *vp;
int flags;
struct ucred *file_cred;
struct thread *td;
{
struct mount *mp;
int error, lock_flags;
if (!(flags & FWRITE) && vp->v_mount != NULL &&
vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
lock_flags = LK_SHARED;
else
lock_flags = LK_EXCLUSIVE;
VFS_ASSERT_GIANT(vp->v_mount);
vn_start_write(vp, &mp, V_WAIT);
vn_lock(vp, lock_flags | LK_RETRY);
if (flags & FWRITE) {
VNASSERT(vp->v_writecount > 0, vp,
("vn_close: negative writecount"));
vp->v_writecount--;
}
error = VOP_CLOSE(vp, flags, file_cred, td);
vput(vp);
vn_finished_write(mp);
return (error);
}
/*
* Heuristic to detect sequential operation.
*/
static int
sequential_heuristic(struct uio *uio, struct file *fp)
{
if (atomic_load_acq_int(&(fp->f_flag)) & FRDAHEAD)
return (fp->f_seqcount << IO_SEQSHIFT);
/*
* Offset 0 is handled specially. open() sets f_seqcount to 1 so
* that the first I/O is normally considered to be slightly
* sequential. Seeking to offset 0 doesn't change sequentiality
* unless previous seeks have reduced f_seqcount to 0, in which
* case offset 0 is not special.
*/
if ((uio->uio_offset == 0 && fp->f_seqcount > 0) ||
uio->uio_offset == fp->f_nextoff) {
/*
* f_seqcount is in units of fixed-size blocks so that it
* depends mainly on the amount of sequential I/O and not
* much on the number of sequential I/O's. The fixed size
* of 16384 is hard-coded here since it is (not quite) just
* a magic size that works well here. This size is more
* closely related to the best I/O size for real disks than
* to any block size used by software.
*/
fp->f_seqcount += howmany(uio->uio_resid, 16384);
if (fp->f_seqcount > IO_SEQMAX)
fp->f_seqcount = IO_SEQMAX;
return (fp->f_seqcount << IO_SEQSHIFT);
}
/* Not sequential. Quickly draw-down sequentiality. */
if (fp->f_seqcount > 1)
fp->f_seqcount = 1;
else
fp->f_seqcount = 0;
return (0);
}
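/*
* For example, a process issuing back-to-back 64kB reads advances
* f_seqcount by 4 per call (64kB / 16kB), capped at IO_SEQMAX; the
* value returned is that count shifted by IO_SEQSHIFT, which
* filesystems typically use to scale read-ahead and clustering.
*/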
/*
* Package up an I/O request on a vnode into a uio and do it.
*/
int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
aresid, td)
enum uio_rw rw;
struct vnode *vp;
void *base;
int len;
off_t offset;
enum uio_seg segflg;
int ioflg;
struct ucred *active_cred;
struct ucred *file_cred;
int *aresid;
struct thread *td;
{
struct uio auio;
struct iovec aiov;
struct mount *mp;
struct ucred *cred;
int error, lock_flags;
VFS_ASSERT_GIANT(vp->v_mount);
if ((ioflg & IO_NODELOCKED) == 0) {
mp = NULL;
if (rw == UIO_WRITE) {
if (vp->v_type != VCHR &&
(error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
!= 0)
return (error);
if (MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
lock_flags = LK_SHARED;
} else {
lock_flags = LK_EXCLUSIVE;
}
vn_lock(vp, lock_flags | LK_RETRY);
} else
vn_lock(vp, LK_SHARED | LK_RETRY);
}
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
aiov.iov_base = base;
aiov.iov_len = len;
auio.uio_resid = len;
auio.uio_offset = offset;
auio.uio_segflg = segflg;
auio.uio_rw = rw;
auio.uio_td = td;
error = 0;
#ifdef MAC
if ((ioflg & IO_NOMACCHECK) == 0) {
if (rw == UIO_READ)
error = mac_vnode_check_read(active_cred, file_cred,
vp);
else
error = mac_vnode_check_write(active_cred, file_cred,
vp);
}
#endif
if (error == 0) {
if (file_cred)
cred = file_cred;
else
cred = active_cred;
if (rw == UIO_READ)
error = VOP_READ(vp, &auio, ioflg, cred);
else
error = VOP_WRITE(vp, &auio, ioflg, cred);
}
if (aresid)
*aresid = auio.uio_resid;
else
if (auio.uio_resid && error == 0)
error = EIO;
if ((ioflg & IO_NODELOCKED) == 0) {
if (rw == UIO_WRITE && vp->v_type != VCHR)
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
}
return (error);
}
/*
* Package up an I/O request on a vnode into a uio and do it. The I/O
* request is split up into smaller chunks and we try to avoid saturating
* the buffer cache while potentially holding a vnode locked, so we
* check bwillwrite() before calling vn_rdwr(). We also call kern_yield()
* to give other processes a chance to lock the vnode (either other processes
* core'ing the same binary, or unrelated processes scanning the directory).
*/
int
vn_rdwr_inchunks(rw, vp, base, len, offset, segflg, ioflg, active_cred,
file_cred, aresid, td)
enum uio_rw rw;
struct vnode *vp;
void *base;
size_t len;
off_t offset;
enum uio_seg segflg;
int ioflg;
struct ucred *active_cred;
struct ucred *file_cred;
size_t *aresid;
struct thread *td;
{
int error = 0;
int iaresid;
VFS_ASSERT_GIANT(vp->v_mount);
do {
int chunk;
/*
* Force `offset' to a multiple of MAXBSIZE except possibly
* for the first chunk, so that filesystems only need to
* write full blocks except possibly for the first and last
* chunks.
*/
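/*
* For example, with a MAXBSIZE of 64kB and a starting offset of 1000,
* the first chunk is 65536 - 1000 = 64536 bytes, after which every
* subsequent chunk begins on a MAXBSIZE boundary.
*/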
chunk = MAXBSIZE - (uoff_t)offset % MAXBSIZE;
if (chunk > len)
chunk = len;
if (rw != UIO_READ && vp->v_type == VREG)
bwillwrite();
iaresid = 0;
error = vn_rdwr(rw, vp, base, chunk, offset, segflg,
ioflg, active_cred, file_cred, &iaresid, td);
len -= chunk; /* aresid calc already includes length */
if (error)
break;
offset += chunk;
base = (char *)base + chunk;
kern_yield(PRI_USER);
} while (len);
if (aresid)
*aresid = len + iaresid;
return (error);
}
/*
* File table vnode read routine.
*/
static int
vn_read(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
int flags;
struct thread *td;
{
struct vnode *vp;
int error, ioflag;
struct mtx *mtxp;
int vfslocked;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
mtxp = NULL;
vp = fp->f_vnode;
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
/*
* According to McKusick the vn lock was protecting f_offset here.
* It is now protected by the FOFFSET_LOCKED flag.
*/
if ((flags & FOF_OFFSET) == 0) {
mtxp = mtx_pool_find(mtxpool_sleep, fp);
mtx_lock(mtxp);
while(fp->f_vnread_flags & FOFFSET_LOCKED) {
fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
"vnread offlock", 0);
}
fp->f_vnread_flags |= FOFFSET_LOCKED;
mtx_unlock(mtxp);
vn_lock(vp, LK_SHARED | LK_RETRY);
uio->uio_offset = fp->f_offset;
} else
vn_lock(vp, LK_SHARED | LK_RETRY);
ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
if (error == 0)
#endif
error = VOP_READ(vp, uio, ioflag, fp->f_cred);
if ((flags & FOF_OFFSET) == 0) {
fp->f_offset = uio->uio_offset;
mtx_lock(mtxp);
if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
wakeup(&fp->f_vnread_flags);
fp->f_vnread_flags = 0;
mtx_unlock(mtxp);
}
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table vnode write routine.
*/
static int
vn_write(fp, uio, active_cred, flags, td)
struct file *fp;
struct uio *uio;
struct ucred *active_cred;
int flags;
struct thread *td;
{
struct vnode *vp;
struct mount *mp;
int error, ioflag, lock_flags;
int vfslocked;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type == VREG)
bwillwrite();
ioflag = IO_UNIT;
if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
ioflag |= IO_APPEND;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
if ((fp->f_flag & O_FSYNC) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;
mp = NULL;
if (vp->v_type != VCHR &&
(error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto unlock;
if ((MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
(flags & FOF_OFFSET) != 0) {
lock_flags = LK_SHARED;
} else {
lock_flags = LK_EXCLUSIVE;
}
vn_lock(vp, lock_flags | LK_RETRY);
if ((flags & FOF_OFFSET) == 0)
uio->uio_offset = fp->f_offset;
ioflag |= sequential_heuristic(uio, fp);
#ifdef MAC
error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
if (error == 0)
#endif
error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
if ((flags & FOF_OFFSET) == 0)
fp->f_offset = uio->uio_offset;
fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
if (vp->v_type != VCHR)
vn_finished_write(mp);
unlock:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table truncate routine.
*/
static int
vn_truncate(fp, length, active_cred, td)
struct file *fp;
off_t length;
struct ucred *active_cred;
struct thread *td;
{
struct vattr vattr;
struct mount *mp;
struct vnode *vp;
int vfslocked;
int error;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_type == VDIR) {
error = EISDIR;
goto out;
}
#ifdef MAC
error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
if (error)
goto out;
#endif
error = vn_writechk(vp);
if (error == 0) {
VATTR_NULL(&vattr);
vattr.va_size = length;
error = VOP_SETATTR(vp, &vattr, fp->f_cred);
}
out:
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table vnode stat routine.
*/
static int
vn_statfile(fp, sb, active_cred, td)
struct file *fp;
struct stat *sb;
struct ucred *active_cred;
struct thread *td;
{
struct vnode *vp = fp->f_vnode;
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = vn_stat(vp, sb, active_cred, fp->f_cred, td);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Stat a vnode; implementation for the stat syscall
*/
int
vn_stat(vp, sb, active_cred, file_cred, td)
struct vnode *vp;
register struct stat *sb;
struct ucred *active_cred;
struct ucred *file_cred;
struct thread *td;
{
struct vattr vattr;
register struct vattr *vap;
int error;
u_short mode;
#ifdef MAC
error = mac_vnode_check_stat(active_cred, file_cred, vp);
if (error)
return (error);
#endif
vap = &vattr;
/*
* Initialize defaults for new and unusual fields, so that file
* systems which don't support these fields don't need to know
* about them.
*/
vap->va_birthtime.tv_sec = -1;
vap->va_birthtime.tv_nsec = 0;
vap->va_fsid = VNOVAL;
vap->va_rdev = NODEV;
error = VOP_GETATTR(vp, vap, active_cred);
if (error)
return (error);
/*
* Zero the spare stat fields
*/
bzero(sb, sizeof *sb);
/*
* Copy from vattr table
*/
if (vap->va_fsid != VNOVAL)
sb->st_dev = vap->va_fsid;
else
sb->st_dev = vp->v_mount->mnt_stat.f_fsid.val[0];
sb->st_ino = vap->va_fileid;
mode = vap->va_mode;
switch (vap->va_type) {
case VREG:
mode |= S_IFREG;
break;
case VDIR:
mode |= S_IFDIR;
break;
case VBLK:
mode |= S_IFBLK;
break;
case VCHR:
mode |= S_IFCHR;
break;
case VLNK:
mode |= S_IFLNK;
break;
case VSOCK:
mode |= S_IFSOCK;
break;
case VFIFO:
mode |= S_IFIFO;
break;
default:
return (EBADF);
};
sb->st_mode = mode;
sb->st_nlink = vap->va_nlink;
sb->st_uid = vap->va_uid;
sb->st_gid = vap->va_gid;
sb->st_rdev = vap->va_rdev;
if (vap->va_size > OFF_MAX)
return (EOVERFLOW);
sb->st_size = vap->va_size;
sb->st_atim = vap->va_atime;
sb->st_mtim = vap->va_mtime;
sb->st_ctim = vap->va_ctime;
sb->st_birthtim = vap->va_birthtime;
/*
* According to www.opengroup.org, the meaning of st_blksize is
* "a filesystem-specific preferred I/O block size for this
* object. In some filesystem types, this may vary from file
* to file"
* Use minimum/default of PAGE_SIZE (e.g. for VCHR).
*/
sb->st_blksize = max(PAGE_SIZE, vap->va_blocksize);
sb->st_flags = vap->va_flags;
if (priv_check(td, PRIV_VFS_GENERATION))
sb->st_gen = 0;
else
sb->st_gen = vap->va_gen;
sb->st_blocks = vap->va_bytes / S_BLKSIZE;
return (0);
}
/*
* File table vnode ioctl routine.
*/
static int
vn_ioctl(fp, com, data, active_cred, td)
struct file *fp;
u_long com;
void *data;
struct ucred *active_cred;
struct thread *td;
{
struct vnode *vp = fp->f_vnode;
struct vattr vattr;
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = ENOTTY;
switch (vp->v_type) {
case VREG:
case VDIR:
if (com == FIONREAD) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, active_cred);
VOP_UNLOCK(vp, 0);
if (!error)
*(int *)data = vattr.va_size - fp->f_offset;
}
if (com == FIONBIO || com == FIOASYNC) /* XXX */
error = 0;
else
error = VOP_IOCTL(vp, com, data, fp->f_flag,
active_cred, td);
break;
default:
break;
}
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* File table vnode poll routine.
*/
static int
vn_poll(fp, events, active_cred, td)
struct file *fp;
int events;
struct ucred *active_cred;
struct thread *td;
{
struct vnode *vp;
int vfslocked;
int error;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef MAC
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_vnode_check_poll(active_cred, fp->f_cred, vp);
VOP_UNLOCK(vp, 0);
if (!error)
#endif
error = VOP_POLL(vp, events, fp->f_cred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Acquire the requested lock and then check for validity. LK_RETRY
* permits vn_lock to return doomed vnodes.
*/
int
_vn_lock(struct vnode *vp, int flags, char *file, int line)
{
int error;
VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
("vn_lock called with no locktype."));
do {
#ifdef DEBUG_VFS_LOCKS
KASSERT(vp->v_holdcnt != 0,
("vn_lock %p: zero hold count", vp));
#endif
error = VOP_LOCK1(vp, flags, file, line);
flags &= ~LK_INTERLOCK; /* Interlock is always dropped. */
KASSERT((flags & LK_RETRY) == 0 || error == 0,
("LK_RETRY set with incompatible flags (0x%x) or an error occured (%d)",
flags, error));
/*
* Callers specify LK_RETRY if they wish to get dead vnodes.
* If RETRY is not set, we return ENOENT instead.
*/
if (error == 0 && vp->v_iflag & VI_DOOMED &&
(flags & LK_RETRY) == 0) {
VOP_UNLOCK(vp, 0);
error = ENOENT;
break;
}
} while (flags & LK_RETRY && error != 0);
return (error);
}
/*
* File table vnode close routine.
*/
static int
vn_closefile(fp, td)
struct file *fp;
struct thread *td;
{
struct vnode *vp;
struct flock lf;
int vfslocked;
int error;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (fp->f_type == DTYPE_VNODE && fp->f_flag & FHASLOCK) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
}
fp->f_ops = &badfileops;
error = vn_close(vp, fp->f_flag, fp->f_cred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* Preparing to start a filesystem write operation. If the operation is
* permitted, then we bump the count of operations in progress and
* proceed. If a suspend request is in progress, we wait until the
* suspension is over, and then proceed.
*/
int
vn_start_write(vp, mpp, flags)
struct vnode *vp;
struct mount **mpp;
int flags;
{
struct mount *mp;
int error;
error = 0;
/*
* If a vnode is provided, get and return the mount point to
* which it will write.
*/
if (vp != NULL) {
if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
*mpp = NULL;
if (error != EOPNOTSUPP)
return (error);
return (0);
}
}
if ((mp = *mpp) == NULL)
return (0);
/*
* VOP_GETWRITEMOUNT() returns with the mp refcount held through
* a vfs_ref().
* If a vnode is not provided, acquire a reference on the passed-in
* mountpoint ourselves, in order to emulate a vfs_ref().
*/
MNT_ILOCK(mp);
if (vp == NULL)
MNT_REF(mp);
/*
* Check on status of suspension.
*/
if ((curthread->td_pflags & TDP_IGNSUSP) == 0 ||
mp->mnt_susp_owner != curthread) {
while ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
if (flags & V_NOWAIT) {
error = EWOULDBLOCK;
goto unlock;
}
error = msleep(&mp->mnt_flag, MNT_MTX(mp),
(PUSER - 1) | (flags & PCATCH), "suspfs", 0);
if (error)
goto unlock;
}
}
if (flags & V_XSLEEP)
goto unlock;
mp->mnt_writeopcount++;
unlock:
if (error != 0 || (flags & V_XSLEEP) != 0)
MNT_REL(mp);
MNT_IUNLOCK(mp);
return (error);
}
/*
* Secondary suspension. Used by operations such as vop_inactive
* routines that are needed by the higher level functions. These
* are allowed to proceed until all the higher level functions have
* completed (indicated by mnt_writeopcount dropping to zero). At that
* time, these operations are halted until the suspension is over.
*/
int
vn_start_secondary_write(vp, mpp, flags)
struct vnode *vp;
struct mount **mpp;
int flags;
{
struct mount *mp;
int error;
retry:
if (vp != NULL) {
if ((error = VOP_GETWRITEMOUNT(vp, mpp)) != 0) {
*mpp = NULL;
if (error != EOPNOTSUPP)
return (error);
return (0);
}
}
/*
* If we are not suspended or have not yet reached suspended
* mode, then let the operation proceed.
*/
if ((mp = *mpp) == NULL)
return (0);
/*
* VOP_GETWRITEMOUNT() returns with the mp refcount held through
* a vfs_ref().
* If a vnode is not provided, acquire a reference on the passed-in
* mountpoint ourselves, in order to emulate a vfs_ref().
*/
MNT_ILOCK(mp);
if (vp == NULL)
MNT_REF(mp);
if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
mp->mnt_secondary_writes++;
mp->mnt_secondary_accwrites++;
MNT_IUNLOCK(mp);
return (0);
}
if (flags & V_NOWAIT) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
return (EWOULDBLOCK);
}
/*
* Wait for the suspension to finish.
*/
error = msleep(&mp->mnt_flag, MNT_MTX(mp),
(PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
vfs_rel(mp);
if (error == 0)
goto retry;
return (error);
}
/*
* Filesystem write operation has completed. If we are suspending and this
* operation is the last one, notify the suspender that the suspension is
* now in effect.
*/
void
vn_finished_write(mp)
struct mount *mp;
{
if (mp == NULL)
return;
MNT_ILOCK(mp);
MNT_REL(mp);
mp->mnt_writeopcount--;
if (mp->mnt_writeopcount < 0)
panic("vn_finished_write: neg cnt");
if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
mp->mnt_writeopcount <= 0)
wakeup(&mp->mnt_writeopcount);
MNT_IUNLOCK(mp);
}
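/*
 * Illustrative sketch (not part of this revision): the usual bracketing
 * of a modification with the suspension counters maintained above.  A
 * caller takes the write count with vn_start_write() before locking the
 * vnode and drops it with vn_finished_write() afterwards, so that a
 * pending filesystem suspension can drain all writers.  The function
 * name example_write_op() is made up for illustration.
 */
static int
example_write_op(struct vnode *vp, struct thread *td)
{
	struct mount *mp;
	int error;

	/* Wait out any suspension in progress, then count ourselves in. */
	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... perform the modification (VOP_SETATTR(), VOP_WRITE(), ...) ... */
	VOP_UNLOCK(vp, 0);
	vn_finished_write(mp);
	return (error);
}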
/*
* Filesystem secondary write operation has completed. If we are
* suspending and this operation is the last one, notify the suspender
* that the suspension is now in effect.
*/
void
vn_finished_secondary_write(mp)
struct mount *mp;
{
if (mp == NULL)
return;
MNT_ILOCK(mp);
MNT_REL(mp);
mp->mnt_secondary_writes--;
if (mp->mnt_secondary_writes < 0)
panic("vn_finished_secondary_write: neg cnt");
if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0 &&
mp->mnt_secondary_writes <= 0)
wakeup(&mp->mnt_secondary_writes);
MNT_IUNLOCK(mp);
}
/*
* Request a filesystem to suspend write operations.
*/
int
vfs_write_suspend(mp)
struct mount *mp;
{
int error;
MNT_ILOCK(mp);
if (mp->mnt_susp_owner == curthread) {
MNT_IUNLOCK(mp);
return (EALREADY);
}
while (mp->mnt_kern_flag & MNTK_SUSPEND)
msleep(&mp->mnt_flag, MNT_MTX(mp), PUSER - 1, "wsuspfs", 0);
mp->mnt_kern_flag |= MNTK_SUSPEND;
mp->mnt_susp_owner = curthread;
if (mp->mnt_writeopcount > 0)
(void) msleep(&mp->mnt_writeopcount,
MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
else
MNT_IUNLOCK(mp);
if ((error = VFS_SYNC(mp, MNT_SUSPEND)) != 0)
vfs_write_resume(mp);
return (error);
}
/*
* Request a filesystem to resume write operations.
*/
void
vfs_write_resume(mp)
struct mount *mp;
{
MNT_ILOCK(mp);
if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0) {
KASSERT(mp->mnt_susp_owner == curthread, ("mnt_susp_owner"));
mp->mnt_kern_flag &= ~(MNTK_SUSPEND | MNTK_SUSPEND2 |
MNTK_SUSPENDED);
mp->mnt_susp_owner = NULL;
wakeup(&mp->mnt_writeopcount);
wakeup(&mp->mnt_flag);
curthread->td_pflags &= ~TDP_IGNSUSP;
MNT_IUNLOCK(mp);
VFS_SUSP_CLEAN(mp);
} else
MNT_IUNLOCK(mp);
}
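/*
 * Illustrative sketch (not part of this revision): a hypothetical caller
 * that suspends a filesystem, works while all writes are quiesced, and
 * then resumes it.  The function name example_suspend_fs() is made up
 * for illustration.
 */
static int
example_suspend_fs(struct mount *mp)
{
	int error;

	error = vfs_write_suspend(mp);
	if (error != 0)
		return (error);		/* EALREADY if we already own the suspension */
	/* ... the filesystem is now quiescent; take a snapshot, etc. ... */
	vfs_write_resume(mp);
	return (0);
}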
/*
* Implement kqueues for files by translating them to vnode operations.
*/
static int
vn_kqfilter(struct file *fp, struct knote *kn)
{
int vfslocked;
int error;
vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = VOP_KQFILTER(fp->f_vnode, kn);
VFS_UNLOCK_GIANT(vfslocked);
return error;
}
/*
* Simplified in-kernel wrapper calls for extended attribute access.
* Both calls pass in a NULL credential, authorizing as "kernel" access.
* Set IO_NODELOCKED in ioflg if the vnode is already locked.
*/
int
vn_extattr_get(struct vnode *vp, int ioflg, int attrnamespace,
const char *attrname, int *buflen, char *buf, struct thread *td)
{
struct uio auio;
struct iovec iov;
int error;
iov.iov_len = *buflen;
iov.iov_base = buf;
auio.uio_iov = &iov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_offset = 0;
auio.uio_resid = *buflen;
if ((ioflg & IO_NODELOCKED) == 0)
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
/* authorize attribute retrieval as kernel */
error = VOP_GETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, NULL,
td);
if ((ioflg & IO_NODELOCKED) == 0)
VOP_UNLOCK(vp, 0);
if (error == 0) {
*buflen = *buflen - auio.uio_resid;
}
return (error);
}
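/*
 * Illustrative sketch (not part of this revision): fetching an extended
 * attribute into a caller-supplied buffer with the wrapper above.  The
 * attribute name "example.attr" and the buffer size are made up for
 * illustration.
 */
static int
example_read_extattr(struct vnode *vp, struct thread *td)
{
	char buf[64];
	int buflen, error;

	buflen = sizeof(buf);
	/* The wrapper locks the vnode for us since IO_NODELOCKED is not set. */
	error = vn_extattr_get(vp, 0, EXTATTR_NAMESPACE_SYSTEM,
	    "example.attr", &buflen, buf, td);
	if (error == 0) {
		/* buflen now holds the number of bytes actually read. */
	}
	return (error);
}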
/*
* XXX failure mode if partially written?
*/
int
vn_extattr_set(struct vnode *vp, int ioflg, int attrnamespace,
const char *attrname, int buflen, char *buf, struct thread *td)
{
struct uio auio;
struct iovec iov;
struct mount *mp;
int error;
iov.iov_len = buflen;
iov.iov_base = buf;
auio.uio_iov = &iov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_td = td;
auio.uio_offset = 0;
auio.uio_resid = buflen;
if ((ioflg & IO_NODELOCKED) == 0) {
if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
/* authorize attribute setting as kernel */
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio, NULL, td);
if ((ioflg & IO_NODELOCKED) == 0) {
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
}
return (error);
}
int
vn_extattr_rm(struct vnode *vp, int ioflg, int attrnamespace,
const char *attrname, struct thread *td)
{
struct mount *mp;
int error;
if ((ioflg & IO_NODELOCKED) == 0) {
if ((error = vn_start_write(vp, &mp, V_WAIT)) != 0)
return (error);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
/* authorize attribute removal as kernel */
error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, NULL, td);
if (error == EOPNOTSUPP)
error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
NULL, td);
if ((ioflg & IO_NODELOCKED) == 0) {
vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
}
return (error);
}
int
vn_vget_ino(struct vnode *vp, ino_t ino, int lkflags, struct vnode **rvp)
{
struct mount *mp;
int ltype, error;
mp = vp->v_mount;
ltype = VOP_ISLOCKED(vp);
KASSERT(ltype == LK_EXCLUSIVE || ltype == LK_SHARED,
("vn_vget_ino: vp not locked"));
error = vfs_busy(mp, MBF_NOWAIT);
if (error != 0) {
vfs_ref(mp);
VOP_UNLOCK(vp, 0);
error = vfs_busy(mp, 0);
vn_lock(vp, ltype | LK_RETRY);
vfs_rel(mp);
if (error != 0)
return (ENOENT);
if (vp->v_iflag & VI_DOOMED) {
vfs_unbusy(mp);
return (ENOENT);
}
}
VOP_UNLOCK(vp, 0);
error = VFS_VGET(mp, ino, lkflags, rvp);
vfs_unbusy(mp);
vn_lock(vp, ltype | LK_RETRY);
if (vp->v_iflag & VI_DOOMED) {
if (error == 0)
vput(*rvp);
error = ENOENT;
}
return (error);
}
int
vn_rlimit_fsize(const struct vnode *vp, const struct uio *uio,
const struct thread *td)
{
if (vp->v_type != VREG || td == NULL)
return (0);
PROC_LOCK(td->td_proc);
if ((uoff_t)uio->uio_offset + uio->uio_resid >
lim_cur(td->td_proc, RLIMIT_FSIZE)) {
- psignal(td->td_proc, SIGXFSZ);
+ kern_psignal(td->td_proc, SIGXFSZ);
PROC_UNLOCK(td->td_proc);
return (EFBIG);
}
PROC_UNLOCK(td->td_proc);
return (0);
}
int
vn_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
struct thread *td)
{
struct vnode *vp;
int error, vfslocked;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
VOP_UNLOCK(vp, 0);
#endif
error = setfmode(td, active_cred, vp, mode);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
int
vn_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
struct thread *td)
{
struct vnode *vp;
int error, vfslocked;
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
#ifdef AUDIT
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
VOP_UNLOCK(vp, 0);
#endif
error = setfown(td, active_cred, vp, uid, gid);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
void
vn_pages_remove(struct vnode *vp, vm_pindex_t start, vm_pindex_t end)
{
vm_object_t object;
if ((object = vp->v_object) == NULL)
return;
VM_OBJECT_LOCK(object);
vm_object_page_remove(object, start, end, 0);
VM_OBJECT_UNLOCK(object);
}
Index: head/sys/kgssapi/gss_impl.c
===================================================================
--- head/sys/kgssapi/gss_impl.c (revision 225616)
+++ head/sys/kgssapi/gss_impl.c (revision 225617)
@@ -1,303 +1,303 @@
/*-
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
* Authors: Doug Rabson <dfr@rabson.org>
* Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <kgssapi/gssapi.h>
#include <kgssapi/gssapi_impl.h>
#include <rpc/rpc.h>
#include <rpc/rpc_com.h>
#include <rpc/rpcsec_gss.h>
#include "gssd.h"
#include "kgss_if.h"
MALLOC_DEFINE(M_GSSAPI, "GSS-API", "GSS-API");
/*
* Syscall hooks
*/
static int gssd_syscall_offset = SYS_gssd_syscall;
static struct sysent gssd_syscall_prev_sysent;
MAKE_SYSENT(gssd_syscall);
static bool_t gssd_syscall_registered = FALSE;
struct kgss_mech_list kgss_mechs;
CLIENT *kgss_gssd_handle;
static void
kgss_init(void *dummy)
{
int error;
LIST_INIT(&kgss_mechs);
error = syscall_register(&gssd_syscall_offset, &gssd_syscall_sysent,
&gssd_syscall_prev_sysent);
if (error)
printf("Can't register GSSD syscall\n");
else
gssd_syscall_registered = TRUE;
}
SYSINIT(kgss_init, SI_SUB_LOCK, SI_ORDER_FIRST, kgss_init, NULL);
static void
kgss_uninit(void *dummy)
{
if (gssd_syscall_registered)
syscall_deregister(&gssd_syscall_offset,
&gssd_syscall_prev_sysent);
}
SYSUNINIT(kgss_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, kgss_uninit, NULL);
int
-gssd_syscall(struct thread *td, struct gssd_syscall_args *uap)
+sys_gssd_syscall(struct thread *td, struct gssd_syscall_args *uap)
{
struct sockaddr_un sun;
struct netconfig *nconf;
char path[MAXPATHLEN];
int error;
error = priv_check(td, PRIV_NFS_DAEMON);
if (error)
return (error);
if (kgss_gssd_handle)
CLNT_DESTROY(kgss_gssd_handle);
error = copyinstr(uap->path, path, sizeof(path), NULL);
if (error)
return (error);
sun.sun_family = AF_LOCAL;
strcpy(sun.sun_path, path);
sun.sun_len = SUN_LEN(&sun);
nconf = getnetconfigent("local");
kgss_gssd_handle = clnt_reconnect_create(nconf,
(struct sockaddr *) &sun, GSSD, GSSDVERS,
RPC_MAXDATASIZE, RPC_MAXDATASIZE);
return (0);
}
int
kgss_oid_equal(const gss_OID oid1, const gss_OID oid2)
{
if (oid1 == oid2)
return (1);
if (!oid1 || !oid2)
return (0);
if (oid1->length != oid2->length)
return (0);
if (memcmp(oid1->elements, oid2->elements, oid1->length))
return (0);
return (1);
}
void
kgss_install_mech(gss_OID mech_type, const char *name, struct kobj_class *cls)
{
struct kgss_mech *km;
km = malloc(sizeof(struct kgss_mech), M_GSSAPI, M_WAITOK);
km->km_mech_type = mech_type;
km->km_mech_name = name;
km->km_class = cls;
LIST_INSERT_HEAD(&kgss_mechs, km, km_link);
}
void
kgss_uninstall_mech(gss_OID mech_type)
{
struct kgss_mech *km;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (kgss_oid_equal(km->km_mech_type, mech_type)) {
LIST_REMOVE(km, km_link);
free(km, M_GSSAPI);
return;
}
}
}
gss_OID
kgss_find_mech_by_name(const char *name)
{
struct kgss_mech *km;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (!strcmp(km->km_mech_name, name)) {
return (km->km_mech_type);
}
}
return (GSS_C_NO_OID);
}
const char *
kgss_find_mech_by_oid(const gss_OID oid)
{
struct kgss_mech *km;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (kgss_oid_equal(km->km_mech_type, oid)) {
return (km->km_mech_name);
}
}
return (NULL);
}
gss_ctx_id_t
kgss_create_context(gss_OID mech_type)
{
struct kgss_mech *km;
gss_ctx_id_t ctx;
LIST_FOREACH(km, &kgss_mechs, km_link) {
if (kgss_oid_equal(km->km_mech_type, mech_type))
break;
}
if (!km)
return (NULL);
ctx = (gss_ctx_id_t) kobj_create(km->km_class, M_GSSAPI, M_WAITOK);
KGSS_INIT(ctx);
return (ctx);
}
void
kgss_delete_context(gss_ctx_id_t ctx, gss_buffer_t output_token)
{
KGSS_DELETE(ctx, output_token);
kobj_delete((kobj_t) ctx, M_GSSAPI);
}
OM_uint32
kgss_transfer_context(gss_ctx_id_t ctx)
{
struct export_sec_context_res res;
struct export_sec_context_args args;
enum clnt_stat stat;
OM_uint32 maj_stat;
if (!kgss_gssd_handle)
return (GSS_S_FAILURE);
args.ctx = ctx->handle;
bzero(&res, sizeof(res));
stat = gssd_export_sec_context_1(&args, &res, kgss_gssd_handle);
if (stat != RPC_SUCCESS) {
return (GSS_S_FAILURE);
}
maj_stat = KGSS_IMPORT(ctx, res.format, &res.interprocess_token);
ctx->handle = 0;
xdr_free((xdrproc_t) xdr_export_sec_context_res, &res);
return (maj_stat);
}
void
kgss_copy_buffer(const gss_buffer_t from, gss_buffer_t to)
{
to->length = from->length;
if (from->length) {
to->value = malloc(from->length, M_GSSAPI, M_WAITOK);
bcopy(from->value, to->value, from->length);
} else {
to->value = NULL;
}
}
/*
* Kernel module glue
*/
static int
kgssapi_modevent(module_t mod, int type, void *data)
{
int error = 0;
switch (type) {
case MOD_LOAD:
rpc_gss_entries.rpc_gss_secfind = rpc_gss_secfind;
rpc_gss_entries.rpc_gss_secpurge = rpc_gss_secpurge;
rpc_gss_entries.rpc_gss_seccreate = rpc_gss_seccreate;
rpc_gss_entries.rpc_gss_set_defaults = rpc_gss_set_defaults;
rpc_gss_entries.rpc_gss_max_data_length =
rpc_gss_max_data_length;
rpc_gss_entries.rpc_gss_get_error = rpc_gss_get_error;
rpc_gss_entries.rpc_gss_mech_to_oid = rpc_gss_mech_to_oid;
rpc_gss_entries.rpc_gss_oid_to_mech = rpc_gss_oid_to_mech;
rpc_gss_entries.rpc_gss_qop_to_num = rpc_gss_qop_to_num;
rpc_gss_entries.rpc_gss_get_mechanisms = rpc_gss_get_mechanisms;
rpc_gss_entries.rpc_gss_get_versions = rpc_gss_get_versions;
rpc_gss_entries.rpc_gss_is_installed = rpc_gss_is_installed;
rpc_gss_entries.rpc_gss_set_svc_name = rpc_gss_set_svc_name;
rpc_gss_entries.rpc_gss_clear_svc_name = rpc_gss_clear_svc_name;
rpc_gss_entries.rpc_gss_getcred = rpc_gss_getcred;
rpc_gss_entries.rpc_gss_set_callback = rpc_gss_set_callback;
rpc_gss_entries.rpc_gss_clear_callback = rpc_gss_clear_callback;
rpc_gss_entries.rpc_gss_get_principal_name =
rpc_gss_get_principal_name;
rpc_gss_entries.rpc_gss_svc_max_data_length =
rpc_gss_svc_max_data_length;
break;
case MOD_UNLOAD:
/*
* Unloading of the kgssapi module is not currently supported.
* If somebody wants this, we would need to keep track of
* currently executing threads and make sure the count is 0.
*/
/* FALLTHROUGH */
default:
error = EOPNOTSUPP;
};
return (error);
}
static moduledata_t kgssapi_mod = {
"kgssapi",
kgssapi_modevent,
NULL,
};
DECLARE_MODULE(kgssapi, kgssapi_mod, SI_SUB_VFS, SI_ORDER_ANY);
MODULE_DEPEND(kgssapi, krpc, 1, 1, 1);
MODULE_VERSION(kgssapi, 1);
Index: head/sys/mips/mips/pm_machdep.c
===================================================================
--- head/sys/mips/mips/pm_machdep.c (revision 225616)
+++ head/sys/mips/mips/pm_machdep.c (revision 225617)
@@ -1,576 +1,576 @@
/*-
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* from: src/sys/i386/i386/machdep.c,v 1.385.2.3 2000/05/10 02:04:46 obrien
* JNPR: pm_machdep.c,v 1.9.2.1 2007/08/16 15:59:10 girish
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_cputype.h"
#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysent.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/ucontext.h>
#include <sys/lock.h>
#include <sys/syscallsubr.h>
#include <sys/sysproto.h>
#include <sys/ptrace.h>
#include <sys/syslog.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_extern.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <machine/reg.h>
#include <machine/md_var.h>
#include <machine/sigframe.h>
#include <machine/vmparam.h>
#include <sys/vnode.h>
#include <fs/pseudofs/pseudofs.h>
#include <fs/procfs/procfs.h>
#define UCONTEXT_MAGIC 0xACEDBADE
/*
* Send an interrupt to a process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by kcall
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct proc *p;
struct thread *td;
struct trapframe *regs;
struct sigacts *psp;
struct sigframe sf, *sfp;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->sp);
/* save user context */
bzero(&sf, sizeof(struct sigframe));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_pc = regs->pc;
sf.sf_uc.uc_mcontext.mullo = regs->mullo;
sf.sf_uc.uc_mcontext.mulhi = regs->mulhi;
sf.sf_uc.uc_mcontext.mc_regs[0] = UCONTEXT_MAGIC; /* magic number */
bcopy((void *)&regs->ast, (void *)&sf.sf_uc.uc_mcontext.mc_regs[1],
sizeof(sf.sf_uc.uc_mcontext.mc_regs) - sizeof(register_t));
sf.sf_uc.uc_mcontext.mc_fpused = td->td_md.md_flags & MDTD_FPUSED;
if (sf.sf_uc.uc_mcontext.mc_fpused) {
/* if FPU has current state, save it first */
if (td == PCPU_GET(fpcurthread))
MipsSaveCurFPState(td);
bcopy((void *)&td->td_frame->f0,
(void *)sf.sf_uc.uc_mcontext.mc_fpregs,
sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
}
/* Allocate and validate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe *)((vm_offset_t)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe))
& ~(sizeof(__int64_t) - 1));
} else
sfp = (struct sigframe *)((vm_offset_t)(regs->sp -
sizeof(struct sigframe)) & ~(sizeof(__int64_t) - 1));
/* Translate the signal if appropriate */
if (p->p_sysent->sv_sigtbl) {
if (sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
}
/* Build the argument list for the signal handler. */
regs->a0 = sig;
regs->a2 = (register_t)(intptr_t)&sfp->sf_uc;
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
regs->a1 = (register_t)(intptr_t)&sfp->sf_si;
/* sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher; */
/* fill siginfo structure */
sf.sf_si.si_signo = sig;
sf.sf_si.si_code = ksi->ksi_code;
sf.sf_si.si_addr = (void*)(intptr_t)regs->badvaddr;
} else {
/* Old FreeBSD-style arguments. */
regs->a1 = ksi->ksi_code;
regs->a3 = regs->badvaddr;
/* sf.sf_ahu.sf_handler = catcher; */
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
/*
* Something is wrong with the stack pointer.
* ...Kill the process.
*/
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->pc = (register_t)(intptr_t)catcher;
regs->t9 = (register_t)(intptr_t)catcher;
regs->sp = (register_t)(intptr_t)sfp;
/*
* Signal trampoline code is at base of user stack.
*/
regs->ra = (register_t)(intptr_t)PS_STRINGS - *(p->p_sysent->sv_szsigcode);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#ifdef GONE_IN_7
/*
* Build siginfo_t for SA thread
*/
void
cpu_thread_siginfo(int sig, u_long code, siginfo_t *si)
{
struct proc *p;
struct thread *td;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
bzero(si, sizeof(*si));
si->si_signo = sig;
si->si_code = code;
/* XXXKSE fill other fields */
}
#endif
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc as specified by
* context left by sendsig.
*/
int
-sigreturn(struct thread *td, struct sigreturn_args *uap)
+sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
struct trapframe *regs;
ucontext_t *ucp;
ucontext_t uc;
int error;
ucp = &uc;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
regs = td->td_frame;
/* #ifdef DEBUG */
if (ucp->uc_mcontext.mc_regs[ZERO] != UCONTEXT_MAGIC) {
printf("sigreturn: pid %d, ucp %p\n", td->td_proc->p_pid, ucp);
printf(" old sp %p ra %p pc %p\n",
(void *)(intptr_t)regs->sp, (void *)(intptr_t)regs->ra, (void *)(intptr_t)regs->pc);
printf(" new sp %p ra %p pc %p z %p\n",
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[SP],
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[RA],
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[PC],
(void *)(intptr_t)ucp->uc_mcontext.mc_regs[ZERO]);
return EINVAL;
}
/* #endif */
bcopy((const void *)&ucp->uc_mcontext.mc_regs[1], (void *)&regs->ast,
sizeof(ucp->uc_mcontext.mc_regs) - sizeof(register_t));
if (ucp->uc_mcontext.mc_fpused)
bcopy((const void *)ucp->uc_mcontext.mc_fpregs,
(void *)&td->td_frame->f0,
sizeof(ucp->uc_mcontext.mc_fpregs));
regs->pc = ucp->uc_mcontext.mc_pc;
regs->mullo = ucp->uc_mcontext.mullo;
regs->mulhi = ucp->uc_mcontext.mulhi;
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return(EJUSTRETURN);
}
int
ptrace_set_pc(struct thread *td, unsigned long addr)
{
td->td_frame->pc = (register_t) addr;
return 0;
}
static int
ptrace_read_int(struct thread *td, off_t addr, int *v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) v;
iov.iov_len = sizeof(int);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(int);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_READ;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
static int
ptrace_write_int(struct thread *td, off_t addr, int v)
{
struct iovec iov;
struct uio uio;
PROC_LOCK_ASSERT(td->td_proc, MA_NOTOWNED);
iov.iov_base = (caddr_t) &v;
iov.iov_len = sizeof(int);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)addr;
uio.uio_resid = sizeof(int);
uio.uio_segflg = UIO_SYSSPACE;
uio.uio_rw = UIO_WRITE;
uio.uio_td = td;
return proc_rwmem(td->td_proc, &uio);
}
int
ptrace_single_step(struct thread *td)
{
unsigned va;
struct trapframe *locr0 = td->td_frame;
int i;
int bpinstr = MIPS_BREAK_SSTEP;
int curinstr;
struct proc *p;
p = td->td_proc;
PROC_UNLOCK(p);
/*
* Fetch what's at the current location.
*/
ptrace_read_int(td, (off_t)locr0->pc, &curinstr);
/* compute next address after current location */
if(curinstr != 0) {
va = MipsEmulateBranch(locr0, locr0->pc, locr0->fsr,
(uintptr_t)&curinstr);
} else {
va = locr0->pc + 4;
}
if (td->td_md.md_ss_addr) {
printf("SS %s (%d): breakpoint already set at %x (va %x)\n",
p->p_comm, p->p_pid, td->td_md.md_ss_addr, va); /* XXX */
return (EFAULT);
}
td->td_md.md_ss_addr = va;
/*
* Fetch what's at the next location, so it can be restored later.
*/
ptrace_read_int(td, (off_t)va, &td->td_md.md_ss_instr);
/*
* Store breakpoint instruction at the "next" location now.
*/
i = ptrace_write_int (td, va, bpinstr);
/*
* The sync'ing of I & D caches is done by procfs_domem()
* through procfs_rwmem().
*/
PROC_LOCK(p);
if (i < 0)
return (EFAULT);
#if 0
printf("SS %s (%d): breakpoint set at %x: %x (pc %x) br %x\n",
p->p_comm, p->p_pid, p->p_md.md_ss_addr,
p->p_md.md_ss_instr, locr0->pc, curinstr); /* XXX */
#endif
return (0);
}
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_regs.ra = tf->ra;
pcb->pcb_regs.pc = tf->pc;
pcb->pcb_regs.sp = tf->sp;
}
int
fill_regs(struct thread *td, struct reg *regs)
{
memcpy(regs, td->td_frame, sizeof(struct reg));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *f;
register_t sr;
f = (struct trapframe *) td->td_frame;
/*
* Don't allow the user to change SR
*/
sr = f->sr;
memcpy(td->td_frame, regs, sizeof(struct reg));
f->sr = sr;
return (0);
}
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct trapframe *tp;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->sp);
PROC_UNLOCK(curthread->td_proc);
bcopy((void *)&td->td_frame->zero, (void *)&mcp->mc_regs,
sizeof(mcp->mc_regs));
mcp->mc_fpused = td->td_md.md_flags & MDTD_FPUSED;
if (mcp->mc_fpused) {
bcopy((void *)&td->td_frame->f0, (void *)&mcp->mc_fpregs,
sizeof(mcp->mc_fpregs));
}
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_regs[V0] = 0;
mcp->mc_regs[V1] = 0;
mcp->mc_regs[A3] = 0;
}
mcp->mc_pc = td->td_frame->pc;
mcp->mullo = td->td_frame->mullo;
mcp->mulhi = td->td_frame->mulhi;
mcp->mc_tls = td->td_md.md_tls;
return (0);
}
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tp;
tp = td->td_frame;
bcopy((void *)&mcp->mc_regs, (void *)&td->td_frame->zero,
sizeof(mcp->mc_regs));
td->td_md.md_flags = mcp->mc_fpused & MDTD_FPUSED;
if (mcp->mc_fpused) {
bcopy((void *)&mcp->mc_fpregs, (void *)&td->td_frame->f0,
sizeof(mcp->mc_fpregs));
}
td->td_frame->pc = mcp->mc_pc;
td->td_frame->mullo = mcp->mullo;
td->td_frame->mulhi = mcp->mulhi;
td->td_md.md_tls = mcp->mc_tls;
/* Don't let the user set any bits in the Status and Cause registers. */
return (0);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
if (td == PCPU_GET(fpcurthread))
MipsSaveCurFPState(td);
memcpy(fpregs, &td->td_frame->f0, sizeof(struct fpreg));
return 0;
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
if (PCPU_GET(fpcurthread) == td)
PCPU_SET(fpcurthread, (struct thread *)0);
memcpy(&td->td_frame->f0, fpregs, sizeof(struct fpreg));
return 0;
}
/*
* Clear registers on exec
* $sp is set to the stack pointer passed in. $pc is set to the entry
* point given by the exec_package passed in, as is $t9 (used for PIC
* code by the MIPS elf abi).
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
bzero((caddr_t)td->td_frame, sizeof(struct trapframe));
/*
* The stack pointer has to be aligned to accommodate the largest
* datatype at minimum. This probably means it should be 16-byte
* aligned, but for now we're 8-byte aligning it.
*/
td->td_frame->sp = ((register_t) stack) & ~(sizeof(__int64_t) - 1);
/*
* If we're running o32 or n32 programs but have 64-bit registers,
* GCC may use stack-relative addressing near the top of user
* address space that, due to sign extension, will yield an
* invalid address. For instance, if sp is 0x7fffff00 then GCC
* might do something like this to load a word from 0x7ffffff0:
*
* addu sp, sp, 32768
* lw t0, -32528(sp)
*
* On systems with 64-bit registers, sp is sign-extended to
* 0xffffffff80007f00 and the load is instead done from
* 0xffffffff7ffffff0.
*
* To prevent this, we subtract 64K from the stack pointer here.
*
* For consistency, we should just always do this unless we're
* running n64 programs. For now, since we don't support
* COMPAT_FREEBSD32 on n64 kernels, we just do it unless we're
* running n64 kernels.
*/
#if !defined(__mips_n64)
td->td_frame->sp -= 65536;
#endif
td->td_frame->pc = imgp->entry_addr & ~3;
td->td_frame->t9 = imgp->entry_addr & ~3; /* abicall req */
td->td_frame->sr = MIPS_SR_KSU_USER | MIPS_SR_EXL | MIPS_SR_INT_IE |
(mips_rd_status() & MIPS_SR_INT_MASK);
#if defined(__mips_n32)
td->td_frame->sr |= MIPS_SR_PX;
#elif defined(__mips_n64)
td->td_frame->sr |= MIPS_SR_PX | MIPS_SR_UX | MIPS_SR_KX;
#endif
#ifdef CPU_CNMIPS
td->td_frame->sr |= MIPS_SR_COP_2_BIT | MIPS_SR_PX | MIPS_SR_UX |
MIPS_SR_KX | MIPS_SR_SX;
#endif
/*
* FREEBSD_DEVELOPERS_FIXME:
* Setup any other CPU-Specific registers (Not MIPS Standard)
* and/or bits in other standard MIPS registers (if CPU-Specific)
* that are needed.
*/
/*
* Set up arguments for the rtld-capable crt0:
* a0 stack pointer
* a1 rtld cleanup (filled in by dynamic loader)
* a2 rtld object (filled in by dynamic loader)
* a3 ps_strings
*/
td->td_frame->a0 = (register_t) stack;
td->td_frame->a1 = 0;
td->td_frame->a2 = 0;
td->td_frame->a3 = (register_t)imgp->ps_strings;
td->td_md.md_flags &= ~MDTD_FPUSED;
if (PCPU_GET(fpcurthread) == td)
PCPU_SET(fpcurthread, (struct thread *)0);
td->td_md.md_ss_addr = 0;
}
int
ptrace_clear_single_step(struct thread *td)
{
int i;
struct proc *p;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
if (!td->td_md.md_ss_addr)
return EINVAL;
/*
* Restore original instruction and clear BP
*/
i = ptrace_write_int (td, td->td_md.md_ss_addr, td->td_md.md_ss_instr);
/* The sync'ing of I & D caches is done by procfs_domem(). */
if (i < 0) {
log(LOG_ERR, "SS %s %d: can't restore instruction at %x: %x\n",
p->p_comm, p->p_pid, td->td_md.md_ss_addr,
td->td_md.md_ss_instr);
}
td->td_md.md_ss_addr = 0;
return 0;
}
Index: head/sys/net/route.c
===================================================================
--- head/sys/net/route.c (revision 225616)
+++ head/sys/net/route.c (revision 225617)
@@ -1,1593 +1,1593 @@
/*-
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3.1.1 (Berkeley) 2/23/95
* $FreeBSD$
*/
/************************************************************************
* Note: In this file a 'fib' is a "forwarding information base" *
* which is the new name for an in-kernel routing (next hop) table. *
***********************************************************************/
#include "opt_inet.h"
#include "opt_route.h"
#include "opt_mrouting.h"
#include "opt_mpath.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/vnet.h>
#include <net/flowtable.h>
#ifdef RADIX_MPATH
#include <net/radix_mpath.h>
#endif
#include <netinet/in.h>
#include <netinet/ip_mroute.h>
#include <vm/uma.h>
u_int rt_numfibs = RT_NUMFIBS;
SYSCTL_UINT(_net, OID_AUTO, fibs, CTLFLAG_RD, &rt_numfibs, 0, "");
/*
* Allow the boot code to select fewer than RT_MAXFIBS fibs to be used.
* We can't do more because storage is statically allocated for now.
* (for compatibility reasons.. this will change).
*/
TUNABLE_INT("net.fibs", &rt_numfibs);
/*
* By default add routes to all fibs for new interfaces.
* Once this is set to 0, only allocate routes on interface changes
* for the FIB of the caller when adding a new set of addresses to
* an interface.  XXX this is a shotgun approach to a problem that
* needs a more fine-grained solution.. that will come.
*/
u_int rt_add_addr_allfibs = 1;
SYSCTL_UINT(_net, OID_AUTO, add_addr_allfibs, CTLFLAG_RW,
&rt_add_addr_allfibs, 0, "");
TUNABLE_INT("net.add_addr_allfibs", &rt_add_addr_allfibs);
VNET_DEFINE(struct rtstat, rtstat);
#define V_rtstat VNET(rtstat)
VNET_DEFINE(struct radix_node_head *, rt_tables);
#define V_rt_tables VNET(rt_tables)
VNET_DEFINE(int, rttrash); /* routes not in table but not freed */
#define V_rttrash VNET(rttrash)
/* compare two sockaddr structures */
#define sa_equal(a1, a2) (bcmp((a1), (a2), (a1)->sa_len) == 0)
/*
* Convert a 'struct radix_node *' to a 'struct rtentry *'.
* The operation can be done safely (in this code) because a
* 'struct rtentry' starts with two 'struct radix_node''s, the first
* one representing leaf nodes in the routing tree, which is
* what the code in radix.c passes us as a 'struct radix_node'.
*
* But because there are a lot of assumptions in this conversion,
* do not cast explicitly, but always use the macro below.
*/
#define RNTORT(p) ((struct rtentry *)(p))
static VNET_DEFINE(uma_zone_t, rtzone); /* Routing table UMA zone. */
#define V_rtzone VNET(rtzone)
/*
* handler for net.my_fibnum
*/
static int
sysctl_my_fibnum(SYSCTL_HANDLER_ARGS)
{
int fibnum;
int error;
fibnum = curthread->td_proc->p_fibnum;
error = sysctl_handle_int(oidp, &fibnum, 0, req);
return (error);
}
SYSCTL_PROC(_net, OID_AUTO, my_fibnum, CTLTYPE_INT|CTLFLAG_RD,
NULL, 0, &sysctl_my_fibnum, "I", "default FIB of caller");
static __inline struct radix_node_head **
rt_tables_get_rnh_ptr(int table, int fam)
{
struct radix_node_head **rnh;
KASSERT(table >= 0 && table < rt_numfibs, ("%s: table out of bounds.",
__func__));
KASSERT(fam >= 0 && fam < (AF_MAX+1), ("%s: fam out of bounds.",
__func__));
/* rnh is [fib=0][af=0]. */
rnh = (struct radix_node_head **)V_rt_tables;
/* Get the offset to the requested table and fam. */
rnh += table * (AF_MAX+1) + fam;
return (rnh);
}
struct radix_node_head *
rt_tables_get_rnh(int table, int fam)
{
return (*rt_tables_get_rnh_ptr(table, fam));
}
/*
* route initialization must occur before ip6_init2(), which happens at
* SI_ORDER_MIDDLE.
*/
static void
route_init(void)
{
struct domain *dom;
int max_keylen = 0;
/* whack the tunable ints into line. */
if (rt_numfibs > RT_MAXFIBS)
rt_numfibs = RT_MAXFIBS;
if (rt_numfibs == 0)
rt_numfibs = 1;
for (dom = domains; dom; dom = dom->dom_next)
if (dom->dom_maxrtkey > max_keylen)
max_keylen = dom->dom_maxrtkey;
rn_init(max_keylen); /* init all zeroes, all ones, mask table */
}
SYSINIT(route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, route_init, 0);
static void
vnet_route_init(const void *unused __unused)
{
struct domain *dom;
struct radix_node_head **rnh;
int table;
int fam;
V_rt_tables = malloc(rt_numfibs * (AF_MAX+1) *
sizeof(struct radix_node_head *), M_RTABLE, M_WAITOK|M_ZERO);
V_rtzone = uma_zcreate("rtentry", sizeof(struct rtentry), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtattach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* for now only AF_INET has > 1 table */
/* XXX MRT
* rtattach will be also called
* from vfs_export.c but the
* offset will be 0
* (only for AF_INET and AF_INET6
* which don't need it anyhow)
*/
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtattach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSINIT(vnet_route_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
vnet_route_init, 0);
#ifdef VIMAGE
static void
vnet_route_uninit(const void *unused __unused)
{
int table;
int fam;
struct domain *dom;
struct radix_node_head **rnh;
for (dom = domains; dom; dom = dom->dom_next) {
if (dom->dom_rtdetach) {
for (table = 0; table < rt_numfibs; table++) {
if ( (fam = dom->dom_family) == AF_INET ||
table == 0) {
/* For now only AF_INET has > 1 tbl. */
rnh = rt_tables_get_rnh_ptr(table, fam);
if (rnh == NULL)
panic("%s: rnh NULL", __func__);
dom->dom_rtdetach((void **)rnh,
dom->dom_rtoffset);
} else {
break;
}
}
}
}
}
VNET_SYSUNINIT(vnet_route_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
vnet_route_uninit, 0);
#endif
#ifndef _SYS_SYSPROTO_H_
struct setfib_args {
int fibnum;
};
#endif
int
-setfib(struct thread *td, struct setfib_args *uap)
+sys_setfib(struct thread *td, struct setfib_args *uap)
{
if (uap->fibnum < 0 || uap->fibnum >= rt_numfibs)
return EINVAL;
td->td_proc->p_fibnum = uap->fibnum;
return (0);
}
/*
* Packet routing routines.
*/
void
rtalloc(struct route *ro)
{
rtalloc_ign_fib(ro, 0UL, 0);
}
void
rtalloc_fib(struct route *ro, u_int fibnum)
{
rtalloc_ign_fib(ro, 0UL, fibnum);
}
void
rtalloc_ign(struct route *ro, u_long ignore)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, 0);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
void
rtalloc_ign_fib(struct route *ro, u_long ignore, u_int fibnum)
{
struct rtentry *rt;
if ((rt = ro->ro_rt) != NULL) {
if (rt->rt_ifp != NULL && rt->rt_flags & RTF_UP)
return;
RTFREE(rt);
ro->ro_rt = NULL;
}
ro->ro_rt = rtalloc1_fib(&ro->ro_dst, 1, ignore, fibnum);
if (ro->ro_rt)
RT_UNLOCK(ro->ro_rt);
}
/*
* Look up the route that matches the address given
* Or, at least try.. Create a cloned route if needed.
*
* The returned route, if any, is locked.
*/
struct rtentry *
rtalloc1(struct sockaddr *dst, int report, u_long ignflags)
{
return (rtalloc1_fib(dst, report, ignflags, 0));
}
struct rtentry *
rtalloc1_fib(struct sockaddr *dst, int report, u_long ignflags,
u_int fibnum)
{
struct radix_node_head *rnh;
struct radix_node *rn;
struct rtentry *newrt;
struct rt_addrinfo info;
int err = 0, msgtype = RTM_MISS;
int needlock;
KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
newrt = NULL;
if (rnh == NULL)
goto miss;
/*
* Look up the address in the table for that Address Family
*/
needlock = !(ignflags & RTF_RNH_LOCKED);
if (needlock)
RADIX_NODE_HEAD_RLOCK(rnh);
#ifdef INVARIANTS
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#endif
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
newrt = RNTORT(rn);
RT_LOCK(newrt);
RT_ADDREF(newrt);
if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
goto done;
} else if (needlock)
RADIX_NODE_HEAD_RUNLOCK(rnh);
/*
* Either we hit the root or couldn't find any match,
* which basically means
* "caint get there frm here"
*/
miss:
V_rtstat.rts_unreach++;
if (report) {
/*
* If required, report the failure to the supervising
* Authorities.
* For a delete, this is not an error. (report == 0)
*/
bzero(&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
rt_missmsg(msgtype, &info, 0, err);
}
done:
if (newrt)
RT_LOCK_ASSERT(newrt);
return (newrt);
}
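/*
 * Illustrative sketch (not part of this revision): a hypothetical lookup
 * through rtalloc1_fib() and release of the reference it returns.  As
 * noted above, the returned rtentry is locked and referenced, so the
 * caller must drop both; RTFREE_LOCKED() does exactly that.  The
 * function name example_lookup() is made up for illustration.
 */
static void
example_lookup(struct sockaddr *dst, u_int fibnum)
{
	struct rtentry *rt;

	rt = rtalloc1_fib(dst, 1, 0UL, fibnum);
	if (rt == NULL)
		return;			/* no route; an RTM_MISS was reported */
	/* ... examine rt->rt_gateway, rt->rt_ifp, etc. ... */
	RTFREE_LOCKED(rt);		/* drop the reference and the lock */
}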
/*
* Remove a reference count from an rtentry.
* If the count gets low enough, take it out of the routing table
*/
void
rtfree(struct rtentry *rt)
{
struct radix_node_head *rnh;
KASSERT(rt != NULL,("%s: NULL rt", __func__));
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
KASSERT(rnh != NULL,("%s: NULL rnh", __func__));
RT_LOCK_ASSERT(rt);
/*
* The callers should use RTFREE_LOCKED() or RTFREE(), so
* we should come here exactly with the last reference.
*/
RT_REMREF(rt);
if (rt->rt_refcnt > 0) {
log(LOG_DEBUG, "%s: %p has %d refs\n", __func__, rt, rt->rt_refcnt);
goto done;
}
/*
* On last reference give the "close method" a chance
* to cleanup private state. This also permits (for
* IPv4 and IPv6) a chance to decide if the routing table
* entry should be purged immediately or at a later time.
* When an immediate purge is to happen the close routine
* typically calls rtexpunge which clears the RTF_UP flag
* on the entry so that the code below reclaims the storage.
*/
if (rt->rt_refcnt == 0 && rnh->rnh_close)
rnh->rnh_close((struct radix_node *)rt, rnh);
/*
* If we are no longer "up" (and ref == 0)
* then we can free the resources associated
* with the route.
*/
if ((rt->rt_flags & RTF_UP) == 0) {
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("rtfree 2");
/*
* the rtentry must have been removed from the routing table
* so it is represented in rttrash.. remove that now.
*/
V_rttrash--;
#ifdef DIAGNOSTIC
if (rt->rt_refcnt < 0) {
printf("rtfree: %p not freed (neg refs)\n", rt);
goto done;
}
#endif
/*
* release references on items we hold, e.g. other routes and ifaddrs.
*/
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
/*
* The key is separately alloc'd so free it (see rt_setgate()).
* This also frees the gateway, as they are always malloc'd
* together.
*/
Free(rt_key(rt));
/*
* and the rtentry itself of course
*/
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
return;
}
done:
RT_UNLOCK(rt);
}
/*
* Force a routing table entry to the specified
* destination to go through the given gateway.
* Normally called as a result of a routing redirect
* message from the network layer.
*/
void
rtredirect(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src)
{
rtredirect_fib(dst, gateway, netmask, flags, src, 0);
}
void
rtredirect_fib(struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct sockaddr *src,
u_int fibnum)
{
struct rtentry *rt, *rt0 = NULL;
int error = 0;
short *stat = NULL;
struct rt_addrinfo info;
struct ifaddr *ifa;
struct radix_node_head *rnh;
ifa = NULL;
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL) {
error = EAFNOSUPPORT;
goto out;
}
/* verify the gateway is directly reachable */
if ((ifa = ifa_ifwithnet(gateway, 0)) == NULL) {
error = ENETUNREACH;
goto out;
}
rt = rtalloc1_fib(dst, 0, 0UL, fibnum); /* NB: rt is locked */
/*
* If the redirect isn't from our current router for this dst,
* it's either old or wrong. If it redirects us to ourselves,
* we have a routing loop, perhaps as a result of an interface
* going down recently.
*/
if (!(flags & RTF_DONE) && rt &&
(!sa_equal(src, rt->rt_gateway) || rt->rt_ifa != ifa))
error = EINVAL;
else if (ifa_ifwithaddr_check(gateway))
error = EHOSTUNREACH;
if (error)
goto done;
/*
* Create a new entry if we just got back a wildcard entry
* or the lookup failed. This is necessary for hosts
* which use routing redirects generated by smart gateways
* to dynamically build the routing tables.
*/
if (rt == NULL || (rt_mask(rt) && rt_mask(rt)->sa_len < 2))
goto create;
/*
* Don't listen to the redirect if it's
* for a route to an interface.
*/
if (rt->rt_flags & RTF_GATEWAY) {
if (((rt->rt_flags & RTF_HOST) == 0) && (flags & RTF_HOST)) {
/*
* Changing from route to net => route to host.
* Create new route, rather than smashing route to net.
*/
create:
rt0 = rt;
rt = NULL;
flags |= RTF_GATEWAY | RTF_DYNAMIC;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_ifa = ifa;
info.rti_flags = flags;
if (rt0 != NULL)
RT_UNLOCK(rt0); /* drop lock to avoid LOR with RNH */
error = rtrequest1_fib(RTM_ADD, &info, &rt, fibnum);
if (rt != NULL) {
RT_LOCK(rt);
if (rt0 != NULL)
EVENTHANDLER_INVOKE(route_redirect_event, rt0, rt, dst);
flags = rt->rt_flags;
}
if (rt0 != NULL)
RTFREE(rt0);
stat = &V_rtstat.rts_dynamic;
} else {
struct rtentry *gwrt;
/*
* Smash the current notion of the gateway to
* this destination. Should check about netmask!!!
*/
rt->rt_flags |= RTF_MODIFIED;
flags |= RTF_MODIFIED;
stat = &V_rtstat.rts_newgateway;
/*
* add the key and gateway (in one malloc'd chunk).
*/
RT_UNLOCK(rt);
RADIX_NODE_HEAD_LOCK(rnh);
RT_LOCK(rt);
rt_setgate(rt, rt_key(rt), gateway);
gwrt = rtalloc1(gateway, 1, RTF_RNH_LOCKED);
RADIX_NODE_HEAD_UNLOCK(rnh);
EVENTHANDLER_INVOKE(route_redirect_event, rt, gwrt, dst);
RTFREE_LOCKED(gwrt);
}
} else
error = EHOSTUNREACH;
done:
if (rt)
RTFREE_LOCKED(rt);
out:
if (error)
V_rtstat.rts_badredirect++;
else if (stat != NULL)
(*stat)++;
bzero((caddr_t)&info, sizeof(info));
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
info.rti_info[RTAX_AUTHOR] = src;
rt_missmsg(RTM_REDIRECT, &info, flags, error);
if (ifa != NULL)
ifa_free(ifa);
}
int
rtioctl(u_long req, caddr_t data)
{
return (rtioctl_fib(req, data, 0));
}
/*
* Routing table ioctl interface.
*/
int
rtioctl_fib(u_long req, caddr_t data, u_int fibnum)
{
/*
* If more ioctl commands are added here, make sure the proper
* super-user checks are being performed because it is possible for
* prison-root to make it this far if raw sockets have been enabled
* in jails.
*/
#ifdef INET
/* Multicast goop, grrr... */
return mrt_ioctl ? mrt_ioctl(req, data, fibnum) : EOPNOTSUPP;
#else /* INET */
return ENXIO;
#endif /* INET */
}
/*
* For both ifa_ifwithroute() routines, 'ifa' is returned referenced.
*/
struct ifaddr *
ifa_ifwithroute(int flags, struct sockaddr *dst, struct sockaddr *gateway)
{
return (ifa_ifwithroute_fib(flags, dst, gateway, 0));
}
struct ifaddr *
ifa_ifwithroute_fib(int flags, struct sockaddr *dst, struct sockaddr *gateway,
u_int fibnum)
{
register struct ifaddr *ifa;
int not_found = 0;
if ((flags & RTF_GATEWAY) == 0) {
/*
* If we are adding a route to an interface,
* and the interface is a pt to pt link
* we should search for the destination
* as our clue to the interface. Otherwise
* we can use the local address.
*/
ifa = NULL;
if (flags & RTF_HOST)
ifa = ifa_ifwithdstaddr(dst);
if (ifa == NULL)
ifa = ifa_ifwithaddr(gateway);
} else {
/*
* If we are adding a route to a remote net
* or host, the gateway may still be on the
* other end of a pt to pt link.
*/
ifa = ifa_ifwithdstaddr(gateway);
}
if (ifa == NULL)
ifa = ifa_ifwithnet(gateway, 0);
if (ifa == NULL) {
struct rtentry *rt = rtalloc1_fib(gateway, 0, RTF_RNH_LOCKED, fibnum);
if (rt == NULL)
return (NULL);
/*
* dismiss a gateway that is reachable only
* through the default router
*/
switch (gateway->sa_family) {
case AF_INET:
if (satosin(rt_key(rt))->sin_addr.s_addr == INADDR_ANY)
not_found = 1;
break;
case AF_INET6:
if (IN6_IS_ADDR_UNSPECIFIED(&satosin6(rt_key(rt))->sin6_addr))
not_found = 1;
break;
default:
break;
}
if (!not_found && rt->rt_ifa != NULL) {
ifa = rt->rt_ifa;
ifa_ref(ifa);
}
RT_REMREF(rt);
RT_UNLOCK(rt);
if (not_found || ifa == NULL)
return (NULL);
}
if (ifa->ifa_addr->sa_family != dst->sa_family) {
struct ifaddr *oifa = ifa;
ifa = ifaof_ifpforaddr(dst, ifa->ifa_ifp);
if (ifa == NULL)
ifa = oifa;
else
ifa_free(oifa);
}
return (ifa);
}
/*
* Do appropriate manipulations of a routing tree given
* all the bits of info needed
*/
int
rtrequest(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt)
{
return (rtrequest_fib(req, dst, gateway, netmask, flags, ret_nrt, 0));
}
int
rtrequest_fib(int req,
struct sockaddr *dst,
struct sockaddr *gateway,
struct sockaddr *netmask,
int flags,
struct rtentry **ret_nrt,
u_int fibnum)
{
struct rt_addrinfo info;
if (dst->sa_len == 0)
return(EINVAL);
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = flags;
info.rti_info[RTAX_DST] = dst;
info.rti_info[RTAX_GATEWAY] = gateway;
info.rti_info[RTAX_NETMASK] = netmask;
return rtrequest1_fib(req, &info, ret_nrt, fibnum);
}
/*
* These (questionable) definitions of apparent local variables apply
* to the next two functions. XXXXXX!!!
*/
#define dst info->rti_info[RTAX_DST]
#define gateway info->rti_info[RTAX_GATEWAY]
#define netmask info->rti_info[RTAX_NETMASK]
#define ifaaddr info->rti_info[RTAX_IFA]
#define ifpaddr info->rti_info[RTAX_IFP]
#define flags info->rti_flags
int
rt_getifa(struct rt_addrinfo *info)
{
return (rt_getifa_fib(info, 0));
}
/*
* Look up rt_addrinfo for a specific fib. Note that if rti_ifa is defined,
* it will be referenced so the caller must free it.
*/
int
rt_getifa_fib(struct rt_addrinfo *info, u_int fibnum)
{
struct ifaddr *ifa;
int error = 0;
/*
* ifp may be specified by sockaddr_dl
* when protocol address is ambiguous.
*/
if (info->rti_ifp == NULL && ifpaddr != NULL &&
ifpaddr->sa_family == AF_LINK &&
(ifa = ifa_ifwithnet(ifpaddr, 0)) != NULL) {
info->rti_ifp = ifa->ifa_ifp;
ifa_free(ifa);
}
if (info->rti_ifa == NULL && ifaaddr != NULL)
info->rti_ifa = ifa_ifwithaddr(ifaaddr);
if (info->rti_ifa == NULL) {
struct sockaddr *sa;
sa = ifaaddr != NULL ? ifaaddr :
(gateway != NULL ? gateway : dst);
if (sa != NULL && info->rti_ifp != NULL)
info->rti_ifa = ifaof_ifpforaddr(sa, info->rti_ifp);
else if (dst != NULL && gateway != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, dst, gateway,
fibnum);
else if (sa != NULL)
info->rti_ifa = ifa_ifwithroute_fib(flags, sa, sa,
fibnum);
}
if ((ifa = info->rti_ifa) != NULL) {
if (info->rti_ifp == NULL)
info->rti_ifp = ifa->ifa_ifp;
} else
error = ENETUNREACH;
return (error);
}
/*
* Expunges references to a route that's about to be reclaimed.
* The route must be locked.
*/
int
rtexpunge(struct rtentry *rt)
{
#if !defined(RADIX_MPATH)
struct radix_node *rn;
#else
struct rt_addrinfo info;
int fib;
struct rtentry *rt0;
#endif
struct radix_node_head *rnh;
struct ifaddr *ifa;
int error = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(rt->rt_fibnum, rt_key(rt)->sa_family);
RT_LOCK_ASSERT(rt);
if (rnh == NULL)
return (EAFNOSUPPORT);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
#ifdef RADIX_MPATH
fib = rt->rt_fibnum;
bzero(&info, sizeof(info));
info.rti_ifp = rt->rt_ifp;
info.rti_flags = RTF_RNH_LOCKED;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_ifa->ifa_addr;
RT_UNLOCK(rt);
error = rtrequest1_fib(RTM_DELETE, &info, &rt0, fib);
if (error == 0 && rt0 != NULL) {
rt = rt0;
RT_LOCK(rt);
} else if (error != 0) {
RT_LOCK(rt);
return (error);
}
#else
/*
* Remove the item from the tree; it should be there,
* but when callers invoke us blindly it may not (sigh).
*/
rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
if (rn == NULL) {
error = ESRCH;
goto bad;
}
KASSERT((rn->rn_flags & (RNF_ACTIVE | RNF_ROOT)) == 0,
("unexpected flags 0x%x", rn->rn_flags));
KASSERT(rt == RNTORT(rn),
("lookup mismatch, rt %p rn %p", rt, rn));
#endif /* RADIX_MPATH */
rt->rt_flags &= ~RTF_UP;
/*
* Give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest) {
struct rt_addrinfo info;
bzero((caddr_t)&info, sizeof(info));
info.rti_flags = rt->rt_flags;
info.rti_info[RTAX_DST] = rt_key(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
ifa->ifa_rtrequest(RTM_DELETE, rt, &info);
}
/*
* one more rtentry floating around that is not
* linked to the routing table.
*/
V_rttrash++;
#if !defined(RADIX_MPATH)
bad:
#endif
return (error);
}
#ifdef RADIX_MPATH
static int
rn_mpath_update(int req, struct rt_addrinfo *info,
struct radix_node_head *rnh, struct rtentry **ret_nrt)
{
/*
* if we got multipath routes, we require users to specify
* a matching RTAX_GATEWAY.
*/
struct rtentry *rt, *rto = NULL;
register struct radix_node *rn;
int error = 0;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
return (ESRCH);
rto = rt = RNTORT(rn);
rt = rt_mpath_matchgate(rt, gateway);
if (rt == NULL)
return (ESRCH);
/*
* this is the first entry in the chain
*/
if (rto == rt) {
rn = rn_mpath_next((struct radix_node *)rt);
/*
* there is another entry, now it's active
*/
if (rn) {
rto = RNTORT(rn);
RT_LOCK(rto);
rto->rt_flags |= RTF_UP;
RT_UNLOCK(rto);
} else if (rt->rt_flags & RTF_GATEWAY) {
/*
* For gateway routes, we need to
* make sure that we are deleting
* the correct gateway.
* rt_mpath_matchgate() does not
* check the case when there is only
* one route in the chain.
*/
if (gateway &&
(rt->rt_gateway->sa_len != gateway->sa_len ||
memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
error = ESRCH;
else {
/*
* remove from tree before returning it
* to the caller
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
goto gwdelete;
}
}
/*
* use the normal delete code to remove
* the first entry
*/
if (req != RTM_DELETE)
goto nondelete;
error = ENOENT;
goto done;
}
/*
* if the entry is 2nd and on up
*/
if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
panic ("rtrequest1: rt_mpath_deldup");
gwdelete:
RT_LOCK(rt);
RT_ADDREF(rt);
if (req == RTM_DELETE) {
rt->rt_flags &= ~RTF_UP;
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
}
nondelete:
if (req != RTM_DELETE)
panic("unrecognized request %d", req);
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
done:
return (error);
}
#endif
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
{
int error = 0, needlock = 0;
register struct rtentry *rt;
#ifdef FLOWTABLE
register struct rtentry *rt0;
#endif
register struct radix_node *rn;
register struct radix_node_head *rnh;
struct ifaddr *ifa;
struct sockaddr *ndst;
#define senderr(x) { error = x ; goto bad; }
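/*
* senderr(x) records the error and jumps to the common unlock/return
* path at "bad" below.
*/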
KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
if (dst->sa_family != AF_INET) /* Only INET supports > 1 fib now */
fibnum = 0;
/*
* Find the correct routing tree to use for this Address Family
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
return (EAFNOSUPPORT);
needlock = ((flags & RTF_RNH_LOCKED) == 0);
flags &= ~RTF_RNH_LOCKED;
if (needlock)
RADIX_NODE_HEAD_LOCK(rnh);
else
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* If we are adding a host route then we don't want to put
* a netmask in the tree, nor do we want to clone it.
*/
if (flags & RTF_HOST)
netmask = NULL;
switch (req) {
case RTM_DELETE:
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
error = rn_mpath_update(req, info, rnh, ret_nrt);
/*
* "bad" holds true for the success case
* as well
*/
if (error != ENOENT)
goto bad;
error = 0;
}
#endif
/*
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
*/
rn = rnh->rnh_deladdr(dst, netmask, rnh);
if (rn == NULL)
senderr(ESRCH);
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic ("rtrequest delete");
rt = RNTORT(rn);
RT_LOCK(rt);
RT_ADDREF(rt);
rt->rt_flags &= ~RTF_UP;
/*
* give the protocol a chance to keep things in sync.
*/
if ((ifa = rt->rt_ifa) && ifa->ifa_rtrequest)
ifa->ifa_rtrequest(RTM_DELETE, rt, info);
/*
* One more rtentry floating around that is not
* linked to the routing table. rttrash will be decremented
* when RTFREE(rt) is eventually called.
*/
V_rttrash++;
/*
* If the caller wants it, then it can have it,
* but it's up to it to free the rtentry as we won't be
* doing it.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_UNLOCK(rt);
} else
RTFREE_LOCKED(rt);
break;
case RTM_RESOLVE:
/*
* resolve was only used for route cloning;
* kept here for compatibility
*/
break;
case RTM_ADD:
if ((flags & RTF_GATEWAY) && !gateway)
senderr(EINVAL);
if (dst && gateway && (dst->sa_family != gateway->sa_family) &&
(gateway->sa_family != AF_UNSPEC) && (gateway->sa_family != AF_LINK))
senderr(EINVAL);
if (info->rti_ifa == NULL) {
error = rt_getifa_fib(info, fibnum);
if (error)
senderr(error);
} else
ifa_ref(info->rti_ifa);
ifa = info->rti_ifa;
rt = uma_zalloc(V_rtzone, M_NOWAIT | M_ZERO);
if (rt == NULL) {
if (ifa != NULL)
ifa_free(ifa);
senderr(ENOBUFS);
}
RT_LOCK_INIT(rt);
rt->rt_flags = RTF_UP | flags;
rt->rt_fibnum = fibnum;
/*
* Add the gateway. Possibly re-malloc-ing the storage for it.
*/
RT_LOCK(rt);
if ((error = rt_setgate(rt, dst, gateway)) != 0) {
RT_LOCK_DESTROY(rt);
if (ifa != NULL)
ifa_free(ifa);
uma_zfree(V_rtzone, rt);
senderr(error);
}
/*
* point to the (possibly newly malloc'd) dest address.
*/
ndst = (struct sockaddr *)rt_key(rt);
/*
* make sure it contains the value we want (masked if needed).
*/
if (netmask) {
rt_maskedcopy(dst, ndst, netmask);
} else
bcopy(dst, ndst, dst->sa_len);
/*
* We use the ifa reference returned by rt_getifa_fib().
* This moved from below so that rnh->rnh_addaddr() can
* examine the ifa and ifa->ifa_ifp if it so desires.
*/
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_rmx.rmx_weight = 1;
#ifdef RADIX_MPATH
/* do not permit exactly the same dst/mask/gw pair */
if (rn_mpath_capable(rnh) &&
rt_mpath_conflict(rnh, rt, netmask)) {
if (rt->rt_ifa) {
ifa_free(rt->rt_ifa);
}
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
senderr(EEXIST);
}
#endif
#ifdef FLOWTABLE
rt0 = NULL;
/* XXX
* "flow-table" only support IPv4 at the moment.
* XXX-BZ as of r205066 it would support IPv6.
*/
#ifdef INET
if (dst->sa_family == AF_INET) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
struct sockaddr *mask;
u_char *m, *n;
int len;
/*
* compare mask to see if the new route is
* more specific than the existing one
*/
rt0 = RNTORT(rn);
RT_LOCK(rt0);
RT_ADDREF(rt0);
RT_UNLOCK(rt0);
/*
* A host route is already present, so
* leave the flow-table entries as is.
*/
if (rt0->rt_flags & RTF_HOST) {
RTFREE(rt0);
rt0 = NULL;
} else if (!(flags & RTF_HOST) && netmask) {
mask = rt_mask(rt0);
len = mask->sa_len;
m = (u_char *)mask;
n = (u_char *)netmask;
while (len-- > 0) {
if (*n != *m)
break;
n++;
m++;
}
if (len == 0 || (*n < *m)) {
RTFREE(rt0);
rt0 = NULL;
}
}
}
}
#endif
#endif
/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
/*
* If it still failed to go into the tree,
* then un-make it (this should be a function)
*/
if (rn == NULL) {
if (rt->rt_ifa)
ifa_free(rt->rt_ifa);
Free(rt_key(rt));
RT_LOCK_DESTROY(rt);
uma_zfree(V_rtzone, rt);
#ifdef FLOWTABLE
if (rt0 != NULL)
RTFREE(rt0);
#endif
senderr(EEXIST);
}
#ifdef FLOWTABLE
else if (rt0 != NULL) {
#ifdef INET
flowtable_route_flush(V_ip_ft, rt0);
#endif
RTFREE(rt0);
}
#endif
/*
* If this protocol has something to add to this then
* allow it to do that as well.
*/
if (ifa->ifa_rtrequest)
ifa->ifa_rtrequest(req, rt, info);
/*
* actually return a resultant rtentry and
* give the caller a single reference.
*/
if (ret_nrt) {
*ret_nrt = rt;
RT_ADDREF(rt);
}
RT_UNLOCK(rt);
break;
default:
error = EOPNOTSUPP;
}
bad:
if (needlock)
RADIX_NODE_HEAD_UNLOCK(rnh);
return (error);
#undef senderr
}
#undef dst
#undef gateway
#undef netmask
#undef ifaaddr
#undef ifpaddr
#undef flags
int
rt_setgate(struct rtentry *rt, struct sockaddr *dst, struct sockaddr *gate)
{
/* XXX dst may be overwritten, can we move this to below */
int dlen = SA_SIZE(dst), glen = SA_SIZE(gate);
#ifdef INVARIANTS
struct radix_node_head *rnh;
rnh = rt_tables_get_rnh(rt->rt_fibnum, dst->sa_family);
#endif
RT_LOCK_ASSERT(rt);
RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
/*
* Prepare to store the gateway in rt->rt_gateway.
* Both dst and gateway are stored one after the other in the same
* malloc'd chunk. If we have room, we can reuse the old buffer,
* rt_gateway already points to the right place.
* Otherwise, malloc a new block and update the 'dst' address.
*/
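/*
* Rough sketch of the shared allocation: [ dst (dlen bytes) | gateway
* (glen bytes) ], with rt_key(rt) pointing at offset 0 and
* rt->rt_gateway at offset dlen.
*/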
if (rt->rt_gateway == NULL || glen > SA_SIZE(rt->rt_gateway)) {
caddr_t new;
R_Malloc(new, caddr_t, dlen + glen);
if (new == NULL)
return ENOBUFS;
/*
* XXX note, we copy from *dst and not *rt_key(rt) because
* rt_setgate() can be called to initialize a newly
* allocated route entry, in which case rt_key(rt) == NULL
* (and also rt->rt_gateway == NULL).
* Free()/free() handle a NULL argument just fine.
*/
bcopy(dst, new, dlen);
Free(rt_key(rt)); /* free old block, if any */
rt_key(rt) = (struct sockaddr *)new;
rt->rt_gateway = (struct sockaddr *)(new + dlen);
}
/*
* Copy the new gateway value into the memory chunk.
*/
bcopy(gate, rt->rt_gateway, glen);
return (0);
}
void
rt_maskedcopy(struct sockaddr *src, struct sockaddr *dst, struct sockaddr *netmask)
{
register u_char *cp1 = (u_char *)src;
register u_char *cp2 = (u_char *)dst;
register u_char *cp3 = (u_char *)netmask;
u_char *cplim = cp2 + *cp3;
u_char *cplim2 = cp2 + *cp1;
*cp2++ = *cp1++; *cp2++ = *cp1++; /* copies sa_len & sa_family */
cp3 += 2;
if (cplim > cplim2)
cplim = cplim2;
while (cp2 < cplim)
*cp2++ = *cp1++ & *cp3++;
if (cp2 < cplim2)
bzero((caddr_t)cp2, (unsigned)(cplim2 - cp2));
}
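/*
* Example (AF_INET): masking 192.0.2.77 with a 255.255.255.0 netmask
* leaves 192.0.2.0 in dst's address bytes, and anything in dst beyond
* the netmask length is zeroed.
*/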
/*
* Set up a routing table entry, normally
* for an interface.
*/
#define _SOCKADDR_TMPSIZE 128 /* Not too big.. kernel stack size is limited */
static inline int
rtinit1(struct ifaddr *ifa, int cmd, int flags, int fibnum)
{
struct sockaddr *dst;
struct sockaddr *netmask;
struct rtentry *rt = NULL;
struct rt_addrinfo info;
int error = 0;
int startfib, endfib;
char tempbuf[_SOCKADDR_TMPSIZE];
int didwork = 0;
int a_failure = 0;
static struct sockaddr_dl null_sdl = {sizeof(null_sdl), AF_LINK};
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
netmask = NULL;
} else {
dst = ifa->ifa_addr;
netmask = ifa->ifa_netmask;
}
if (dst->sa_family != AF_INET)
fibnum = 0;
if (fibnum == -1) {
if (rt_add_addr_allfibs == 0 && cmd == (int)RTM_ADD) {
startfib = endfib = curthread->td_proc->p_fibnum;
} else {
startfib = 0;
endfib = rt_numfibs - 1;
}
} else {
KASSERT((fibnum < rt_numfibs), ("rtinit1: bad fibnum"));
startfib = fibnum;
endfib = fibnum;
}
if (dst->sa_len == 0)
return(EINVAL);
/*
* If it's a delete, check that if it exists,
* it's on the correct interface or we might scrub
* a route to another ifa which would
* be confusing at best and possibly worse.
*/
if (cmd == RTM_DELETE) {
/*
* It's a delete, so it should already exist..
* If it's a net, mask off the host bits
* (Assuming we have a mask)
* XXX this is kinda inet specific..
*/
if (netmask != NULL) {
rt_maskedcopy(dst, (struct sockaddr *)tempbuf, netmask);
dst = (struct sockaddr *)tempbuf;
}
}
/*
* Now go through all the requested tables (fibs) and do the
* requested action. Realistically, this will either be fib 0
* for protocols that don't do multiple tables or all the
* tables for those that do. XXX For this version only AF_INET.
* When that changes code should be refactored to protocol
* independent parts and protocol dependent parts.
*/
for (fibnum = startfib; fibnum <= endfib; fibnum++) {
if (cmd == RTM_DELETE) {
struct radix_node_head *rnh;
struct radix_node *rn;
/*
* Look up an rtentry that is in the routing tree and
* contains the correct info.
*/
rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
if (rnh == NULL)
/* this table doesn't exist but others might */
continue;
RADIX_NODE_HEAD_LOCK(rnh);
#ifdef RADIX_MPATH
if (rn_mpath_capable(rnh)) {
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL)
error = ESRCH;
else {
rt = RNTORT(rn);
/*
* for interface route the
* rt->rt_gateway is sockaddr_intf
* for cloning ARP entries, so
* rt_mpath_matchgate must use the
* interface address
*/
rt = rt_mpath_matchgate(rt,
ifa->ifa_addr);
if (!rt)
error = ESRCH;
}
}
else
#endif
rn = rnh->rnh_lookup(dst, netmask, rnh);
error = (rn == NULL ||
(rn->rn_flags & RNF_ROOT) ||
RNTORT(rn)->rt_ifa != ifa ||
!sa_equal((struct sockaddr *)rn->rn_key, dst));
RADIX_NODE_HEAD_UNLOCK(rnh);
if (error) {
/* this is only an error if bad on ALL tables */
continue;
}
}
/*
* Do the actual request
*/
bzero((caddr_t)&info, sizeof(info));
info.rti_ifa = ifa;
info.rti_flags = flags | (ifa->ifa_flags & ~IFA_RTSELF);
info.rti_info[RTAX_DST] = dst;
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD)
info.rti_info[RTAX_GATEWAY] =
(struct sockaddr *)&null_sdl;
else
info.rti_info[RTAX_GATEWAY] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = netmask;
error = rtrequest1_fib(cmd, &info, &rt, fibnum);
if (error == 0 && rt != NULL) {
/*
* notify any listening routing agents of the change
*/
RT_LOCK(rt);
#ifdef RADIX_MPATH
/*
* in case address alias finds the first address
* e.g. ifconfig bge0 192.103.54.246/24
* e.g. ifconfig bge0 192.103.54.247/24
* the address set in the route is 192.103.54.246
* so we need to replace it with 192.103.54.247
*/
if (memcmp(rt->rt_ifa->ifa_addr,
ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
ifa_free(rt->rt_ifa);
ifa_ref(ifa);
rt->rt_ifp = ifa->ifa_ifp;
rt->rt_ifa = ifa;
}
#endif
/*
* doing this for compatibility reasons
*/
if (cmd == RTM_ADD) {
((struct sockaddr_dl *)rt->rt_gateway)->sdl_type =
rt->rt_ifp->if_type;
((struct sockaddr_dl *)rt->rt_gateway)->sdl_index =
rt->rt_ifp->if_index;
}
RT_ADDREF(rt);
RT_UNLOCK(rt);
rt_newaddrmsg(cmd, ifa, error, rt);
RT_LOCK(rt);
RT_REMREF(rt);
if (cmd == RTM_DELETE) {
/*
* If we are deleting, and we found an entry,
* then it's been removed from the tree..
* now throw it away.
*/
RTFREE_LOCKED(rt);
} else {
if (cmd == RTM_ADD) {
/*
* We just wanted to add it..
* we don't actually need a reference.
*/
RT_REMREF(rt);
}
RT_UNLOCK(rt);
}
didwork = 1;
}
if (error)
a_failure = error;
}
if (cmd == RTM_DELETE) {
if (didwork) {
error = 0;
} else {
/* we only give an error if it wasn't in any table */
error = ((flags & RTF_HOST) ?
EHOSTUNREACH : ENETUNREACH);
}
} else {
if (a_failure) {
/* return an error if any of them failed */
error = a_failure;
}
}
return (error);
}
/* special one for inet internal use; other code may not use it. */
int
rtinit_fib(struct ifaddr *ifa, int cmd, int flags)
{
return (rtinit1(ifa, cmd, flags, -1));
}
/*
* Set up a routing table entry, normally
* for an interface.
*/
int
rtinit(struct ifaddr *ifa, int cmd, int flags)
{
struct sockaddr *dst;
int fib = 0;
if (flags & RTF_HOST) {
dst = ifa->ifa_dstaddr;
} else {
dst = ifa->ifa_addr;
}
if (dst->sa_family == AF_INET)
fib = -1;
return (rtinit1(ifa, cmd, flags, fib));
}
Index: head/sys/nfs/nfs_nfssvc.c
===================================================================
--- head/sys/nfs/nfs_nfssvc.c (revision 225616)
+++ head/sys/nfs/nfs_nfssvc.c (revision 225617)
@@ -1,156 +1,156 @@
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Rick Macklem at The University of Guelph.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_nfs.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/module.h>
#include <sys/sysent.h>
#include <sys/syscall.h>
#include <sys/sysproto.h>
#include <security/audit/audit.h>
#include <nfs/nfssvc.h>
static int nfssvc_offset = SYS_nfssvc;
static struct sysent nfssvc_prev_sysent;
MAKE_SYSENT(nfssvc);
/*
* This tiny module simply handles the nfssvc() system call. The other
* nfs modules that use the system call register themselves by setting
* the nfsd_call_xxx function pointers non-NULL.
*/
int (*nfsd_call_nfsserver)(struct thread *, struct nfssvc_args *) = NULL;
int (*nfsd_call_nfscommon)(struct thread *, struct nfssvc_args *) = NULL;
int (*nfsd_call_nfscl)(struct thread *, struct nfssvc_args *) = NULL;
int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *) = NULL;
/*
* NFS server pseudo system call for the nfsd's
*/
int
-nfssvc(struct thread *td, struct nfssvc_args *uap)
+sys_nfssvc(struct thread *td, struct nfssvc_args *uap)
{
int error;
KASSERT(!mtx_owned(&Giant), ("nfssvc(): called with Giant"));
AUDIT_ARG_CMD(uap->flag);
/* Allow anyone to get the stats. */
if ((uap->flag & ~NFSSVC_GETSTATS) != 0) {
error = priv_check(td, PRIV_NFS_DAEMON);
if (error != 0)
return (error);
}
error = EINVAL;
if ((uap->flag & (NFSSVC_ADDSOCK | NFSSVC_OLDNFSD | NFSSVC_NFSD)) &&
nfsd_call_nfsserver != NULL)
error = (*nfsd_call_nfsserver)(td, uap);
else if ((uap->flag & (NFSSVC_CBADDSOCK | NFSSVC_NFSCBD)) &&
nfsd_call_nfscl != NULL)
error = (*nfsd_call_nfscl)(td, uap);
else if ((uap->flag & (NFSSVC_IDNAME | NFSSVC_GETSTATS |
NFSSVC_GSSDADDPORT | NFSSVC_GSSDADDFIRST | NFSSVC_GSSDDELETEALL |
NFSSVC_NFSUSERDPORT | NFSSVC_NFSUSERDDELPORT)) &&
nfsd_call_nfscommon != NULL)
error = (*nfsd_call_nfscommon)(td, uap);
else if ((uap->flag & (NFSSVC_NFSDNFSD | NFSSVC_NFSDADDSOCK |
NFSSVC_PUBLICFH | NFSSVC_V4ROOTEXPORT | NFSSVC_NOPUBLICFH |
NFSSVC_STABLERESTART | NFSSVC_ADMINREVOKE |
NFSSVC_DUMPCLIENTS | NFSSVC_DUMPLOCKS | NFSSVC_BACKUPSTABLE)) &&
nfsd_call_nfsd != NULL)
error = (*nfsd_call_nfsd)(td, uap);
if (error == EINTR || error == ERESTART)
error = 0;
return (error);
}
/*
* Called once to initialize data structures...
*/
static int
nfssvc_modevent(module_t mod, int type, void *data)
{
static int registered;
int error = 0;
switch (type) {
case MOD_LOAD:
error = syscall_register(&nfssvc_offset, &nfssvc_sysent,
&nfssvc_prev_sysent);
if (error)
break;
registered = 1;
break;
case MOD_UNLOAD:
if (nfsd_call_nfsserver != NULL || nfsd_call_nfscommon != NULL
|| nfsd_call_nfscl != NULL || nfsd_call_nfsd != NULL) {
error = EBUSY;
break;
}
if (registered)
syscall_deregister(&nfssvc_offset, &nfssvc_prev_sysent);
registered = 0;
break;
default:
error = EOPNOTSUPP;
break;
}
return error;
}
static moduledata_t nfssvc_mod = {
"nfssvc",
nfssvc_modevent,
NULL,
};
DECLARE_MODULE(nfssvc, nfssvc_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_VERSION(nfssvc, 1);
Index: head/sys/nlm/nlm_prot_impl.c
===================================================================
--- head/sys/nlm/nlm_prot_impl.c (revision 225616)
+++ head/sys/nlm/nlm_prot_impl.c (revision 225617)
@@ -1,2434 +1,2434 @@
/*-
* Copyright (c) 2008 Isilon Inc http://www.isilon.com/
* Authors: Doug Rabson <dfr@rabson.org>
* Developed with Red Inc: Alfred Perlstein <alfred@freebsd.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "opt_inet6.h"
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/fail.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#if __FreeBSD_version >= 700000
#include <sys/priv.h>
#endif
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syscall.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/syslog.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs_lock.h>
#include <nlm/nlm_prot.h>
#include <nlm/sm_inter.h>
#include <nlm/nlm.h>
#include <rpc/rpc_com.h>
#include <rpc/rpcb_prot.h>
MALLOC_DEFINE(M_NLM, "NLM", "Network Lock Manager");
/*
* If a host is inactive (and holds no locks) for this many
* seconds, we consider it idle and stop tracking it.
*/
#define NLM_IDLE_TIMEOUT 30
/*
* We check the host list for idle every few seconds.
*/
#define NLM_IDLE_PERIOD 5
/*
* We only look for GRANTED_RES messages for a little while.
*/
#define NLM_EXPIRE_TIMEOUT 10
/*
* Support for sysctl vfs.nlm.sysid
*/
SYSCTL_NODE(_vfs, OID_AUTO, nlm, CTLFLAG_RW, NULL, "Network Lock Manager");
SYSCTL_NODE(_vfs_nlm, OID_AUTO, sysid, CTLFLAG_RW, NULL, "");
/*
* Syscall hooks
*/
static int nlm_syscall_offset = SYS_nlm_syscall;
static struct sysent nlm_syscall_prev_sysent;
#if __FreeBSD_version < 700000
static struct sysent nlm_syscall_sysent = {
(sizeof(struct nlm_syscall_args) / sizeof(register_t)) | SYF_MPSAFE,
(sy_call_t *) nlm_syscall
};
#else
MAKE_SYSENT(nlm_syscall);
#endif
static bool_t nlm_syscall_registered = FALSE;
/*
* Debug level passed in from userland. We also support a sysctl hook
* so that it can be changed on a live system.
*/
static int nlm_debug_level;
SYSCTL_INT(_debug, OID_AUTO, nlm_debug, CTLFLAG_RW, &nlm_debug_level, 0, "");
#define NLM_DEBUG(_level, args...) \
do { \
if (nlm_debug_level >= (_level)) \
log(LOG_DEBUG, args); \
} while(0)
#define NLM_ERR(args...) \
do { \
log(LOG_ERR, args); \
} while(0)
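/*
* For example, NLM_DEBUG(2, "NLM: ...\n", ...) only logs when the
* debug.nlm_debug sysctl is set to 2 or higher, while NLM_ERR()
* always logs at LOG_ERR.
*/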
/*
* Grace period handling. The value of nlm_grace_threshold is the
* value of time_uptime after which we are serving requests normally.
*/
static time_t nlm_grace_threshold;
/*
* We check for idle hosts if time_uptime is greater than
* nlm_next_idle_check.
*/
static time_t nlm_next_idle_check;
/*
* A socket to use for RPC - shared by all IPv4 RPC clients.
*/
static struct socket *nlm_socket;
#ifdef INET6
/*
* A socket to use for RPC - shared by all IPv6 RPC clients.
*/
static struct socket *nlm_socket6;
#endif
/*
* An RPC client handle that can be used to communicate with the local
* NSM.
*/
static CLIENT *nlm_nsm;
/*
* An AUTH handle for the server's creds.
*/
static AUTH *nlm_auth;
/*
* A zero timeval for sending async RPC messages.
*/
struct timeval nlm_zero_tv = { 0, 0 };
/*
* The local NSM state number
*/
int nlm_nsm_state;
/*
* A lock to protect the host list and waiting lock list.
*/
static struct mtx nlm_global_lock;
/*
* Locks:
* (l) locked by nh_lock
* (s) only accessed via server RPC which is single threaded
* (g) locked by nlm_global_lock
* (c) const until freeing
* (a) modified using atomic ops
*/
/*
* A pending client-side lock request, stored on the nlm_waiting_locks
* list.
*/
struct nlm_waiting_lock {
TAILQ_ENTRY(nlm_waiting_lock) nw_link; /* (g) */
bool_t nw_waiting; /* (g) */
nlm4_lock nw_lock; /* (c) */
union nfsfh nw_fh; /* (c) */
struct vnode *nw_vp; /* (c) */
};
TAILQ_HEAD(nlm_waiting_lock_list, nlm_waiting_lock);
struct nlm_waiting_lock_list nlm_waiting_locks; /* (g) */
/*
* A pending server-side asynchronous lock request, stored on the
* nh_pending list of the NLM host.
*/
struct nlm_async_lock {
TAILQ_ENTRY(nlm_async_lock) af_link; /* (l) host's list of locks */
struct task af_task; /* (c) async callback details */
void *af_cookie; /* (l) lock manager cancel token */
struct vnode *af_vp; /* (l) vnode to lock */
struct flock af_fl; /* (c) lock details */
struct nlm_host *af_host; /* (c) host which is locking */
CLIENT *af_rpc; /* (c) rpc client to send message */
nlm4_testargs af_granted; /* (c) notification details */
time_t af_expiretime; /* (c) notification time */
};
TAILQ_HEAD(nlm_async_lock_list, nlm_async_lock);
/*
* NLM host.
*/
enum nlm_host_state {
NLM_UNMONITORED,
NLM_MONITORED,
NLM_MONITOR_FAILED,
NLM_RECOVERING
};
struct nlm_rpc {
CLIENT *nr_client; /* (l) RPC client handle */
time_t nr_create_time; /* (l) when client was created */
};
struct nlm_host {
struct mtx nh_lock;
volatile u_int nh_refs; /* (a) reference count */
TAILQ_ENTRY(nlm_host) nh_link; /* (g) global list of hosts */
char nh_caller_name[MAXNAMELEN]; /* (c) printable name of host */
uint32_t nh_sysid; /* (c) our allocated system ID */
char nh_sysid_string[10]; /* (c) string rep. of sysid */
struct sockaddr_storage nh_addr; /* (s) remote address of host */
struct nlm_rpc nh_srvrpc; /* (l) RPC for server replies */
struct nlm_rpc nh_clntrpc; /* (l) RPC for client requests */
rpcvers_t nh_vers; /* (s) NLM version of host */
int nh_state; /* (s) last seen NSM state of host */
enum nlm_host_state nh_monstate; /* (l) local NSM monitoring state */
time_t nh_idle_timeout; /* (s) Time at which host is idle */
struct sysctl_ctx_list nh_sysctl; /* (c) vfs.nlm.sysid nodes */
uint32_t nh_grantcookie; /* (l) grant cookie counter */
struct nlm_async_lock_list nh_pending; /* (l) pending async locks */
struct nlm_async_lock_list nh_granted; /* (l) granted locks */
struct nlm_async_lock_list nh_finished; /* (l) finished async locks */
};
TAILQ_HEAD(nlm_host_list, nlm_host);
static struct nlm_host_list nlm_hosts; /* (g) */
static uint32_t nlm_next_sysid = 1; /* (g) */
static void nlm_host_unmonitor(struct nlm_host *);
struct nlm_grantcookie {
uint32_t ng_sysid;
uint32_t ng_cookie;
};
static inline uint32_t
ng_sysid(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_sysid;
}
static inline uint32_t
ng_cookie(struct netobj *src)
{
return ((struct nlm_grantcookie *)src->n_bytes)->ng_cookie;
}
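/*
* A grant cookie combines the host's sysid with a per-host counter
* (nh_grantcookie), so granted-lock replies and debug output can be
* traced back to the originating host and request.
*/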
/**********************************************************************/
/*
* Initialise NLM globals.
*/
static void
nlm_init(void *dummy)
{
int error;
mtx_init(&nlm_global_lock, "nlm_global_lock", NULL, MTX_DEF);
TAILQ_INIT(&nlm_waiting_locks);
TAILQ_INIT(&nlm_hosts);
error = syscall_register(&nlm_syscall_offset, &nlm_syscall_sysent,
&nlm_syscall_prev_sysent);
if (error)
NLM_ERR("Can't register NLM syscall\n");
else
nlm_syscall_registered = TRUE;
}
SYSINIT(nlm_init, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_init, NULL);
static void
nlm_uninit(void *dummy)
{
if (nlm_syscall_registered)
syscall_deregister(&nlm_syscall_offset,
&nlm_syscall_prev_sysent);
}
SYSUNINIT(nlm_uninit, SI_SUB_LOCK, SI_ORDER_FIRST, nlm_uninit, NULL);
/*
* Create a netobj from an arbitrary source.
*/
void
nlm_make_netobj(struct netobj *dst, caddr_t src, size_t srcsize,
struct malloc_type *type)
{
dst->n_len = srcsize;
dst->n_bytes = malloc(srcsize, type, M_WAITOK);
memcpy(dst->n_bytes, src, srcsize);
}
/*
* Copy a struct netobj.
*/
void
nlm_copy_netobj(struct netobj *dst, struct netobj *src,
struct malloc_type *type)
{
nlm_make_netobj(dst, src->n_bytes, src->n_len, type);
}
/*
* Create an RPC client handle for the given (address,prog,vers)
* triple using UDP.
*/
static CLIENT *
nlm_get_rpc(struct sockaddr *sa, rpcprog_t prog, rpcvers_t vers)
{
char *wchan = "nlmrcv";
const char* protofmly;
struct sockaddr_storage ss;
struct socket *so;
CLIENT *rpcb;
struct timeval timo;
RPCB parms;
char *uaddr;
enum clnt_stat stat = RPC_SUCCESS;
int rpcvers = RPCBVERS4;
bool_t do_tcp = FALSE;
bool_t tryagain = FALSE;
struct portmap mapping;
u_short port = 0;
/*
* First we need to contact the remote RPCBIND service to find
* the right port.
*/
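/*
* The lookup below falls back from RPCBIND version 4 to version 3 and
* finally to the old portmap protocol; if the UDP service turns out
* not to be registered, the whole query is retried asking for TCP.
*/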
memcpy(&ss, sa, sa->sa_len);
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port = htons(111);
protofmly = "inet";
so = nlm_socket;
break;
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port = htons(111);
protofmly = "inet6";
so = nlm_socket6;
break;
#endif
default:
/*
* Unsupported address family - fail.
*/
return (NULL);
}
rpcb = clnt_dg_create(so, (struct sockaddr *)&ss,
RPCBPROG, rpcvers, 0, 0);
if (!rpcb)
return (NULL);
try_tcp:
parms.r_prog = prog;
parms.r_vers = vers;
if (do_tcp)
parms.r_netid = "tcp";
else
parms.r_netid = "udp";
parms.r_addr = "";
parms.r_owner = "";
/*
* Use the default timeout.
*/
timo.tv_sec = 25;
timo.tv_usec = 0;
again:
switch (rpcvers) {
case RPCBVERS4:
case RPCBVERS:
/*
* Try RPCBIND 4 then 3.
*/
uaddr = NULL;
stat = CLNT_CALL(rpcb, (rpcprog_t) RPCBPROC_GETADDR,
(xdrproc_t) xdr_rpcb, &parms,
(xdrproc_t) xdr_wrapstring, &uaddr, timo);
if (stat == RPC_SUCCESS) {
/*
* We have a reply from the remote RPCBIND - turn it
* into an appropriate address and make a new client
* that can talk to the remote NLM.
*
* XXX fixup IPv6 scope ID.
*/
struct netbuf *a;
a = __rpc_uaddr2taddr_af(ss.ss_family, uaddr);
if (!a) {
tryagain = TRUE;
} else {
tryagain = FALSE;
memcpy(&ss, a->buf, a->len);
free(a->buf, M_RPC);
free(a, M_RPC);
xdr_free((xdrproc_t) xdr_wrapstring, &uaddr);
}
}
if (tryagain || stat == RPC_PROGVERSMISMATCH) {
if (rpcvers == RPCBVERS4)
rpcvers = RPCBVERS;
else if (rpcvers == RPCBVERS)
rpcvers = PMAPVERS;
CLNT_CONTROL(rpcb, CLSET_VERS, &rpcvers);
goto again;
}
break;
case PMAPVERS:
/*
* Try portmap.
*/
mapping.pm_prog = parms.r_prog;
mapping.pm_vers = parms.r_vers;
mapping.pm_prot = do_tcp ? IPPROTO_TCP : IPPROTO_UDP;
mapping.pm_port = 0;
stat = CLNT_CALL(rpcb, (rpcprog_t) PMAPPROC_GETPORT,
(xdrproc_t) xdr_portmap, &mapping,
(xdrproc_t) xdr_u_short, &port, timo);
if (stat == RPC_SUCCESS) {
switch (ss.ss_family) {
case AF_INET:
((struct sockaddr_in *)&ss)->sin_port =
htons(port);
break;
#ifdef INET6
case AF_INET6:
((struct sockaddr_in6 *)&ss)->sin6_port =
htons(port);
break;
#endif
}
}
break;
default:
panic("invalid rpcvers %d", rpcvers);
}
/*
* We may have a positive response from the portmapper, but the NLM
* service was not found. Make sure we received a valid port.
*/
switch (ss.ss_family) {
case AF_INET:
port = ((struct sockaddr_in *)&ss)->sin_port;
break;
#ifdef INET6
case AF_INET6:
port = ((struct sockaddr_in6 *)&ss)->sin6_port;
break;
#endif
}
if (stat != RPC_SUCCESS || !port) {
/*
* If we were able to talk to rpcbind or portmap, but the udp
* variant wasn't available, ask about tcp.
*
* XXX - We could also check for a TCP portmapper, but
* if the host is running a portmapper at all, we should be able
* to hail it over UDP.
*/
if (stat == RPC_SUCCESS && !do_tcp) {
do_tcp = TRUE;
goto try_tcp;
}
/* Otherwise, bad news. */
NLM_ERR("NLM: failed to contact remote rpcbind, "
"stat = %d, port = %d\n", (int) stat, port);
CLNT_DESTROY(rpcb);
return (NULL);
}
if (do_tcp) {
/*
* Destroy the UDP client we used to speak to rpcbind and
* recreate as a TCP client.
*/
struct netconfig *nconf = NULL;
CLNT_DESTROY(rpcb);
switch (ss.ss_family) {
case AF_INET:
nconf = getnetconfigent("tcp");
break;
#ifdef INET6
case AF_INET6:
nconf = getnetconfigent("tcp6");
break;
#endif
}
rpcb = clnt_reconnect_create(nconf, (struct sockaddr *)&ss,
prog, vers, 0, 0);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
} else {
/*
* Re-use the client we used to speak to rpcbind.
*/
CLNT_CONTROL(rpcb, CLSET_SVC_ADDR, &ss);
CLNT_CONTROL(rpcb, CLSET_PROG, &prog);
CLNT_CONTROL(rpcb, CLSET_VERS, &vers);
CLNT_CONTROL(rpcb, CLSET_WAITCHAN, wchan);
rpcb->cl_auth = nlm_auth;
}
return (rpcb);
}
/*
* This async callback runs after an async lock request has been
* granted. We notify the host which initiated the request.
*/
static void
nlm_lock_callback(void *arg, int pending)
{
struct nlm_async_lock *af = (struct nlm_async_lock *) arg;
struct rpc_callextra ext;
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) granted,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
/*
* Send the results back to the host.
*
* Note: there is a possible race here with nlm_host_notify
* destroying the RPC client. To avoid problems, the first
* thing nlm_host_notify does is to cancel pending async lock
* requests.
*/
memset(&ext, 0, sizeof(ext));
ext.rc_auth = nlm_auth;
if (af->af_host->nh_vers == NLM_VERS4) {
nlm4_granted_msg_4(&af->af_granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
} else {
/*
* Back-convert to legacy protocol
*/
nlm_testargs granted;
granted.cookie = af->af_granted.cookie;
granted.exclusive = af->af_granted.exclusive;
granted.alock.caller_name =
af->af_granted.alock.caller_name;
granted.alock.fh = af->af_granted.alock.fh;
granted.alock.oh = af->af_granted.alock.oh;
granted.alock.svid = af->af_granted.alock.svid;
granted.alock.l_offset =
af->af_granted.alock.l_offset;
granted.alock.l_len =
af->af_granted.alock.l_len;
nlm_granted_msg_1(&granted,
NULL, af->af_rpc, &ext, nlm_zero_tv);
}
/*
* Move this entry to the nh_granted list.
*/
af->af_expiretime = time_uptime + NLM_EXPIRE_TIMEOUT;
mtx_lock(&af->af_host->nh_lock);
TAILQ_REMOVE(&af->af_host->nh_pending, af, af_link);
TAILQ_INSERT_TAIL(&af->af_host->nh_granted, af, af_link);
mtx_unlock(&af->af_host->nh_lock);
}
/*
* Free an async lock request. The request must have been removed from
* any list.
*/
static void
nlm_free_async_lock(struct nlm_async_lock *af)
{
/*
* Free an async lock.
*/
if (af->af_rpc)
CLNT_RELEASE(af->af_rpc);
xdr_free((xdrproc_t) xdr_nlm4_testargs, &af->af_granted);
if (af->af_vp)
vrele(af->af_vp);
free(af, M_NLM);
}
/*
* Cancel our async request - this must be called with
* af->nh_host->nh_lock held. This is slightly complicated by a
* potential race with our own callback. If we fail to cancel the
* lock, it must already have been granted - we make sure our async
* task has completed by calling taskqueue_drain in this case.
*/
static int
nlm_cancel_async_lock(struct nlm_async_lock *af)
{
struct nlm_host *host = af->af_host;
int error;
mtx_assert(&host->nh_lock, MA_OWNED);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(af->af_vp, NULL, F_CANCEL, &af->af_fl,
F_REMOTE, NULL, &af->af_cookie);
if (error) {
/*
* We failed to cancel - make sure our callback has
* completed before we continue.
*/
taskqueue_drain(taskqueue_thread, &af->af_task);
}
mtx_lock(&host->nh_lock);
if (!error) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) "
"cancelled\n", af, host->nh_caller_name, host->nh_sysid);
/*
* Remove from the nh_pending list and free now that
* we are safe from the callback.
*/
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
return (error);
}
static void
nlm_check_expired_locks(struct nlm_host *host)
{
struct nlm_async_lock *af;
time_t uptime = time_uptime;
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_granted)) != NULL
&& uptime >= af->af_expiretime) {
NLM_DEBUG(2, "NLM: async lock %p for %s (sysid %d) expired,"
" cookie %d:%d\n", af, af->af_host->nh_caller_name,
af->af_host->nh_sysid, ng_sysid(&af->af_granted.cookie),
ng_cookie(&af->af_granted.cookie));
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
while ((af = TAILQ_FIRST(&host->nh_finished)) != NULL) {
TAILQ_REMOVE(&host->nh_finished, af, af_link);
mtx_unlock(&host->nh_lock);
nlm_free_async_lock(af);
mtx_lock(&host->nh_lock);
}
mtx_unlock(&host->nh_lock);
}
/*
* Free resources used by a host. This is called after the reference
* count has reached zero so it doesn't need to worry about locks.
*/
static void
nlm_host_destroy(struct nlm_host *host)
{
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
if (host->nh_srvrpc.nr_client)
CLNT_RELEASE(host->nh_srvrpc.nr_client);
if (host->nh_clntrpc.nr_client)
CLNT_RELEASE(host->nh_clntrpc.nr_client);
mtx_destroy(&host->nh_lock);
sysctl_ctx_free(&host->nh_sysctl);
free(host, M_NLM);
}
/*
* Thread start callback for client lock recovery
*/
static void
nlm_client_recovery_start(void *arg)
{
struct nlm_host *host = (struct nlm_host *) arg;
NLM_DEBUG(1, "NLM: client lock recovery for %s started\n",
host->nh_caller_name);
nlm_client_recovery(host);
NLM_DEBUG(1, "NLM: client lock recovery for %s completed\n",
host->nh_caller_name);
host->nh_monstate = NLM_MONITORED;
nlm_host_release(host);
kthread_exit();
}
/*
* This is called when we receive a host state change notification. We
* unlock any active locks owned by the host. When rpc.lockd is
* shutting down, this function is called with newstate set to zero
* which allows us to cancel any pending async locks and clear the
* locking state.
*/
static void
nlm_host_notify(struct nlm_host *host, int newstate)
{
struct nlm_async_lock *af;
if (newstate) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rebooted, new "
"state is %d\n", host->nh_caller_name,
host->nh_sysid, newstate);
}
/*
* Cancel any pending async locks for this host.
*/
mtx_lock(&host->nh_lock);
while ((af = TAILQ_FIRST(&host->nh_pending)) != NULL) {
/*
* nlm_cancel_async_lock will remove the entry from
* nh_pending and free it.
*/
nlm_cancel_async_lock(af);
}
mtx_unlock(&host->nh_lock);
nlm_check_expired_locks(host);
/*
* The host just rebooted - trash its locks.
*/
lf_clearremotesys(host->nh_sysid);
host->nh_state = newstate;
/*
* If we have any remote locks for this host (i.e. it
* represents a remote NFS server that our local NFS client
* has locks for), start a recovery thread.
*/
if (newstate != 0
&& host->nh_monstate != NLM_RECOVERING
&& lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid) > 0) {
struct thread *td;
host->nh_monstate = NLM_RECOVERING;
refcount_acquire(&host->nh_refs);
kthread_add(nlm_client_recovery_start, host, curproc, &td, 0, 0,
"NFS lock recovery for %s", host->nh_caller_name);
}
}
/*
* Sysctl handler to count the number of locks for a sysid.
*/
static int
nlm_host_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Sysctl handler to count the number of client locks for a sysid.
*/
static int
nlm_host_client_lock_count_sysctl(SYSCTL_HANDLER_ARGS)
{
struct nlm_host *host;
int count;
host = oidp->oid_arg1;
count = lf_countlocks(NLM_SYSID_CLIENT | host->nh_sysid);
return sysctl_handle_int(oidp, &count, 0, req);
}
/*
* Create a new NLM host.
*/
static struct nlm_host *
nlm_create_host(const char* caller_name)
{
struct nlm_host *host;
struct sysctl_oid *oid;
mtx_assert(&nlm_global_lock, MA_OWNED);
NLM_DEBUG(1, "NLM: new host %s (sysid %d)\n",
caller_name, nlm_next_sysid);
host = malloc(sizeof(struct nlm_host), M_NLM, M_NOWAIT|M_ZERO);
if (!host)
return (NULL);
mtx_init(&host->nh_lock, "nh_lock", NULL, MTX_DEF);
host->nh_refs = 1;
strlcpy(host->nh_caller_name, caller_name, MAXNAMELEN);
host->nh_sysid = nlm_next_sysid++;
snprintf(host->nh_sysid_string, sizeof(host->nh_sysid_string),
"%d", host->nh_sysid);
host->nh_vers = 0;
host->nh_state = 0;
host->nh_monstate = NLM_UNMONITORED;
host->nh_grantcookie = 1;
TAILQ_INIT(&host->nh_pending);
TAILQ_INIT(&host->nh_granted);
TAILQ_INIT(&host->nh_finished);
TAILQ_INSERT_TAIL(&nlm_hosts, host, nh_link);
mtx_unlock(&nlm_global_lock);
sysctl_ctx_init(&host->nh_sysctl);
oid = SYSCTL_ADD_NODE(&host->nh_sysctl,
SYSCTL_STATIC_CHILDREN(_vfs_nlm_sysid),
OID_AUTO, host->nh_sysid_string, CTLFLAG_RD, NULL, "");
SYSCTL_ADD_STRING(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"hostname", CTLFLAG_RD, host->nh_caller_name, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"version", CTLFLAG_RD, &host->nh_vers, 0, "");
SYSCTL_ADD_UINT(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"monitored", CTLFLAG_RD, &host->nh_monstate, 0, "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_lock_count_sysctl, "I", "");
SYSCTL_ADD_PROC(&host->nh_sysctl, SYSCTL_CHILDREN(oid), OID_AUTO,
"client_lock_count", CTLTYPE_INT | CTLFLAG_RD, host, 0,
nlm_host_client_lock_count_sysctl, "I", "");
mtx_lock(&nlm_global_lock);
return (host);
}
/*
* Acquire the next sysid for remote locks not handled by the NLM.
*/
uint32_t
nlm_acquire_next_sysid(void)
{
uint32_t next_sysid;
mtx_lock(&nlm_global_lock);
next_sysid = nlm_next_sysid++;
mtx_unlock(&nlm_global_lock);
return (next_sysid);
}
/*
* Return non-zero if the address parts of the two sockaddrs are the
* same.
*/
static int
nlm_compare_addr(const struct sockaddr *a, const struct sockaddr *b)
{
const struct sockaddr_in *a4, *b4;
#ifdef INET6
const struct sockaddr_in6 *a6, *b6;
#endif
if (a->sa_family != b->sa_family)
return (FALSE);
switch (a->sa_family) {
case AF_INET:
a4 = (const struct sockaddr_in *) a;
b4 = (const struct sockaddr_in *) b;
return !memcmp(&a4->sin_addr, &b4->sin_addr,
sizeof(a4->sin_addr));
#ifdef INET6
case AF_INET6:
a6 = (const struct sockaddr_in6 *) a;
b6 = (const struct sockaddr_in6 *) b;
return !memcmp(&a6->sin6_addr, &b6->sin6_addr,
sizeof(a6->sin6_addr));
#endif
}
return (0);
}
/*
* Check for idle hosts and stop monitoring them. We could also free
* the host structure here, possibly after a larger timeout but that
* would require some care to avoid races with
* e.g. nlm_host_lock_count_sysctl.
*/
static void
nlm_check_idle(void)
{
struct nlm_host *host;
mtx_assert(&nlm_global_lock, MA_OWNED);
if (time_uptime <= nlm_next_idle_check)
return;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_monstate == NLM_MONITORED
&& time_uptime > host->nh_idle_timeout) {
mtx_unlock(&nlm_global_lock);
if (lf_countlocks(host->nh_sysid) > 0
|| lf_countlocks(NLM_SYSID_CLIENT
+ host->nh_sysid)) {
host->nh_idle_timeout =
time_uptime + NLM_IDLE_TIMEOUT;
mtx_lock(&nlm_global_lock);
continue;
}
nlm_host_unmonitor(host);
mtx_lock(&nlm_global_lock);
}
}
}
/*
* Search for an existing NLM host that matches the given name
* (typically the caller_name element of an nlm4_lock). If none is
* found, create a new host. If 'addr' is non-NULL, record the remote
* address of the host so that we can call it back for async
* responses. If 'vers' is greater than zero then record the NLM
* program version to use to communicate with this client.
*/
struct nlm_host *
nlm_find_host_by_name(const char *name, const struct sockaddr *addr,
rpcvers_t vers)
{
struct nlm_host *host;
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by caller_name.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (!strcmp(host->nh_caller_name, name))
break;
}
if (!host) {
host = nlm_create_host(name);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
/*
* If we have an address for the host, record it so that we
* can send async replies etc.
*/
if (addr) {
KASSERT(addr->sa_len < sizeof(struct sockaddr_storage),
("Strange remote transport address length"));
/*
* If we have seen an address before and we currently
* have an RPC client handle, make sure the address is
* the same, otherwise discard the client handle.
*/
if (host->nh_addr.ss_len && host->nh_srvrpc.nr_client) {
if (!nlm_compare_addr(
(struct sockaddr *) &host->nh_addr,
addr)
|| host->nh_vers != vers) {
CLIENT *client;
mtx_lock(&host->nh_lock);
client = host->nh_srvrpc.nr_client;
host->nh_srvrpc.nr_client = NULL;
mtx_unlock(&host->nh_lock);
if (client) {
CLNT_RELEASE(client);
}
}
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Search for an existing NLM host that matches the given remote
* address. If none is found, create a new host with the requested
* address and remember 'vers' as the NLM protocol version to use for
* that host.
*/
struct nlm_host *
nlm_find_host_by_addr(const struct sockaddr *addr, int vers)
{
/*
* Fake up a name using inet_ntop. This buffer is
* large enough for an IPv6 address.
*/
char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"];
struct nlm_host *host;
switch (addr->sa_family) {
case AF_INET:
inet_ntop(AF_INET,
&((const struct sockaddr_in *) addr)->sin_addr,
tmp, sizeof tmp);
break;
#ifdef INET6
case AF_INET6:
inet_ntop(AF_INET6,
&((const struct sockaddr_in6 *) addr)->sin6_addr,
tmp, sizeof tmp);
break;
#endif
default:
strcmp(tmp, "<unknown>");
}
mtx_lock(&nlm_global_lock);
/*
* The remote host is determined by caller_name.
*/
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (nlm_compare_addr(addr,
(const struct sockaddr *) &host->nh_addr))
break;
}
if (!host) {
host = nlm_create_host(tmp);
if (!host) {
mtx_unlock(&nlm_global_lock);
return (NULL);
}
memcpy(&host->nh_addr, addr, addr->sa_len);
host->nh_vers = vers;
}
refcount_acquire(&host->nh_refs);
host->nh_idle_timeout = time_uptime + NLM_IDLE_TIMEOUT;
nlm_check_idle();
mtx_unlock(&nlm_global_lock);
return (host);
}
/*
* Find the NLM host that matches the value of 'sysid'. If none
* exists, return NULL.
*/
static struct nlm_host *
nlm_find_host_by_sysid(int sysid)
{
struct nlm_host *host;
TAILQ_FOREACH(host, &nlm_hosts, nh_link) {
if (host->nh_sysid == sysid) {
refcount_acquire(&host->nh_refs);
return (host);
}
}
return (NULL);
}
void nlm_host_release(struct nlm_host *host)
{
if (refcount_release(&host->nh_refs)) {
/*
* Free the host
*/
nlm_host_destroy(host);
}
}
/*
* Unregister this NLM host with the local NSM due to idleness.
*/
static void
nlm_host_unmonitor(struct nlm_host *host)
{
mon_id smmonid;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
NLM_DEBUG(1, "NLM: unmonitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* Build a mon_id matching the one used when monitoring this host
* so that the local NSM can find and remove its entry.
*/
smmonid.mon_name = host->nh_caller_name;
smmonid.my_id.my_name = "localhost";
smmonid.my_id.my_prog = NLM_PROG;
smmonid.my_id.my_vers = NLM_SM;
smmonid.my_id.my_proc = NLM_SM_NOTIFY;
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON,
(xdrproc_t) xdr_mon, &smmonid,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to unmonitor %s\n",
host->nh_caller_name);
return;
}
host->nh_monstate = NLM_UNMONITORED;
}
/*
* Register this NLM host with the local NSM so that we can be
* notified if it reboots.
*/
void
nlm_host_monitor(struct nlm_host *host, int state)
{
mon smmon;
sm_stat_res smstat;
struct timeval timo;
enum clnt_stat stat;
if (state && !host->nh_state) {
/*
* This is the first time we have seen an NSM state
* value for this host. We record it here to help
* detect host reboots.
*/
host->nh_state = state;
NLM_DEBUG(1, "NLM: host %s (sysid %d) has NSM state %d\n",
host->nh_caller_name, host->nh_sysid, state);
}
mtx_lock(&host->nh_lock);
if (host->nh_monstate != NLM_UNMONITORED) {
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
mtx_unlock(&host->nh_lock);
NLM_DEBUG(1, "NLM: monitoring %s (sysid %d)\n",
host->nh_caller_name, host->nh_sysid);
/*
* We put our assigned system ID value in the priv field to
* make it simpler to find the host if we are notified of a
* host restart.
*/
smmon.mon_id.mon_name = host->nh_caller_name;
smmon.mon_id.my_id.my_name = "localhost";
smmon.mon_id.my_id.my_prog = NLM_PROG;
smmon.mon_id.my_id.my_vers = NLM_SM;
smmon.mon_id.my_id.my_proc = NLM_SM_NOTIFY;
memcpy(smmon.priv, &host->nh_sysid, sizeof(host->nh_sysid));
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_MON,
(xdrproc_t) xdr_mon, &smmon,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
NLM_ERR("Failed to contact local NSM - rpc error %d\n", stat);
return;
}
if (smstat.res_stat == stat_fail) {
NLM_ERR("Local NSM refuses to monitor %s\n",
host->nh_caller_name);
mtx_lock(&host->nh_lock);
host->nh_monstate = NLM_MONITOR_FAILED;
mtx_unlock(&host->nh_lock);
return;
}
host->nh_monstate = NLM_MONITORED;
}
/*
* Return an RPC client handle that can be used to talk to the NLM
* running on the given host.
*/
CLIENT *
nlm_host_get_rpc(struct nlm_host *host, bool_t isserver)
{
struct nlm_rpc *rpc;
CLIENT *client;
mtx_lock(&host->nh_lock);
if (isserver)
rpc = &host->nh_srvrpc;
else
rpc = &host->nh_clntrpc;
/*
* We can't hold onto RPC handles for too long - the async
* call/reply protocol used by some NLM clients makes it hard
* to tell when they change port numbers (e.g. after a
* reboot). Note that if a client reboots while it isn't
* holding any locks, it won't bother to notify us. We
* expire the RPC handles after two minutes.
*/
if (rpc->nr_client && time_uptime > rpc->nr_create_time + 2*60) {
client = rpc->nr_client;
rpc->nr_client = NULL;
mtx_unlock(&host->nh_lock);
CLNT_RELEASE(client);
mtx_lock(&host->nh_lock);
}
if (!rpc->nr_client) {
mtx_unlock(&host->nh_lock);
client = nlm_get_rpc((struct sockaddr *)&host->nh_addr,
NLM_PROG, host->nh_vers);
mtx_lock(&host->nh_lock);
if (client) {
if (rpc->nr_client) {
mtx_unlock(&host->nh_lock);
CLNT_DESTROY(client);
mtx_lock(&host->nh_lock);
} else {
rpc->nr_client = client;
rpc->nr_create_time = time_uptime;
}
}
}
client = rpc->nr_client;
if (client)
CLNT_ACQUIRE(client);
mtx_unlock(&host->nh_lock);
return (client);
}
int nlm_host_get_sysid(struct nlm_host *host)
{
return (host->nh_sysid);
}
int
nlm_host_get_state(struct nlm_host *host)
{
return (host->nh_state);
}
void *
nlm_register_wait_lock(struct nlm4_lock *lock, struct vnode *vp)
{
struct nlm_waiting_lock *nw;
nw = malloc(sizeof(struct nlm_waiting_lock), M_NLM, M_WAITOK);
nw->nw_lock = *lock;
memcpy(&nw->nw_fh.fh_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len);
nw->nw_lock.fh.n_bytes = nw->nw_fh.fh_bytes;
nw->nw_waiting = TRUE;
nw->nw_vp = vp;
mtx_lock(&nlm_global_lock);
TAILQ_INSERT_TAIL(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
return nw;
}
void
nlm_deregister_wait_lock(void *handle)
{
struct nlm_waiting_lock *nw = handle;
mtx_lock(&nlm_global_lock);
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
}
int
nlm_wait_lock(void *handle, int timo)
{
struct nlm_waiting_lock *nw = handle;
int error;
/*
* If the granted message arrived before we got here,
* nw->nw_waiting will be FALSE - in that case, don't sleep.
*/
mtx_lock(&nlm_global_lock);
error = 0;
if (nw->nw_waiting)
error = msleep(nw, &nlm_global_lock, PCATCH, "nlmlock", timo);
TAILQ_REMOVE(&nlm_waiting_locks, nw, nw_link);
if (error) {
/*
* The granted message may arrive after the
* interrupt/timeout but before we manage to lock the
* mutex. Detect this by examining nw_waiting.
*/
if (!nw->nw_waiting)
error = 0;
} else {
/*
* If nlm_cancel_wait is called, then error will be
* zero but nw_waiting will still be TRUE. We
* translate this into EINTR.
*/
if (nw->nw_waiting)
error = EINTR;
}
mtx_unlock(&nlm_global_lock);
free(nw, M_NLM);
return (error);
}
void
nlm_cancel_wait(struct vnode *vp)
{
struct nlm_waiting_lock *nw;
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (nw->nw_vp == vp) {
wakeup(nw);
}
}
mtx_unlock(&nlm_global_lock);
}
/**********************************************************************/
/*
* Syscall interface with userland.
*/
extern void nlm_prog_0(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_1(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_3(struct svc_req *rqstp, SVCXPRT *transp);
extern void nlm_prog_4(struct svc_req *rqstp, SVCXPRT *transp);
static int
nlm_register_services(SVCPOOL *pool, int addr_count, char **addrs)
{
static rpcvers_t versions[] = {
NLM_SM, NLM_VERS, NLM_VERSX, NLM_VERS4
};
static void (*dispatchers[])(struct svc_req *, SVCXPRT *) = {
nlm_prog_0, nlm_prog_1, nlm_prog_3, nlm_prog_4
};
static const int version_count = sizeof(versions) / sizeof(versions[0]);
SVCXPRT **xprts;
char netid[16];
char uaddr[128];
struct netconfig *nconf;
int i, j, error;
if (!addr_count) {
NLM_ERR("NLM: no service addresses given - can't start server");
return (EINVAL);
}
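/*
* The userland address list is laid out as consecutive pairs of
* (netid, uaddr) string pointers, copied in below during the first
* pass over the NLM versions.
*/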
xprts = malloc(addr_count * sizeof(SVCXPRT *), M_NLM, M_WAITOK|M_ZERO);
for (i = 0; i < version_count; i++) {
for (j = 0; j < addr_count; j++) {
/*
* Create transports for the first version and
* then just register everything else to the
* same transports.
*/
if (i == 0) {
char *up;
error = copyin(&addrs[2*j], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, netid, sizeof(netid),
NULL);
if (error)
goto out;
error = copyin(&addrs[2*j+1], &up,
sizeof(char*));
if (error)
goto out;
error = copyinstr(up, uaddr, sizeof(uaddr),
NULL);
if (error)
goto out;
nconf = getnetconfigent(netid);
if (!nconf) {
NLM_ERR("Can't lookup netid %s\n",
netid);
error = EINVAL;
goto out;
}
xprts[j] = svc_tp_create(pool, dispatchers[i],
NLM_PROG, versions[i], uaddr, nconf);
if (!xprts[j]) {
NLM_ERR("NLM: unable to create "
"(NLM_PROG, %d).\n", versions[i]);
error = EINVAL;
goto out;
}
freenetconfigent(nconf);
} else {
nconf = getnetconfigent(xprts[j]->xp_netid);
rpcb_unset(NLM_PROG, versions[i], nconf);
if (!svc_reg(xprts[j], NLM_PROG, versions[i],
dispatchers[i], nconf)) {
NLM_ERR("NLM: can't register "
"(NLM_PROG, %d)\n", versions[i]);
error = EINVAL;
goto out;
}
}
}
}
error = 0;
out:
for (j = 0; j < addr_count; j++) {
if (xprts[j])
SVC_RELEASE(xprts[j]);
}
free(xprts, M_NLM);
return (error);
}
/*
* Main server entry point. Contacts the local NSM to get its current
* state and send SM_UNMON_ALL. Registers the NLM services and then
* services requests. Does not return until the server is interrupted
* by a signal.
*/
static int
nlm_server_main(int addr_count, char **addrs)
{
struct thread *td = curthread;
int error;
SVCPOOL *pool = NULL;
struct sockopt opt;
int portlow;
#ifdef INET6
struct sockaddr_in6 sin6;
#endif
struct sockaddr_in sin;
my_id id;
sm_stat smstat;
struct timeval timo;
enum clnt_stat stat;
struct nlm_host *host, *nhost;
struct nlm_waiting_lock *nw;
vop_advlock_t *old_nfs_advlock;
vop_reclaim_t *old_nfs_reclaim;
int v4_used;
#ifdef INET6
int v6_used;
#endif
if (nlm_socket) {
NLM_ERR("NLM: can't start server - "
"it appears to be running already\n");
return (EPERM);
}
memset(&opt, 0, sizeof(opt));
nlm_socket = NULL;
error = socreate(AF_INET, &nlm_socket, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv4 socket - error %d\n", error);
return (error);
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IP;
opt.sopt_name = IP_PORTRANGE;
portlow = IP_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket, &opt);
#ifdef INET6
nlm_socket6 = NULL;
error = socreate(AF_INET6, &nlm_socket6, SOCK_DGRAM, 0,
td->td_ucred, td);
if (error) {
NLM_ERR("NLM: can't create IPv6 socket - error %d\n", error);
goto out;
}
opt.sopt_dir = SOPT_SET;
opt.sopt_level = IPPROTO_IPV6;
opt.sopt_name = IPV6_PORTRANGE;
portlow = IPV6_PORTRANGE_LOW;
opt.sopt_val = &portlow;
opt.sopt_valsize = sizeof(portlow);
sosetopt(nlm_socket6, &opt);
#endif
nlm_auth = authunix_create(curthread->td_ucred);
#ifdef INET6
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = in6addr_loopback;
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin6, SM_PROG, SM_VERS);
if (!nlm_nsm) {
#endif
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
nlm_nsm = nlm_get_rpc((struct sockaddr *) &sin, SM_PROG,
SM_VERS);
#ifdef INET6
}
#endif
if (!nlm_nsm) {
NLM_ERR("Can't start NLM - unable to contact NSM\n");
error = EINVAL;
goto out;
}
pool = svcpool_create("NLM", NULL);
error = nlm_register_services(pool, addr_count, addrs);
if (error)
goto out;
memset(&id, 0, sizeof(id));
id.my_name = "NFS NLM";
timo.tv_sec = 25;
timo.tv_usec = 0;
stat = CLNT_CALL(nlm_nsm, SM_UNMON_ALL,
(xdrproc_t) xdr_my_id, &id,
(xdrproc_t) xdr_sm_stat, &smstat, timo);
if (stat != RPC_SUCCESS) {
struct rpc_err err;
CLNT_GETERR(nlm_nsm, &err);
NLM_ERR("NLM: unexpected error contacting NSM, "
"stat=%d, errno=%d\n", stat, err.re_errno);
error = EINVAL;
goto out;
}
NLM_DEBUG(1, "NLM: local NSM state is %d\n", smstat.state);
nlm_nsm_state = smstat.state;
old_nfs_advlock = nfs_advlock_p;
nfs_advlock_p = nlm_advlock;
old_nfs_reclaim = nfs_reclaim_p;
nfs_reclaim_p = nlm_reclaim;
svc_run(pool);
error = 0;
nfs_advlock_p = old_nfs_advlock;
nfs_reclaim_p = old_nfs_reclaim;
out:
if (pool)
svcpool_destroy(pool);
/*
* We are finished communicating with the NSM.
*/
if (nlm_nsm) {
CLNT_RELEASE(nlm_nsm);
nlm_nsm = NULL;
}
/*
* Trash all the existing state so that if the server
* restarts, it gets a clean slate. This is complicated by the
* possibility that there may be other threads trying to make
* client locking requests.
*
* First we fake a client reboot notification which will
* cancel any pending async locks and purge remote lock state
* from the local lock manager. We release the reference from
* nlm_hosts to the host (which may remove it from the list
* and free it). After this phase, the only entries in the
* nlm_host list should be from other threads performing
* client lock requests. We arrange to defer closing the
* sockets until the last RPC client handle is released.
*/
v4_used = 0;
#ifdef INET6
v6_used = 0;
#endif
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
wakeup(nw);
}
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) {
mtx_unlock(&nlm_global_lock);
nlm_host_notify(host, 0);
nlm_host_release(host);
mtx_lock(&nlm_global_lock);
}
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, nhost) {
mtx_lock(&host->nh_lock);
if (host->nh_srvrpc.nr_client
|| host->nh_clntrpc.nr_client) {
if (host->nh_addr.ss_family == AF_INET)
v4_used++;
#ifdef INET6
if (host->nh_addr.ss_family == AF_INET6)
v6_used++;
#endif
/*
* Note that the rpc over udp code copes
* correctly with the fact that a socket may
* be used by many rpc handles.
*/
if (host->nh_srvrpc.nr_client)
CLNT_CONTROL(host->nh_srvrpc.nr_client,
CLSET_FD_CLOSE, 0);
if (host->nh_clntrpc.nr_client)
CLNT_CONTROL(host->nh_clntrpc.nr_client,
CLSET_FD_CLOSE, 0);
}
mtx_unlock(&host->nh_lock);
}
mtx_unlock(&nlm_global_lock);
AUTH_DESTROY(nlm_auth);
if (!v4_used)
soclose(nlm_socket);
nlm_socket = NULL;
#ifdef INET6
if (!v6_used)
soclose(nlm_socket6);
nlm_socket6 = NULL;
#endif
return (error);
}
int
-nlm_syscall(struct thread *td, struct nlm_syscall_args *uap)
+sys_nlm_syscall(struct thread *td, struct nlm_syscall_args *uap)
{
int error;
#if __FreeBSD_version >= 700000
error = priv_check(td, PRIV_NFS_LOCKD);
#else
error = suser(td);
#endif
if (error)
return (error);
nlm_debug_level = uap->debug_level;
nlm_grace_threshold = time_uptime + uap->grace_period;
nlm_next_idle_check = time_uptime + NLM_IDLE_PERIOD;
return nlm_server_main(uap->addr_count, uap->addrs);
}
/**********************************************************************/
/*
* NLM implementation details, called from the RPC stubs.
*/
void
nlm_sm_notify(struct nlm_sm_status *argp)
{
uint32_t sysid;
struct nlm_host *host;
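/*
 * Callback from the local statd: the private cookie we registered
 * with the monitor request carries the remote host's sysid, so use
 * it to look up the host and invalidate its locks.
 */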
NLM_DEBUG(3, "nlm_sm_notify(): mon_name = %s\n", argp->mon_name);
memcpy(&sysid, &argp->priv, sizeof(sysid));
host = nlm_find_host_by_sysid(sysid);
if (host) {
nlm_host_notify(host, argp->state);
nlm_host_release(host);
}
}
static void
nlm_convert_to_fhandle_t(fhandle_t *fhp, struct netobj *p)
{
memcpy(fhp, p->n_bytes, sizeof(fhandle_t));
}
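/*
 * State derived from an NFS file handle: the mount point and vnode,
 * plus flags recording whether Giant and the vnode lock are held, so
 * nlm_release_vfs_state() can undo everything in one place.
 */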
struct vfs_state {
struct mount *vs_mp;
struct vnode *vs_vp;
int vs_vfslocked;
int vs_vnlocked;
};
static int
nlm_get_vfs_state(struct nlm_host *host, struct svc_req *rqstp,
fhandle_t *fhp, struct vfs_state *vs)
{
int error, exflags;
struct ucred *cred = NULL, *credanon;
memset(vs, 0, sizeof(*vs));
vs->vs_mp = vfs_getvfs(&fhp->fh_fsid);
if (!vs->vs_mp) {
return (ESTALE);
}
vs->vs_vfslocked = VFS_LOCK_GIANT(vs->vs_mp);
error = VFS_CHECKEXP(vs->vs_mp, (struct sockaddr *)&host->nh_addr,
&exflags, &credanon, NULL, NULL);
if (error)
goto out;
if (exflags & MNT_EXRDONLY || (vs->vs_mp->mnt_flag & MNT_RDONLY)) {
error = EROFS;
goto out;
}
error = VFS_FHTOVP(vs->vs_mp, &fhp->fh_fid, LK_EXCLUSIVE, &vs->vs_vp);
if (error)
goto out;
vs->vs_vnlocked = TRUE;
if (!svc_getcred(rqstp, &cred, NULL)) {
error = EINVAL;
goto out;
}
if (cred->cr_uid == 0 || (exflags & MNT_EXPORTANON)) {
crfree(cred);
cred = credanon;
credanon = NULL;
}
/*
* Check that the credentials we settled on have write access to the vnode.
*/
error = VOP_ACCESS(vs->vs_vp, VWRITE, cred, curthread);
if (error)
goto out;
#if __FreeBSD_version < 800011
VOP_UNLOCK(vs->vs_vp, 0, curthread);
#else
VOP_UNLOCK(vs->vs_vp, 0);
#endif
vs->vs_vnlocked = FALSE;
out:
if (cred)
crfree(cred);
if (credanon)
crfree(credanon);
return (error);
}
static void
nlm_release_vfs_state(struct vfs_state *vs)
{
if (vs->vs_vp) {
if (vs->vs_vnlocked)
vput(vs->vs_vp);
else
vrele(vs->vs_vp);
}
if (vs->vs_mp)
vfs_rel(vs->vs_mp);
VFS_UNLOCK_GIANT(vs->vs_vfslocked);
}
static nlm4_stats
nlm_convert_error(int error)
{
if (error == ESTALE)
return nlm4_stale_fh;
else if (error == EROFS)
return nlm4_rofs;
else
return nlm4_failed;
}
int
nlm_do_test(nlm4_testargs *argp, nlm4_testres *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host, *bhost;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_test(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
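/*
 * Query the local lock manager for a conflicting lock without
 * actually acquiring one.
 */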
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_GETLK, &fl, F_REMOTE);
if (error) {
result->stat.stat = nlm4_failed;
goto out;
}
if (fl.l_type == F_UNLCK) {
result->stat.stat = nlm4_granted;
} else {
result->stat.stat = nlm4_denied;
result->stat.nlm4_testrply_u.holder.exclusive =
(fl.l_type == F_WRLCK);
result->stat.nlm4_testrply_u.holder.svid = fl.l_pid;
bhost = nlm_find_host_by_sysid(fl.l_sysid);
if (bhost) {
/*
* We don't have any useful way of recording
* the value of oh used in the original lock
* request. Ideally, the test reply would have
* a space for the owning host's name allowing
* our caller's NLM to keep track.
*
* As far as I can see, Solaris uses an eight
* byte structure for oh which contains a four
* byte pid encoded in local byte order and
* the first four bytes of the host
* name. Linux uses a variable length string
* 'pid@hostname' in ascii but doesn't even
* return that in test replies.
*
* For the moment, return nothing in oh
* (already zero'ed above).
*/
nlm_host_release(bhost);
}
result->stat.nlm4_testrply_u.holder.l_offset = fl.l_start;
result->stat.nlm4_testrply_u.holder.l_len = fl.l_len;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_lock(nlm4_lockargs *argp, nlm4_res *result, struct svc_req *rqstp,
bool_t monitor, CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_lock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
if (monitor && host->nh_state && argp->state
&& host->nh_state != argp->state) {
/*
* The host rebooted without telling us. Trash its
* locks.
*/
nlm_host_notify(host, argp->state);
}
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold && !argp->reclaim) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
if (argp->block) {
struct nlm_async_lock *af;
CLIENT *client;
struct nlm_grantcookie cookie;
/*
* First, make sure we can contact the host's NLM.
*/
client = nlm_host_get_rpc(host, TRUE);
if (!client) {
result->stat.stat = nlm4_failed;
goto out;
}
/*
* First we need to check and see if there is an
* existing blocked lock that matches. This could be a
* badly behaved client or an RPC re-send. If we find
* one, just return nlm4_blocked.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
cookie.ng_sysid = host->nh_sysid;
cookie.ng_cookie = host->nh_grantcookie++;
}
mtx_unlock(&host->nh_lock);
if (af) {
CLNT_RELEASE(client);
result->stat.stat = nlm4_blocked;
goto out;
}
af = malloc(sizeof(struct nlm_async_lock), M_NLM,
M_WAITOK|M_ZERO);
TASK_INIT(&af->af_task, 0, nlm_lock_callback, af);
af->af_vp = vs.vs_vp;
af->af_fl = fl;
af->af_host = host;
af->af_rpc = client;
/*
* We use M_RPC here so that we can xdr_free the thing
* later.
*/
nlm_make_netobj(&af->af_granted.cookie,
(caddr_t)&cookie, sizeof(cookie), M_RPC);
af->af_granted.exclusive = argp->exclusive;
af->af_granted.alock.caller_name =
strdup(argp->alock.caller_name, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.fh,
&argp->alock.fh, M_RPC);
nlm_copy_netobj(&af->af_granted.alock.oh,
&argp->alock.oh, M_RPC);
af->af_granted.alock.svid = argp->alock.svid;
af->af_granted.alock.l_offset = argp->alock.l_offset;
af->af_granted.alock.l_len = argp->alock.l_len;
/*
* Put the entry on the pending list before calling
* VOP_ADVLOCKASYNC. We do this in case the lock
* request was blocked (returning EINPROGRESS) but
* then granted before we manage to run again. The
* client may receive the granted message before we
* send our blocked reply but that's their problem.
*/
mtx_lock(&host->nh_lock);
TAILQ_INSERT_TAIL(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
error = VOP_ADVLOCKASYNC(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE,
&af->af_task, &af->af_cookie);
/*
* If the lock completed synchronously, just free the
* tracking structure now.
*/
if (error != EINPROGRESS) {
CLNT_RELEASE(af->af_rpc);
mtx_lock(&host->nh_lock);
TAILQ_REMOVE(&host->nh_pending, af, af_link);
mtx_unlock(&host->nh_lock);
xdr_free((xdrproc_t) xdr_nlm4_testargs,
&af->af_granted);
free(af, M_NLM);
} else {
NLM_DEBUG(2, "NLM: pending async lock %p for %s "
"(sysid %d)\n", af, host->nh_caller_name, sysid);
/*
* Don't vrele the vnode just yet - this must
* wait until either the async callback
* happens or the lock is cancelled.
*/
vs.vs_vp = NULL;
}
} else {
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_SETLK, &fl, F_REMOTE);
}
if (error) {
if (error == EINPROGRESS) {
result->stat.stat = nlm4_blocked;
} else if (error == EDEADLK) {
result->stat.stat = nlm4_deadlck;
} else if (error == EAGAIN) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_failed;
}
} else {
if (monitor)
nlm_host_monitor(host, argp->state);
result->stat.stat = nlm4_granted;
}
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_cancel(nlm4_cancargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
struct nlm_async_lock *af;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_cancel(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
if (argp->exclusive)
fl.l_type = F_WRLCK;
else
fl.l_type = F_RDLCK;
/*
* First we need to try to find the async lock request - if
* there isn't one, we give up and return nlm4_denied.
*/
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_pending, af_link) {
if (af->af_fl.l_start == fl.l_start
&& af->af_fl.l_len == fl.l_len
&& af->af_fl.l_pid == fl.l_pid
&& af->af_fl.l_type == fl.l_type) {
break;
}
}
if (!af) {
mtx_unlock(&host->nh_lock);
result->stat.stat = nlm4_denied;
goto out;
}
error = nlm_cancel_async_lock(af);
if (error) {
result->stat.stat = nlm4_denied;
} else {
result->stat.stat = nlm4_granted;
}
mtx_unlock(&host->nh_lock);
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_unlock(nlm4_unlockargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
fhandle_t fh;
struct vfs_state vs;
struct nlm_host *host;
int error, sysid;
struct flock fl;
memset(result, 0, sizeof(*result));
memset(&vs, 0, sizeof(vs));
host = nlm_find_host_by_name(argp->alock.caller_name,
svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
NLM_DEBUG(3, "nlm_do_unlock(): caller_name = %s (sysid = %d)\n",
host->nh_caller_name, host->nh_sysid);
nlm_check_expired_locks(host);
sysid = host->nh_sysid;
nlm_convert_to_fhandle_t(&fh, &argp->alock.fh);
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
if (time_uptime < nlm_grace_threshold) {
result->stat.stat = nlm4_denied_grace_period;
goto out;
}
error = nlm_get_vfs_state(host, rqstp, &fh, &vs);
if (error) {
result->stat.stat = nlm_convert_error(error);
goto out;
}
fl.l_start = argp->alock.l_offset;
fl.l_len = argp->alock.l_len;
fl.l_pid = argp->alock.svid;
fl.l_sysid = sysid;
fl.l_whence = SEEK_SET;
fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(vs.vs_vp, NULL, F_UNLCK, &fl, F_REMOTE);
/*
* Ignore the error - there is no result code for failure,
* only for grace period.
*/
result->stat.stat = nlm4_granted;
out:
nlm_release_vfs_state(&vs);
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
int
nlm_do_granted(nlm4_testargs *argp, nlm4_res *result, struct svc_req *rqstp,
CLIENT **rpcp)
{
struct nlm_host *host;
struct nlm_waiting_lock *nw;
memset(result, 0, sizeof(*result));
host = nlm_find_host_by_addr(svc_getrpccaller(rqstp), rqstp->rq_vers);
if (!host) {
result->stat.stat = nlm4_denied_nolocks;
return (ENOMEM);
}
nlm_copy_netobj(&result->cookie, &argp->cookie, M_RPC);
result->stat.stat = nlm4_denied;
KFAIL_POINT_CODE(DEBUG_FP, nlm_deny_grant, goto out);
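/*
 * Search our list of waiting client locks for one matching the
 * granted lock and wake the thread sleeping on it.
 */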
mtx_lock(&nlm_global_lock);
TAILQ_FOREACH(nw, &nlm_waiting_locks, nw_link) {
if (!nw->nw_waiting)
continue;
if (argp->alock.svid == nw->nw_lock.svid
&& argp->alock.l_offset == nw->nw_lock.l_offset
&& argp->alock.l_len == nw->nw_lock.l_len
&& argp->alock.fh.n_len == nw->nw_lock.fh.n_len
&& !memcmp(argp->alock.fh.n_bytes, nw->nw_lock.fh.n_bytes,
nw->nw_lock.fh.n_len)) {
nw->nw_waiting = FALSE;
wakeup(nw);
result->stat.stat = nlm4_granted;
break;
}
}
mtx_unlock(&nlm_global_lock);
out:
if (rpcp)
*rpcp = nlm_host_get_rpc(host, TRUE);
nlm_host_release(host);
return (0);
}
void
nlm_do_granted_res(nlm4_res *argp, struct svc_req *rqstp)
{
struct nlm_host *host = NULL;
struct nlm_async_lock *af = NULL;
int error;
if (argp->cookie.n_len != sizeof(struct nlm_grantcookie)) {
NLM_DEBUG(1, "NLM: bogus grant cookie");
goto out;
}
host = nlm_find_host_by_sysid(ng_sysid(&argp->cookie));
if (!host) {
NLM_DEBUG(1, "NLM: Unknown host rejected our grant");
goto out;
}
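/*
 * Match the reply against our pending grants using the cookie we
 * generated when the GRANTED message was sent.
 */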
mtx_lock(&host->nh_lock);
TAILQ_FOREACH(af, &host->nh_granted, af_link)
if (ng_cookie(&argp->cookie) ==
ng_cookie(&af->af_granted.cookie))
break;
if (af)
TAILQ_REMOVE(&host->nh_granted, af, af_link);
mtx_unlock(&host->nh_lock);
if (!af) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) replied to our grant "
"with unrecognized cookie %d:%d", host->nh_caller_name,
host->nh_sysid, ng_sysid(&argp->cookie),
ng_cookie(&argp->cookie));
goto out;
}
if (argp->stat.stat != nlm4_granted) {
af->af_fl.l_type = F_UNLCK;
error = VOP_ADVLOCK(af->af_vp, NULL, F_UNLCK, &af->af_fl, F_REMOTE);
if (error) {
NLM_DEBUG(1, "NLM: host %s (sysid %d) rejected our grant "
"and we failed to unlock (%d)", host->nh_caller_name,
host->nh_sysid, error);
goto out;
}
NLM_DEBUG(5, "NLM: async lock %p rejected by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
} else {
NLM_DEBUG(5, "NLM: async lock %p accepted by host %s (sysid %d)",
af, host->nh_caller_name, host->nh_sysid);
}
out:
if (af)
nlm_free_async_lock(af);
if (host)
nlm_host_release(host);
}
void
nlm_do_free_all(nlm4_notify *argp)
{
struct nlm_host *host, *thost;
TAILQ_FOREACH_SAFE(host, &nlm_hosts, nh_link, thost) {
if (!strcmp(host->nh_caller_name, argp->name))
nlm_host_notify(host, argp->state);
}
}
/*
* Kernel module glue
*/
static int
nfslockd_modevent(module_t mod, int type, void *data)
{
return (0);
}
static moduledata_t nfslockd_mod = {
"nfslockd",
nfslockd_modevent,
NULL,
};
DECLARE_MODULE(nfslockd, nfslockd_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* So that loader and kldload(2) can find us, wherever we are.. */
MODULE_DEPEND(nfslockd, krpc, 1, 1, 1);
MODULE_DEPEND(nfslockd, nfslock, 1, 1, 1);
MODULE_VERSION(nfslockd, 1);
Index: head/sys/pc98/pc98/machdep.c
===================================================================
--- head/sys/pc98/pc98/machdep.c (revision 225616)
+++ head/sys/pc98/pc98/machdep.c (revision 225617)
@@ -1,2981 +1,2981 @@
/*-
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_atalk.h"
#include "opt_compat.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_ipx.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_npx.h"
#include "opt_perfmon.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif
#include <pc98/pc98/pc98_machdep.h>
#include <net/netisr.h>
#include <machine/bootinfo.h>
#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/pcb_ext.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/vm86.h>
#ifdef PERFMON
#include <machine/perfmon.h>
#endif
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef DEV_ISA
#include <x86/isa/icu.h>
#endif
/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
extern void init386(int first);
extern void dblfault_handler(void);
extern void printcpuinfo(void); /* XXX header file */
extern void finishidentcpu(void);
extern void panicifcpuunsupported(void);
extern void initializecpu(void);
#define CS_SECURE(cs) (ISPL(cs) == SEL_UPL)
#define EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
#if !defined(CPU_DISABLE_SSE) && defined(I686_CPU)
#define CPU_ENABLE_SSE
#endif
static void cpu_startup(void *);
static void fpstate_drop(struct thread *td);
static void get_fpcontext(struct thread *td, mcontext_t *mcp);
static int set_fpcontext(struct thread *td, const mcontext_t *mcp);
#ifdef CPU_ENABLE_SSE
static void set_fpregs_xmm(struct save87 *, struct savexmm *);
static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
#endif /* CPU_ENABLE_SSE */
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
int need_pre_dma_flush; /* If 1, use wbinvd before DMA transfer. */
int need_post_dma_flush; /* If 1, use invd after DMA transfer. */
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
int _udatasel, _ucodesel;
u_int basemem;
static int ispc98 = 1;
SYSCTL_INT(_machdep, OID_AUTO, ispc98, CTLFLAG_RD, &ispc98, 0, "");
int cold = 1;
#ifdef COMPAT_43
static void osendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
#ifdef COMPAT_FREEBSD4
static void freebsd4_sendsig(sig_t catcher, ksiginfo_t *, sigset_t *mask);
#endif
long Maxmem = 0;
long realmem = 0;
/*
* The number of PHYSMAP entries must be one less than the number of
* PHYSSEG entries because the PHYSMAP entry that spans the largest
* physical address that is accessible by ISA DMA is split into two
* PHYSSEG entries.
*/
#define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
/* must be 2 less so 0 0 can signal end of chunks */
#define PHYS_AVAIL_ARRAY_END ((sizeof(phys_avail) / sizeof(phys_avail[0])) - 2)
#define DUMP_AVAIL_ARRAY_END ((sizeof(dump_avail) / sizeof(dump_avail[0])) - 2)
struct kva_md_info kmi;
static struct trapframe proc0_tf;
struct pcpu __pcpu[MAXCPU];
struct mtx icu_lock;
static void
cpu_startup(dummy)
void *dummy;
{
uintmax_t memsize;
/*
* Good {morning,afternoon,evening,night}.
*/
startrtclock();
printcpuinfo();
panicifcpuunsupported();
#ifdef PERFMON
perfmon_init();
#endif
realmem = Maxmem;
/*
* Display physical memory.
*/
memsize = ptoa((uintmax_t)Maxmem);
printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
/*
* Display any holes after the first chunk of extended memory.
*/
if (bootverbose) {
int indx;
printf("Physical memory chunk(s):\n");
for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
vm_paddr_t size;
size = phys_avail[indx + 1] - phys_avail[indx];
printf(
"0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
(uintmax_t)phys_avail[indx],
(uintmax_t)phys_avail[indx + 1] - 1,
(uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
}
}
vm_ksubmap_init(&kmi);
printf("avail memory = %ju (%ju MB)\n",
ptoa((uintmax_t)cnt.v_free_count),
ptoa((uintmax_t)cnt.v_free_count) / 1048576);
/*
* Set up buffers, so they can be used to read disk labels.
*/
bufinit();
vm_pager_bufferinit();
cpu_setregs();
}
/*
* Send an interrupt to process.
*
* Stack is set up to allow sigcode stored
* at top to call routine, followed by kcall
* to sigreturn routine below. After sigreturn
* resets the signal mask, the stack, and the
* frame pointer, it returns to the user
* specified pc, psl.
*/
#ifdef COMPAT_43
static void
osendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct osigframe sf, *fp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
fp = (struct osigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct osigframe));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
fp = (struct osigframe *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_scp = (register_t)&fp->sf_siginfo.si_sc;
bzero(&sf.sf_siginfo, sizeof(sf.sf_siginfo));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_arg2 = (register_t)&fp->sf_siginfo;
sf.sf_siginfo.si_signo = sig;
sf.sf_siginfo.si_code = ksi->ksi_code;
sf.sf_ahu.sf_action = (__osiginfohandler_t *)catcher;
sf.sf_addr = 0;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_arg2 = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/* Save most if not all of trap frame. */
sf.sf_siginfo.si_sc.sc_eax = regs->tf_eax;
sf.sf_siginfo.si_sc.sc_ebx = regs->tf_ebx;
sf.sf_siginfo.si_sc.sc_ecx = regs->tf_ecx;
sf.sf_siginfo.si_sc.sc_edx = regs->tf_edx;
sf.sf_siginfo.si_sc.sc_esi = regs->tf_esi;
sf.sf_siginfo.si_sc.sc_edi = regs->tf_edi;
sf.sf_siginfo.si_sc.sc_cs = regs->tf_cs;
sf.sf_siginfo.si_sc.sc_ds = regs->tf_ds;
sf.sf_siginfo.si_sc.sc_ss = regs->tf_ss;
sf.sf_siginfo.si_sc.sc_es = regs->tf_es;
sf.sf_siginfo.si_sc.sc_fs = regs->tf_fs;
sf.sf_siginfo.si_sc.sc_gs = rgs();
sf.sf_siginfo.si_sc.sc_isp = regs->tf_isp;
/* Build the signal context to be used by osigreturn(). */
sf.sf_siginfo.si_sc.sc_onstack = (oonstack) ? 1 : 0;
SIG2OSIG(*mask, sf.sf_siginfo.si_sc.sc_mask);
sf.sf_siginfo.si_sc.sc_sp = regs->tf_esp;
sf.sf_siginfo.si_sc.sc_fp = regs->tf_ebp;
sf.sf_siginfo.si_sc.sc_pc = regs->tf_eip;
sf.sf_siginfo.si_sc.sc_ps = regs->tf_eflags;
sf.sf_siginfo.si_sc.sc_trapno = regs->tf_trapno;
sf.sf_siginfo.si_sc.sc_err = regs->tf_err;
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
/* XXX confusing names: `tf' isn't a trapframe; `regs' is. */
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_siginfo.si_sc.sc_gs = tf->tf_vm86_gs;
sf.sf_siginfo.si_sc.sc_fs = tf->tf_vm86_fs;
sf.sf_siginfo.si_sc.sc_es = tf->tf_vm86_es;
sf.sf_siginfo.si_sc.sc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_siginfo.si_sc.sc_ps =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/* See sendsig() for comments. */
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, fp, sizeof(*fp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)fp;
regs->tf_eip = PS_STRINGS - szosigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
load_gs(_udatasel);
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
static void
freebsd4_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe4 sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
struct trapframe *regs;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
bzero(sf.sf_uc.uc_mcontext.mc_fpregs,
sizeof(sf.sf_uc.uc_mcontext.mc_fpregs));
bzero(sf.sf_uc.uc_mcontext.__spare__,
sizeof(sf.sf_uc.uc_mcontext.__spare__));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe4 *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe4));
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sfp = (struct sigframe4 *)regs->tf_esp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si.si_signo = sig;
sf.sf_si.si_code = ksi->ksi_code;
sf.sf_si.si_addr = ksi->ksi_addr;
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - szfreebsd4_sigcode;
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
#endif /* COMPAT_FREEBSD4 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct sigframe sf, *sfp;
struct proc *p;
struct thread *td;
struct sigacts *psp;
char *sp;
struct trapframe *regs;
struct segment_descriptor *sdp;
int sig;
int oonstack;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
#ifdef COMPAT_FREEBSD4
if (SIGISMEMBER(psp->ps_freebsd4, sig)) {
freebsd4_sendsig(catcher, ksi, mask);
return;
}
#endif
#ifdef COMPAT_43
if (SIGISMEMBER(psp->ps_osigset, sig)) {
osendsig(catcher, ksi, mask);
return;
}
#endif
regs = td->td_frame;
oonstack = sigonstack(regs->tf_esp);
/* Save user context. */
bzero(&sf, sizeof(sf));
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
sf.sf_uc.uc_mcontext.mc_gs = rgs();
bcopy(regs, &sf.sf_uc.uc_mcontext.mc_fs, sizeof(*regs));
sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
get_fpcontext(td, &sf.sf_uc.uc_mcontext);
fpstate_drop(td);
/*
* Unconditionally fill the fsbase and gsbase into the mcontext.
*/
sdp = &td->td_pcb->pcb_fsd;
sf.sf_uc.uc_mcontext.mc_fsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
sf.sf_uc.uc_mcontext.mc_gsbase = sdp->sd_hibase << 24 |
sdp->sd_lobase;
bzero(sf.sf_uc.uc_mcontext.mc_spare1,
sizeof(sf.sf_uc.uc_mcontext.mc_spare1));
bzero(sf.sf_uc.uc_mcontext.mc_spare2,
sizeof(sf.sf_uc.uc_mcontext.mc_spare2));
bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
/* Allocate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sp = td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe);
#if defined(COMPAT_43)
td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
} else
sp = (char *)regs->tf_esp - sizeof(struct sigframe);
/* Align to 16 bytes. */
sfp = (struct sigframe *)((unsigned int)sp & ~0xF);
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
sf.sf_signum = sig;
sf.sf_ucontext = (register_t)&sfp->sf_uc;
bzero(&sf.sf_si, sizeof(sf.sf_si));
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
sf.sf_siginfo = (register_t)&sfp->sf_si;
sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
/* Fill in POSIX parts */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
} else {
/* Old FreeBSD-style arguments. */
sf.sf_siginfo = ksi->ksi_code;
sf.sf_addr = (register_t)ksi->ksi_addr;
sf.sf_ahu.sf_handler = catcher;
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
/*
* If we're a vm86 process, we want to save the segment registers.
* We also change eflags to be our emulated eflags, not the actual
* eflags.
*/
if (regs->tf_eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86 = &td->td_pcb->pcb_ext->ext_vm86;
sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
if (vm86->vm86_has_vme == 0)
sf.sf_uc.uc_mcontext.mc_eflags =
(tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
(vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
/*
* Clear PSL_NT to inhibit T_TSSFLT faults on return from
* syscalls made by the signal handler. This just avoids
* wasting time for our lazy fixup of such faults. PSL_NT
* does nothing in vm86 mode, but vm86 programs can set it
* almost legitimately in probes for old cpu types.
*/
tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
}
/*
* Copy the sigframe out to the user's stack.
*/
if (copyout(&sf, sfp, sizeof(*sfp)) != 0) {
#ifdef DEBUG
printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
PROC_LOCK(p);
sigexit(td, SIGILL);
}
regs->tf_esp = (int)sfp;
regs->tf_eip = PS_STRINGS - *(p->p_sysent->sv_szsigcode);
regs->tf_eflags &= ~(PSL_T | PSL_D);
regs->tf_cs = _ucodesel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_ss = _udatasel;
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
/*
* System call to cleanup state after a signal
* has been taken. Reset signal mask and
* stack state from context left by sendsig (above).
* Return to previous pc and psl as specified by
* context left by sendsig. Check carefully to
* make sure that the user has not modified the
* state to gain improper privileges.
*
* MPSAFE
*/
#ifdef COMPAT_43
int
osigreturn(td, uap)
struct thread *td;
struct osigreturn_args /* {
struct osigcontext *sigcntxp;
} */ *uap;
{
struct osigcontext sc;
struct trapframe *regs;
struct osigcontext *scp;
int eflags, error;
ksiginfo_t ksi;
regs = td->td_frame;
error = copyin(uap->sigcntxp, &sc, sizeof(sc));
if (error != 0)
return (error);
scp = &sc;
eflags = scp->sc_ps;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
tf->tf_vm86_ds = scp->sc_ds;
tf->tf_vm86_es = scp->sc_es;
tf->tf_vm86_fs = scp->sc_fs;
tf->tf_vm86_gs = scp->sc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
if (!CS_SECURE(scp->sc_cs)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
regs->tf_ds = scp->sc_ds;
regs->tf_es = scp->sc_es;
regs->tf_fs = scp->sc_fs;
}
/* Restore remaining registers. */
regs->tf_eax = scp->sc_eax;
regs->tf_ebx = scp->sc_ebx;
regs->tf_ecx = scp->sc_ecx;
regs->tf_edx = scp->sc_edx;
regs->tf_esi = scp->sc_esi;
regs->tf_edi = scp->sc_edi;
regs->tf_cs = scp->sc_cs;
regs->tf_ss = scp->sc_ss;
regs->tf_isp = scp->sc_isp;
regs->tf_ebp = scp->sc_fp;
regs->tf_esp = scp->sc_sp;
regs->tf_eip = scp->sc_pc;
regs->tf_eflags = eflags;
#if defined(COMPAT_43)
if (scp->sc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, (sigset_t *)&scp->sc_mask, NULL,
SIGPROCMASK_OLD);
return (EJUSTRETURN);
}
#endif /* COMPAT_43 */
#ifdef COMPAT_FREEBSD4
/*
* MPSAFE
*/
int
freebsd4_sigreturn(td, uap)
struct thread *td;
struct freebsd4_sigreturn_args /* {
const ucontext4 *sigcntxp;
} */ *uap;
{
struct ucontext4 uc;
struct trapframe *regs;
struct ucontext4 *ucp;
int cs, eflags, error;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): freebsd4_sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): freebsd4_sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
#endif /* COMPAT_FREEBSD4 */
/*
* MPSAFE
*/
int
-sigreturn(td, uap)
+sys_sigreturn(td, uap)
struct thread *td;
struct sigreturn_args /* {
const struct __ucontext *sigcntxp;
} */ *uap;
{
ucontext_t uc;
struct trapframe *regs;
ucontext_t *ucp;
int cs, eflags, error, ret;
ksiginfo_t ksi;
error = copyin(uap->sigcntxp, &uc, sizeof(uc));
if (error != 0)
return (error);
ucp = &uc;
regs = td->td_frame;
eflags = ucp->uc_mcontext.mc_eflags;
if (eflags & PSL_VM) {
struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
struct vm86_kernel *vm86;
/*
* if pcb_ext == 0 or vm86_inited == 0, the user hasn't
* set up the vm86 area, and we can't enter vm86 mode.
*/
if (td->td_pcb->pcb_ext == 0)
return (EINVAL);
vm86 = &td->td_pcb->pcb_ext->ext_vm86;
if (vm86->vm86_inited == 0)
return (EINVAL);
/* Go back to user mode if both flags are set. */
if ((eflags & PSL_VIP) && (eflags & PSL_VIF)) {
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
}
if (vm86->vm86_has_vme) {
eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
(eflags & VME_USERCHANGE) | PSL_VM;
} else {
vm86->vm86_eflags = eflags; /* save VIF, VIP */
eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
(eflags & VM_USERCHANGE) | PSL_VM;
}
bcopy(&ucp->uc_mcontext.mc_fs, tf, sizeof(struct trapframe));
tf->tf_eflags = eflags;
tf->tf_vm86_ds = tf->tf_ds;
tf->tf_vm86_es = tf->tf_es;
tf->tf_vm86_fs = tf->tf_fs;
tf->tf_vm86_gs = ucp->uc_mcontext.mc_gs;
tf->tf_ds = _udatasel;
tf->tf_es = _udatasel;
tf->tf_fs = _udatasel;
} else {
/*
* Don't allow users to change privileged or reserved flags.
*/
/*
* XXX do allow users to change the privileged flag PSL_RF.
* The cpu sets PSL_RF in tf_eflags for faults. Debuggers
* should sometimes set it there too. tf_eflags is kept in
* the signal context during signal handling and there is no
* other place to remember it, so the PSL_RF bit may be
* corrupted by the signal handler without us knowing.
* Corruption of the PSL_RF bit at worst causes one more or
* one less debugger trap, so allowing it is fairly harmless.
*/
if (!EFL_SECURE(eflags & ~PSL_RF, regs->tf_eflags & ~PSL_RF)) {
uprintf("pid %d (%s): sigreturn eflags = 0x%x\n",
td->td_proc->p_pid, td->td_name, eflags);
return (EINVAL);
}
/*
* Don't allow users to load a valid privileged %cs. Let the
* hardware check for invalid selectors, excess privilege in
* other selectors, invalid %eip's and invalid %esp's.
*/
cs = ucp->uc_mcontext.mc_cs;
if (!CS_SECURE(cs)) {
uprintf("pid %d (%s): sigreturn cs = 0x%x\n",
td->td_proc->p_pid, td->td_name, cs);
ksiginfo_init_trap(&ksi);
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
ksi.ksi_trapno = T_PROTFLT;
ksi.ksi_addr = (void *)regs->tf_eip;
trapsignal(td, &ksi);
return (EINVAL);
}
ret = set_fpcontext(td, &ucp->uc_mcontext);
if (ret != 0)
return (ret);
bcopy(&ucp->uc_mcontext.mc_fs, regs, sizeof(*regs));
}
#if defined(COMPAT_43)
if (ucp->uc_mcontext.mc_onstack & 1)
td->td_sigstk.ss_flags |= SS_ONSTACK;
else
td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif
kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
return (EJUSTRETURN);
}
/*
* Machine dependent boot() routine
*
* I haven't seen anything to put here yet
* Possibly some stuff might be grafted back here from boot()
*/
void
cpu_boot(int howto)
{
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* Not applicable */
}
/* Get current clock frequency for the given cpu id. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
uint64_t tsc1, tsc2;
register_t reg;
if (pcpu_find(cpu_id) == NULL || rate == NULL)
return (EINVAL);
if ((cpu_feature & CPUID_TSC) == 0)
return (EOPNOTSUPP);
#ifdef SMP
if (smp_cpus > 1) {
/* Schedule ourselves on the indicated cpu. */
thread_lock(curthread);
sched_bind(curthread, cpu_id);
thread_unlock(curthread);
}
#endif
/* Calibrate by measuring a short delay. */
reg = intr_disable();
tsc1 = rdtsc();
DELAY(1000);
tsc2 = rdtsc();
intr_restore(reg);
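/* The delay was ~1000us, so scale the TSC delta up to cycles per second. */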
*rate = (tsc2 - tsc1) * 1000;
#ifdef SMP
if (smp_cpus > 1) {
thread_lock(curthread);
sched_unbind(curthread);
thread_unlock(curthread);
}
#endif
return (0);
}
/*
* Shutdown the CPU as much as possible
*/
void
cpu_halt(void)
{
for (;;)
__asm__ ("hlt");
}
static int idle_mwait = 1; /* Use MONITOR/MWAIT for short idle. */
TUNABLE_INT("machdep.idle_mwait", &idle_mwait);
SYSCTL_INT(_machdep, OID_AUTO, idle_mwait, CTLFLAG_RW, &idle_mwait,
0, "Use MONITOR/MWAIT for short idle");
#define STATE_RUNNING 0x0
#define STATE_MWAIT 0x1
#define STATE_SLEEPING 0x2
static void
cpu_idle_hlt(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_SLEEPING;
/*
* We must absolutely guarantee that hlt is the next instruction
* after sti or we introduce a timing window.
*/
disable_intr();
if (sched_runnable())
enable_intr();
else
__asm __volatile("sti; hlt");
*state = STATE_RUNNING;
}
/*
* MWAIT cpu power states. Lower 4 bits are sub-states.
*/
#define MWAIT_C0 0xf0
#define MWAIT_C1 0x00
#define MWAIT_C2 0x10
#define MWAIT_C3 0x20
#define MWAIT_C4 0x30
static void
cpu_idle_mwait(int busy)
{
int *state;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_MWAIT;
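/*
 * Arm MONITOR on the per-CPU state word; a write to it (see
 * cpu_idle_wakeup()) or an interrupt ends the MWAIT.
 */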
if (!sched_runnable()) {
cpu_monitor(state, 0, 0);
if (*state == STATE_MWAIT)
cpu_mwait(0, MWAIT_C1);
}
*state = STATE_RUNNING;
}
static void
cpu_idle_spin(int busy)
{
int *state;
int i;
state = (int *)PCPU_PTR(monitorbuf);
*state = STATE_RUNNING;
for (i = 0; i < 1000; i++) {
if (sched_runnable())
return;
cpu_spinwait();
}
}
void (*cpu_idle_fn)(int) = cpu_idle_hlt;
void
cpu_idle(int busy)
{
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d",
busy, curcpu);
#ifdef MP_WATCHDOG
ap_watchdog(PCPU_GET(cpuid));
#endif
/* If we are busy - try to use fast methods. */
if (busy) {
if ((cpu_feature2 & CPUID2_MON) && idle_mwait) {
cpu_idle_mwait(busy);
goto out;
}
}
/* If we have time - switch timers into idle mode. */
if (!busy) {
critical_enter();
cpu_idleclock();
}
/* Call main idle method. */
cpu_idle_fn(busy);
/* Switch timers back into active mode. */
if (!busy) {
cpu_activeclock();
critical_exit();
}
out:
CTR2(KTR_SPARE2, "cpu_idle(%d) at %d done",
busy, curcpu);
}
int
cpu_idle_wakeup(int cpu)
{
struct pcpu *pcpu;
int *state;
pcpu = pcpu_find(cpu);
state = (int *)pcpu->pc_monitorbuf;
/*
* This doesn't need to be atomic since missing the race will
* simply result in unnecessary IPIs.
*/
if (*state == STATE_SLEEPING)
return (0);
if (*state == STATE_MWAIT)
*state = STATE_RUNNING;
return (1);
}
/*
* Ordered by speed/power consumption.
*/
struct {
void *id_fn;
char *id_name;
} idle_tbl[] = {
{ cpu_idle_spin, "spin" },
{ cpu_idle_mwait, "mwait" },
{ cpu_idle_hlt, "hlt" },
{ NULL, NULL }
};
static int
idle_sysctl_available(SYSCTL_HANDLER_ARGS)
{
char *avail, *p;
int error;
int i;
avail = malloc(256, M_TEMP, M_WAITOK);
p = avail;
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
p += sprintf(p, "%s%s", p != avail ? ", " : "",
idle_tbl[i].id_name);
}
error = sysctl_handle_string(oidp, avail, 0, req);
free(avail, M_TEMP);
return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle_available, CTLTYPE_STRING | CTLFLAG_RD,
0, 0, idle_sysctl_available, "A", "list of available idle functions");
static int
idle_sysctl(SYSCTL_HANDLER_ARGS)
{
char buf[16];
int error;
char *p;
int i;
p = "unknown";
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (idle_tbl[i].id_fn == cpu_idle_fn) {
p = idle_tbl[i].id_name;
break;
}
}
strncpy(buf, p, sizeof(buf));
error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
if (error != 0 || req->newptr == NULL)
return (error);
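/*
 * Look up the requested idle routine by name, skipping mwait when
 * the CPU lacks MONITOR/MWAIT support.
 */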
for (i = 0; idle_tbl[i].id_name != NULL; i++) {
if (strstr(idle_tbl[i].id_name, "mwait") &&
(cpu_feature2 & CPUID2_MON) == 0)
continue;
if (strcmp(idle_tbl[i].id_name, buf))
continue;
cpu_idle_fn = idle_tbl[i].id_fn;
return (0);
}
return (EINVAL);
}
SYSCTL_PROC(_machdep, OID_AUTO, idle, CTLTYPE_STRING | CTLFLAG_RW, 0, 0,
idle_sysctl, "A", "currently selected idle function");
uint64_t (*atomic_load_acq_64)(volatile uint64_t *) =
atomic_load_acq_64_i386;
void (*atomic_store_rel_64)(volatile uint64_t *, uint64_t) =
atomic_store_rel_64_i386;
static void
cpu_probe_cmpxchg8b(void)
{
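/*
 * If the CPU implements CMPXCHG8B, use the i586 versions of the
 * 64-bit atomic load/store helpers, which rely on that instruction.
 */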
if ((cpu_feature & CPUID_CX8) != 0) {
atomic_load_acq_64 = atomic_load_acq_64_i586;
atomic_store_rel_64 = atomic_store_rel_64_i586;
}
}
/*
* Reset registers to default values on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *regs = td->td_frame;
struct pcb *pcb = td->td_pcb;
/* Reset pc->pcb_gs and %gs before possibly invalidating it. */
pcb->pcb_gs = _udatasel;
load_gs(_udatasel);
mtx_lock_spin(&dt_lock);
if (td->td_proc->p_md.md_ldt)
user_ldt_free(td);
else
mtx_unlock_spin(&dt_lock);
bzero((char *)regs, sizeof(struct trapframe));
regs->tf_eip = imgp->entry_addr;
regs->tf_esp = stack;
regs->tf_eflags = PSL_USER | (regs->tf_eflags & PSL_T);
regs->tf_ss = _udatasel;
regs->tf_ds = _udatasel;
regs->tf_es = _udatasel;
regs->tf_fs = _udatasel;
regs->tf_cs = _ucodesel;
/* PS_STRINGS value for BSD/OS binaries. It is 0 for non-BSD/OS. */
regs->tf_ebx = imgp->ps_strings;
/*
* Reset the hardware debug registers if they were in use.
* They won't have any meaning for the newly exec'd process.
*/
if (pcb->pcb_flags & PCB_DBREGS) {
pcb->pcb_dr0 = 0;
pcb->pcb_dr1 = 0;
pcb->pcb_dr2 = 0;
pcb->pcb_dr3 = 0;
pcb->pcb_dr6 = 0;
pcb->pcb_dr7 = 0;
if (pcb == PCPU_GET(curpcb)) {
/*
* Clear the debug registers on the running
* CPU, otherwise they will end up affecting
* the next process we switch to.
*/
reset_dbregs();
}
pcb->pcb_flags &= ~PCB_DBREGS;
}
/*
* Initialize the math emulator (if any) for the current process.
* Actually, just clear the bit that says that the emulator has
* been initialized. Initialization is delayed until the process
* traps to the emulator (if it is done at all) mainly because
* emulators don't provide an entry point for initialization.
*/
td->td_pcb->pcb_flags &= ~FP_SOFTFP;
pcb->pcb_initial_npxcw = __INITIAL_NPXCW__;
/*
* Drop the FP state if we hold it, so that the process gets a
* clean FP state if it uses the FPU again.
*/
fpstate_drop(td);
/*
* XXX - Linux emulator
* Make sure edx is 0x0 on entry. Linux binaries depend
* on it.
*/
td->td_retval[1] = 0;
}
void
cpu_setregs(void)
{
unsigned int cr0;
cr0 = rcr0();
/*
* CR0_MP, CR0_NE and CR0_TS are set for NPX (FPU) support:
*
* Prepare to trap all ESC (i.e., NPX) instructions and all WAIT
* instructions. We must set the CR0_MP bit and use the CR0_TS
* bit to control the trap, because setting the CR0_EM bit does
* not cause WAIT instructions to trap. It's important to trap
* WAIT instructions - otherwise the "wait" variants of no-wait
* control instructions would degenerate to the "no-wait" variants
* after FP context switches but work correctly otherwise. It's
* particularly important to trap WAITs when there is no NPX -
* otherwise the "wait" variants would always degenerate.
*
* Try setting CR0_NE to get correct error reporting on 486DX's.
* Setting it should fail or do nothing on lesser processors.
*/
cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
load_cr0(cr0);
load_gs(_udatasel);
}
u_long bootdev; /* not a struct cdev *- encoding is different */
SYSCTL_ULONG(_machdep, OID_AUTO, guessed_bootdev,
CTLFLAG_RD, &bootdev, 0, "Maybe the Boot device (not in struct cdev *format)");
/*
* Initialize 386 and configure to run kernel
*/
/*
* Initialize segments & interrupt table
*/
int _default_ldt;
union descriptor gdt[NGDT * MAXCPU]; /* global descriptor table */
union descriptor ldt[NLDT]; /* local descriptor table */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0]; /* interrupt descriptor table */
struct region_descriptor r_gdt, r_idt; /* table descriptors */
struct mtx dt_lock; /* lock for GDT and LDT */
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
extern int has_f00f_bug;
#endif
static struct i386tss dblfault_tss;
static char dblfault_stack[PAGE_SIZE];
extern vm_offset_t proc0kstack;
/*
* software prototypes -- in more palatable form.
*
* GCODE_SEL through GUDATA_SEL must be in this order for syscall/sysret
* GUFS_SEL and GUGS_SEL must be in this order (swtch.s knows it)
*/
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL 0 Null Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = SEL_KPL,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPRIV_SEL 1 SMP Per-Processor Private Data Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUFS_SEL 2 %fs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUGS_SEL 3 %gs Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GCODE_SEL 4 Code Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GDATA_SEL 5 Data Descriptor for kernel */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUCODE_SEL 6 Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GUDATA_SEL 7 Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSLOWMEM_SEL 8 BIOS access to realmode segment 0x40, must be #8 in GDT */
{ .ssd_base = 0x400,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_KPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
{
.ssd_base = 0x0,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GLDT_SEL 10 LDT Descriptor */
{ .ssd_base = (int) ldt,
.ssd_limit = sizeof(ldt)-1,
.ssd_type = SDT_SYSLDT,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GUSERLDT_SEL 11 User LDT Descriptor per process */
{ .ssd_base = (int) ldt,
.ssd_limit = (512 * sizeof(union descriptor)-1),
.ssd_type = SDT_SYSLDT,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GPANIC_SEL 12 Panic Tss Descriptor */
{ .ssd_base = (int) &dblfault_tss,
.ssd_limit = sizeof(struct i386tss)-1,
.ssd_type = SDT_SYS386TSS,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* GBIOSCODE32_SEL 13 BIOS 32-bit interface (32bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSCODE16_SEL 14 BIOS 32-bit interface (16bit Code) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSDATA_SEL 15 BIOS 32-bit interface (Data) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* GBIOSUTIL_SEL 16 BIOS 16-bit interface (Utility) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GBIOSARGS_SEL 17 BIOS 16-bit interface (Arguments) */
{ .ssd_base = 0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = 0,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 1 },
/* GNDIS_SEL 18 NDIS Descriptor */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
};
static struct soft_segment_descriptor ldt_segs[] = {
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Code Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMERA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
/* Null Descriptor - overwritten by call gate */
{ .ssd_base = 0x0,
.ssd_limit = 0x0,
.ssd_type = 0,
.ssd_dpl = 0,
.ssd_p = 0,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 0,
.ssd_gran = 0 },
/* Data Descriptor for user */
{ .ssd_base = 0x0,
.ssd_limit = 0xfffff,
.ssd_type = SDT_MEMRWA,
.ssd_dpl = SEL_UPL,
.ssd_p = 1,
.ssd_xx = 0, .ssd_xx1 = 0,
.ssd_def32 = 1,
.ssd_gran = 1 },
};
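/*
 * Fill in an IDT gate descriptor: point entry 'idx' at handler 'func'
 * with the given gate type, privilege level and code selector.
 */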
void
setidt(idx, func, typ, dpl, selec)
int idx;
inthand_t *func;
int typ;
int dpl;
int selec;
{
struct gate_descriptor *ip;
ip = idt + idx;
ip->gd_looffset = (int)func;
ip->gd_selector = selec;
ip->gd_stkcpy = 0;
ip->gd_xx = 0;
ip->gd_type = typ;
ip->gd_dpl = dpl;
ip->gd_p = 1;
ip->gd_hioffset = ((int)func)>>16 ;
}
extern inthand_t
IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
IDTVEC(xmm), IDTVEC(lcall_syscall), IDTVEC(int0x80_syscall);
#ifdef DDB
/*
* Display the index and function name of any IDT entries that don't use
* the default 'rsvd' entry point.
*/
DB_SHOW_COMMAND(idt, db_show_idt)
{
struct gate_descriptor *ip;
int idx;
uintptr_t func;
ip = idt;
for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
func = (ip->gd_hioffset << 16 | ip->gd_looffset);
if (func != (uintptr_t)&IDTVEC(rsvd)) {
db_printf("%3d\t", idx);
db_printsym(func, DB_STGY_PROC);
db_printf("\n");
}
ip++;
}
}
/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
uint64_t idtr, gdtr;
idtr = ridt();
db_printf("idtr\t0x%08x/%04x\n",
(u_int)(idtr >> 16), (u_int)idtr & 0xffff);
gdtr = rgdt();
db_printf("gdtr\t0x%08x/%04x\n",
(u_int)(gdtr >> 16), (u_int)gdtr & 0xffff);
db_printf("ldtr\t0x%04x\n", rldt());
db_printf("tr\t0x%04x\n", rtr());
db_printf("cr0\t0x%08x\n", rcr0());
db_printf("cr2\t0x%08x\n", rcr2());
db_printf("cr3\t0x%08x\n", rcr3());
db_printf("cr4\t0x%08x\n", rcr4());
}
#endif
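/*
 * Convert a hardware segment descriptor into the software
 * (soft_segment_descriptor) representation.
 */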
void
sdtossd(sd, ssd)
struct segment_descriptor *sd;
struct soft_segment_descriptor *ssd;
{
ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
ssd->ssd_type = sd->sd_type;
ssd->ssd_dpl = sd->sd_dpl;
ssd->ssd_p = sd->sd_p;
ssd->ssd_def32 = sd->sd_def32;
ssd->ssd_gran = sd->sd_gran;
}
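/*
 * Sanity-check the BIOS-reported base memory size and map the region
 * between the end of base memory and ISA_HOLE_START for both the
 * kernel and the vm86 page table.
 */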
static void
basemem_setup(void)
{
vm_paddr_t pa;
pt_entry_t *pte;
int i;
if (basemem > 640) {
printf("Preposterous BIOS basemem of %uK, truncating to 640K\n",
basemem);
basemem = 640;
}
/*
* XXX if biosbasemem is now < 640, there is a `hole'
* between the end of base memory and the start of
* ISA memory. The hole may be empty or it may
* contain BIOS code or data. Map it read/write so
* that the BIOS can write to it. (Memory from 0 to
* the physical end of the kernel is mapped read-only
* to begin with and then parts of it are remapped.
* The parts that aren't remapped form holes that
* remain read-only and are unused by the kernel.
* The base memory area is below the physical end of
* the kernel and right now forms a read-only hole.
* The part of it from PAGE_SIZE to
* (trunc_page(biosbasemem * 1024) - 1) will be
* remapped and used by the kernel later.)
*
* This code is similar to the code used in
* pmap_mapdev, but since no memory needs to be
* allocated we simply change the mapping.
*/
for (pa = trunc_page(basemem * 1024);
pa < ISA_HOLE_START; pa += PAGE_SIZE)
pmap_kenter(KERNBASE + pa, pa);
/*
* Map pages between basemem and ISA_HOLE_START, if any, r/w into
* the vm86 page table so that vm86 can scribble on them using
* the vm86 map too. XXX: why 2 ways for this and only 1 way for
* page 0, at least as initialized here?
*/
pte = (pt_entry_t *)vm86paddr;
for (i = basemem / 4; i < 160; i++)
pte[i] = (i << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
}
/*
* Populate the (physmap) array with base/bound pairs describing the
* available physical memory in the system, then test this memory and
* build the phys_avail array describing the actually-available memory.
*
* If we cannot accurately determine the physical memory map, then use
* value from the 0xE801 call, and failing that, the RTC.
*
* Total memory size may be set by the kernel environment variable
* hw.physmem or the compile-time define MAXMEM.
*
* XXX first should be vm_paddr_t.
*/
static void
getmemsize(int first)
{
int off, physmap_idx, pa_indx, da_indx;
u_long physmem_tunable, memtest;
vm_paddr_t physmap[PHYSMAP_SIZE];
pt_entry_t *pte;
quad_t dcons_addr, dcons_size;
int i;
int pg_n;
u_int extmem;
u_int under16;
vm_paddr_t pa;
bzero(physmap, sizeof(physmap));
/* XXX - some EPSON machines can't use PG_N */
pg_n = PG_N;
if (pc98_machine_type & M_EPSON_PC98) {
switch (epson_machine_id) {
#ifdef WB_CACHE
default:
#endif
case EPSON_PC486_HX:
case EPSON_PC486_HG:
case EPSON_PC486_HA:
pg_n = 0;
break;
}
}
under16 = pc98_getmemsize(&basemem, &extmem);
basemem_setup();
physmap[0] = 0;
physmap[1] = basemem * 1024;
physmap_idx = 2;
physmap[physmap_idx] = 0x100000;
physmap[physmap_idx + 1] = physmap[physmap_idx] + extmem * 1024;
/*
* Now, physmap contains a map of physical memory.
*/
#ifdef SMP
/* make hole for AP bootstrap code */
physmap[1] = mp_bootaddress(physmap[1]);
#endif
/*
* Maxmem isn't the "maximum memory", it's one larger than the
* highest page of the physical address space. It should be
* called something like "Maxphyspage". We may adjust this
* based on ``hw.physmem'' and the results of the memory test.
*/
Maxmem = atop(physmap[physmap_idx + 1]);
#ifdef MAXMEM
Maxmem = MAXMEM / 4;
#endif
if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
Maxmem = atop(physmem_tunable);
/*
* By default keep the memtest enabled. Use a general name so that
* one could eventually do more with the code than just disable it.
*/
memtest = 1;
TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
if (atop(physmap[physmap_idx + 1]) != Maxmem &&
(boothowto & RB_VERBOSE))
printf("Physical memory use set to %ldK\n", Maxmem * 4);
/*
* If Maxmem has been increased beyond what the system has detected,
* extend the last memory segment to the new limit.
*/
if (atop(physmap[physmap_idx + 1]) < Maxmem)
physmap[physmap_idx + 1] = ptoa((vm_paddr_t)Maxmem);
/*
* We need to divide the chunk if Maxmem is larger than 16MB and
* the under-16MB area is not full of memory.
* (1) the system area (15-16MB region) is cut off
* (2) extended memory exists only above the 16MB area (e.g. Melco "HYPERMEMORY")
*/
if ((under16 != 16 * 1024) && (extmem > 15 * 1024)) {
/* The 15M-16M region is cut off, so we need to divide the chunk */
physmap[physmap_idx + 1] = under16 * 1024;
physmap_idx += 2;
physmap[physmap_idx] = 0x1000000;
physmap[physmap_idx + 1] = physmap[2] + extmem * 1024;
}
/* call pmap initialization to make new kernel address space */
pmap_bootstrap(first);
/*
* Size up each available chunk of physical memory.
*/
physmap[0] = PAGE_SIZE; /* mask off page 0 */
pa_indx = 0;
da_indx = 1;
phys_avail[pa_indx++] = physmap[0];
phys_avail[pa_indx] = physmap[0];
dump_avail[da_indx] = physmap[0];
pte = CMAP1;
/*
* Get dcons buffer address
*/
if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
getenv_quad("dcons.size", &dcons_size) == 0)
dcons_addr = 0;
/*
* physmap is in bytes, so when converting to page boundaries,
* round up the start address and round down the end address.
*/
for (i = 0; i <= physmap_idx; i += 2) {
vm_paddr_t end;
end = ptoa((vm_paddr_t)Maxmem);
if (physmap[i + 1] < end)
end = trunc_page(physmap[i + 1]);
for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
int tmp, page_bad, full;
int *ptr = (int *)CADDR1;
full = FALSE;
/*
* block out kernel memory as not available.
*/
if (pa >= KERNLOAD && pa < first)
goto do_dump_avail;
/*
* block out dcons buffer
*/
if (dcons_addr > 0
&& pa >= trunc_page(dcons_addr)
&& pa < dcons_addr + dcons_size)
goto do_dump_avail;
page_bad = FALSE;
if (memtest == 0)
goto skip_memtest;
/*
* map page into kernel: valid, read/write, non-cacheable
*/
*pte = pa | PG_V | PG_RW | pg_n;
invltlb();
tmp = *(int *)ptr;
/*
* Test for alternating 1's and 0's
*/
*(volatile int *)ptr = 0xaaaaaaaa;
if (*(volatile int *)ptr != 0xaaaaaaaa)
page_bad = TRUE;
/*
* Test for alternating 0's and 1's
*/
*(volatile int *)ptr = 0x55555555;
if (*(volatile int *)ptr != 0x55555555)
page_bad = TRUE;
/*
* Test for all 1's
*/
*(volatile int *)ptr = 0xffffffff;
if (*(volatile int *)ptr != 0xffffffff)
page_bad = TRUE;
/*
* Test for all 0's
*/
*(volatile int *)ptr = 0x0;
if (*(volatile int *)ptr != 0x0)
page_bad = TRUE;
/*
* Restore original value.
*/
*(int *)ptr = tmp;
skip_memtest:
/*
* Adjust array of valid/good pages.
*/
if (page_bad == TRUE)
continue;
/*
* If this good page is a continuation of the
* previous set of good pages, then just increase
* the end pointer. Otherwise start a new chunk.
* Note that "end" points one higher than end,
* making the range >= start and < end.
* If we're also doing a speculative memory
* test and we at or past the end, bump up Maxmem
* so that we keep going. The first bad page
* will terminate the loop.
*/
if (phys_avail[pa_indx] == pa) {
phys_avail[pa_indx] += PAGE_SIZE;
} else {
pa_indx++;
if (pa_indx == PHYS_AVAIL_ARRAY_END) {
printf(
"Too many holes in the physical address space, giving up\n");
pa_indx--;
full = TRUE;
goto do_dump_avail;
}
phys_avail[pa_indx++] = pa; /* start */
phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
}
physmem++;
do_dump_avail:
if (dump_avail[da_indx] == pa) {
dump_avail[da_indx] += PAGE_SIZE;
} else {
da_indx++;
if (da_indx == DUMP_AVAIL_ARRAY_END) {
da_indx--;
goto do_next;
}
dump_avail[da_indx++] = pa; /* start */
dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
}
do_next:
if (full)
break;
}
}
*pte = 0;
invltlb();
/*
* XXX
* The last chunk must contain at least one page plus the message
* buffer to avoid complicating other code (message buffer address
* calculation, etc.).
*/
while (phys_avail[pa_indx - 1] + PAGE_SIZE +
round_page(msgbufsize) >= phys_avail[pa_indx]) {
physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
phys_avail[pa_indx--] = 0;
phys_avail[pa_indx--] = 0;
}
Maxmem = atop(phys_avail[pa_indx]);
/* Trim off space for the message buffer. */
phys_avail[pa_indx] -= round_page(msgbufsize);
/* Map the message buffer. */
for (off = 0; off < round_page(msgbufsize); off += PAGE_SIZE)
pmap_kenter((vm_offset_t)msgbufp + off, phys_avail[pa_indx] +
off);
}
void
init386(first)
int first;
{
struct gate_descriptor *gdp;
int gsel_tss, metadata_missing, x, pa;
size_t kstack0_sz;
struct pcpu *pc;
thread0.td_kstack = proc0kstack;
thread0.td_kstack_pages = KSTACK_PAGES;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
thread0.td_pcb = (struct pcb *)(thread0.td_kstack + kstack0_sz) - 1;
/*
* This may be done better later if it gets more high level
* components in it. If so just link td->td_proc here.
*/
proc_linkup0(&proc0, &thread0);
/*
* Initialize DMAC
*/
pc98_init_dmac();
metadata_missing = 0;
if (bootinfo.bi_modulep) {
preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
preload_bootstrap_relocate(KERNBASE);
} else {
metadata_missing = 1;
}
if (envmode == 1)
kern_envp = static_env;
else if (bootinfo.bi_envp)
kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
/* Init basic tunables, hz etc */
init_param1();
/*
* Make gdt memory segments. All segments cover the full 4GB
* of address space and permissions are enforced at page level.
*/
gdt_segs[GCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUCODE_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUDATA_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUFS_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GUGS_SEL].ssd_limit = atop(0 - 1);
pc = &__pcpu[0];
gdt_segs[GPRIV_SEL].ssd_limit = atop(0 - 1);
gdt_segs[GPRIV_SEL].ssd_base = (int) pc;
gdt_segs[GPROC0_SEL].ssd_base = (int) &pc->pc_common_tss;
for (x = 0; x < NGDT; x++)
ssdtosd(&gdt_segs[x], &gdt[x].sd);
r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
r_gdt.rd_base = (int) gdt;
mtx_init(&dt_lock, "descriptor tables", NULL, MTX_SPIN);
lgdt(&r_gdt);
pcpu_init(pc, 0, sizeof(struct pcpu));
for (pa = first; pa < first + DPCPU_SIZE; pa += PAGE_SIZE)
pmap_kenter(pa + KERNBASE, pa);
dpcpu_init((void *)(first + KERNBASE), 0);
first += DPCPU_SIZE;
PCPU_SET(prvspace, pc);
PCPU_SET(curthread, &thread0);
PCPU_SET(curpcb, thread0.td_pcb);
/*
* Initialize mutexes.
*
* icu_lock: in order to allow an interrupt to occur in a critical
* section, to set pcpu->ipending (etc...) properly, we
* must be able to get the icu lock, so it can't be
* under witness.
*/
mutex_init();
mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS | MTX_NOPROFILE);
/* make ldt memory segments */
ldt_segs[LUCODE_SEL].ssd_limit = atop(0 - 1);
ldt_segs[LUDATA_SEL].ssd_limit = atop(0 - 1);
for (x = 0; x < sizeof ldt_segs / sizeof ldt_segs[0]; x++)
ssdtosd(&ldt_segs[x], &ldt[x].sd);
_default_ldt = GSEL(GLDT_SEL, SEL_KPL);
lldt(_default_ldt);
PCPU_SET(currentldt, _default_ldt);
/* exceptions */
for (x = 0; x < NIDT; x++)
setidt(x, &IDTVEC(rsvd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DE, &IDTVEC(div), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DB, &IDTVEC(dbg), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BP, &IDTVEC(bpt), SDT_SYS386IGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_OF, &IDTVEC(ofl), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_BR, &IDTVEC(bnd), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NM, &IDTVEC(dna), SDT_SYS386TGT, SEL_KPL
, GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_DF, 0, SDT_SYSTASKGT, SEL_KPL, GSEL(GPANIC_SEL, SEL_KPL));
setidt(IDT_FPUGP, &IDTVEC(fpusegm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_TS, &IDTVEC(tss), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_NP, &IDTVEC(missing), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SS, &IDTVEC(stk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_PF, &IDTVEC(page), SDT_SYS386IGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MF, &IDTVEC(fpu), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_AC, &IDTVEC(align), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_MC, &IDTVEC(mchk), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_XF, &IDTVEC(xmm), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_SYSCALL, &IDTVEC(int0x80_syscall), SDT_SYS386TGT, SEL_UPL,
GSEL(GCODE_SEL, SEL_KPL));
r_idt.rd_limit = sizeof(idt0) - 1;
r_idt.rd_base = (int) idt;
lidt(&r_idt);
/*
* Initialize the i8254 before the console so that console
* initialization can use DELAY().
*/
i8254_init();
/*
* Initialize the console before we print anything out.
*/
cninit();
if (metadata_missing)
printf("WARNING: loader(8) metadata is missing!\n");
#ifdef DEV_ISA
atpic_startup();
#endif
#ifdef DDB
ksym_start = bootinfo.bi_symtab;
ksym_end = bootinfo.bi_esymtab;
#endif
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
finishidentcpu(); /* Final stage of CPU initialization */
setidt(IDT_UD, &IDTVEC(ill), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
setidt(IDT_GP, &IDTVEC(prot), SDT_SYS386TGT, SEL_KPL,
GSEL(GCODE_SEL, SEL_KPL));
initializecpu(); /* Initialize CPU registers */
/* make an initial tss so cpu can get interrupt stack on syscall! */
/* Note: -16 is so we can grow the trapframe if we came from vm86 */
PCPU_SET(common_tss.tss_esp0, thread0.td_kstack +
kstack0_sz - sizeof(struct pcb) - 16);
PCPU_SET(common_tss.tss_ss0, GSEL(GDATA_SEL, SEL_KPL));
gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
PCPU_SET(tss_gdt, &gdt[GPROC0_SEL].sd);
PCPU_SET(common_tssd, *PCPU_GET(tss_gdt));
PCPU_SET(common_tss.tss_ioopt, (sizeof (struct i386tss)) << 16);
ltr(gsel_tss);
/* pointer to selector slot for %fs/%gs */
PCPU_SET(fsgs_gdt, &gdt[GUFS_SEL].sd);
dblfault_tss.tss_esp = dblfault_tss.tss_esp0 = dblfault_tss.tss_esp1 =
dblfault_tss.tss_esp2 = (int)&dblfault_stack[sizeof(dblfault_stack)];
dblfault_tss.tss_ss = dblfault_tss.tss_ss0 = dblfault_tss.tss_ss1 =
dblfault_tss.tss_ss2 = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_cr3 = (int)IdlePTD;
dblfault_tss.tss_eip = (int)dblfault_handler;
dblfault_tss.tss_eflags = PSL_KERNEL;
dblfault_tss.tss_ds = dblfault_tss.tss_es =
dblfault_tss.tss_gs = GSEL(GDATA_SEL, SEL_KPL);
dblfault_tss.tss_fs = GSEL(GPRIV_SEL, SEL_KPL);
dblfault_tss.tss_cs = GSEL(GCODE_SEL, SEL_KPL);
dblfault_tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
vm86_initialize();
getmemsize(first);
init_param2(physmem);
/* now running on new page tables, configured, and u/iom is accessible */
msgbufinit(msgbufp, msgbufsize);
/* make a call gate to reenter kernel with */
gdp = &ldt[LSYS5CALLS_SEL].gd;
x = (int) &IDTVEC(lcall_syscall);
gdp->gd_looffset = x;
gdp->gd_selector = GSEL(GCODE_SEL,SEL_KPL);
gdp->gd_stkcpy = 1;
gdp->gd_type = SDT_SYS386CGT;
gdp->gd_dpl = SEL_UPL;
gdp->gd_p = 1;
gdp->gd_hioffset = x >> 16;
/* XXX does this work? */
/* XXX yes! */
ldt[LBSDICALLS_SEL] = ldt[LSYS5CALLS_SEL];
ldt[LSOL26CALLS_SEL] = ldt[LSYS5CALLS_SEL];
/* transfer to user mode */
_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
/* setup proc 0's pcb */
thread0.td_pcb->pcb_flags = 0;
thread0.td_pcb->pcb_cr3 = (int)IdlePTD;
thread0.td_pcb->pcb_ext = 0;
thread0.td_frame = &proc0_tf;
cpu_probe_cmpxchg8b();
}
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
}
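/*
 * Disable interrupts on the outermost acquisition, keep a nesting
 * count, and enter a critical section.
 */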
void
spinlock_enter(void)
{
struct thread *td;
register_t flags;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
flags = intr_disable();
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_flags = flags;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
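/*
 * Undo spinlock_enter(): leave the critical section and restore the
 * saved interrupt state once the nesting count drops to zero.
 */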
void
spinlock_exit(void)
{
struct thread *td;
register_t flags;
td = curthread;
critical_exit();
flags = td->td_md.md_saved_flags;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
intr_restore(flags);
}
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
static void f00f_hack(void *unused);
SYSINIT(f00f_hack, SI_SUB_INTRINSIC, SI_ORDER_FIRST, f00f_hack, NULL);
static void
f00f_hack(void *unused)
{
struct gate_descriptor *new_idt;
vm_offset_t tmp;
if (!has_f00f_bug)
return;
GIANT_REQUIRED;
printf("Intel Pentium detected, installing workaround for F00F bug\n");
tmp = kmem_alloc(kernel_map, PAGE_SIZE * 2);
if (tmp == 0)
panic("kmem_alloc returned 0");
/* Put the problematic entry (#6) at the end of the lower page. */
new_idt = (struct gate_descriptor*)
(tmp + PAGE_SIZE - 7 * sizeof(struct gate_descriptor));
bcopy(idt, new_idt, sizeof(idt0));
r_idt.rd_base = (u_int)new_idt;
lidt(&r_idt);
idt = new_idt;
if (vm_map_protect(kernel_map, tmp, tmp + PAGE_SIZE,
VM_PROT_READ, FALSE) != KERN_SUCCESS)
panic("vm_map_protect failed");
}
#endif /* defined(I586_CPU) && !NO_F00F_HACK */
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_edi = tf->tf_edi;
pcb->pcb_esi = tf->tf_esi;
pcb->pcb_ebp = tf->tf_ebp;
pcb->pcb_ebx = tf->tf_ebx;
pcb->pcb_eip = tf->tf_eip;
pcb->pcb_esp = (ISPL(tf->tf_cs)) ? tf->tf_esp : (int)(tf + 1) - 8;
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{
td->td_frame->tf_eip = addr;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
td->td_frame->tf_eflags |= PSL_T;
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
td->td_frame->tf_eflags &= ~PSL_T;
return (0);
}
int
fill_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
pcb = td->td_pcb;
regs->r_gs = pcb->pcb_gs;
return (fill_frame_regs(tp, regs));
}
int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{
regs->r_fs = tp->tf_fs;
regs->r_es = tp->tf_es;
regs->r_ds = tp->tf_ds;
regs->r_edi = tp->tf_edi;
regs->r_esi = tp->tf_esi;
regs->r_ebp = tp->tf_ebp;
regs->r_ebx = tp->tf_ebx;
regs->r_edx = tp->tf_edx;
regs->r_ecx = tp->tf_ecx;
regs->r_eax = tp->tf_eax;
regs->r_eip = tp->tf_eip;
regs->r_cs = tp->tf_cs;
regs->r_eflags = tp->tf_eflags;
regs->r_esp = tp->tf_esp;
regs->r_ss = tp->tf_ss;
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct pcb *pcb;
struct trapframe *tp;
tp = td->td_frame;
if (!EFL_SECURE(regs->r_eflags, tp->tf_eflags) ||
!CS_SECURE(regs->r_cs))
return (EINVAL);
pcb = td->td_pcb;
tp->tf_fs = regs->r_fs;
tp->tf_es = regs->r_es;
tp->tf_ds = regs->r_ds;
tp->tf_edi = regs->r_edi;
tp->tf_esi = regs->r_esi;
tp->tf_ebp = regs->r_ebp;
tp->tf_ebx = regs->r_ebx;
tp->tf_edx = regs->r_edx;
tp->tf_ecx = regs->r_ecx;
tp->tf_eax = regs->r_eax;
tp->tf_eip = regs->r_eip;
tp->tf_cs = regs->r_cs;
tp->tf_eflags = regs->r_eflags;
tp->tf_esp = regs->r_esp;
tp->tf_ss = regs->r_ss;
pcb->pcb_gs = regs->r_gs;
return (0);
}
#ifdef CPU_ENABLE_SSE
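/*
 * Translate an FXSAVE (XMM) save area into the legacy save87 layout.
 */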
static void
fill_fpregs_xmm(sv_xmm, sv_87)
struct savexmm *sv_xmm;
struct save87 *sv_87;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
bzero(sv_87, sizeof(*sv_87));
/* FPU control/status */
penv_87->en_cw = penv_xmm->en_cw;
penv_87->en_sw = penv_xmm->en_sw;
penv_87->en_tw = penv_xmm->en_tw;
penv_87->en_fip = penv_xmm->en_fip;
penv_87->en_fcs = penv_xmm->en_fcs;
penv_87->en_opcode = penv_xmm->en_opcode;
penv_87->en_foo = penv_xmm->en_foo;
penv_87->en_fos = penv_xmm->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
}
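/*
 * Translate a legacy save87 area back into the FXSAVE (XMM) layout.
 */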
static void
set_fpregs_xmm(sv_87, sv_xmm)
struct save87 *sv_87;
struct savexmm *sv_xmm;
{
register struct env87 *penv_87 = &sv_87->sv_env;
register struct envxmm *penv_xmm = &sv_xmm->sv_env;
int i;
/* FPU control/status */
penv_xmm->en_cw = penv_87->en_cw;
penv_xmm->en_sw = penv_87->en_sw;
penv_xmm->en_tw = penv_87->en_tw;
penv_xmm->en_fip = penv_87->en_fip;
penv_xmm->en_fcs = penv_87->en_fcs;
penv_xmm->en_opcode = penv_87->en_opcode;
penv_xmm->en_foo = penv_87->en_foo;
penv_xmm->en_fos = penv_87->en_fos;
/* FPU registers */
for (i = 0; i < 8; ++i)
sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
}
#endif /* CPU_ENABLE_SSE */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
KASSERT(td == curthread || TD_IS_SUSPENDED(td),
("not suspended thread %p", td));
#ifdef DEV_NPX
npxgetregs(td);
#else
bzero(fpregs, sizeof(*fpregs));
#endif
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
fill_fpregs_xmm(&td->td_pcb->pcb_user_save.sv_xmm,
(struct save87 *)fpregs);
else
#endif /* CPU_ENABLE_SSE */
bcopy(&td->td_pcb->pcb_user_save.sv_87, fpregs,
sizeof(*fpregs));
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
set_fpregs_xmm((struct save87 *)fpregs,
&td->td_pcb->pcb_user_save.sv_xmm);
else
#endif /* CPU_ENABLE_SSE */
bcopy(fpregs, &td->td_pcb->pcb_user_save.sv_87,
sizeof(*fpregs));
#ifdef DEV_NPX
npxuserinited(td);
#endif
return (0);
}
/*
* Get machine context.
*/
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct trapframe *tp;
struct segment_descriptor *sdp;
tp = td->td_frame;
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(tp->tf_esp);
PROC_UNLOCK(curthread->td_proc);
mcp->mc_gs = td->td_pcb->pcb_gs;
mcp->mc_fs = tp->tf_fs;
mcp->mc_es = tp->tf_es;
mcp->mc_ds = tp->tf_ds;
mcp->mc_edi = tp->tf_edi;
mcp->mc_esi = tp->tf_esi;
mcp->mc_ebp = tp->tf_ebp;
mcp->mc_isp = tp->tf_isp;
mcp->mc_eflags = tp->tf_eflags;
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_eax = 0;
mcp->mc_edx = 0;
mcp->mc_eflags &= ~PSL_C;
} else {
mcp->mc_eax = tp->tf_eax;
mcp->mc_edx = tp->tf_edx;
}
mcp->mc_ebx = tp->tf_ebx;
mcp->mc_ecx = tp->tf_ecx;
mcp->mc_eip = tp->tf_eip;
mcp->mc_cs = tp->tf_cs;
mcp->mc_esp = tp->tf_esp;
mcp->mc_ss = tp->tf_ss;
mcp->mc_len = sizeof(*mcp);
get_fpcontext(td, mcp);
sdp = &td->td_pcb->pcb_fsd;
mcp->mc_fsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
sdp = &td->td_pcb->pcb_gsd;
mcp->mc_gsbase = sdp->sd_hibase << 24 | sdp->sd_lobase;
bzero(mcp->mc_spare1, sizeof(mcp->mc_spare1));
bzero(mcp->mc_spare2, sizeof(mcp->mc_spare2));
return (0);
}
/*
* Set machine context.
*
* However, we don't set any but the user modifiable flags, and we won't
* touch the cs selector.
*/
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct trapframe *tp;
int eflags, ret;
tp = td->td_frame;
if (mcp->mc_len != sizeof(*mcp))
return (EINVAL);
eflags = (mcp->mc_eflags & PSL_USERCHANGE) |
(tp->tf_eflags & ~PSL_USERCHANGE);
if ((ret = set_fpcontext(td, mcp)) == 0) {
tp->tf_fs = mcp->mc_fs;
tp->tf_es = mcp->mc_es;
tp->tf_ds = mcp->mc_ds;
tp->tf_edi = mcp->mc_edi;
tp->tf_esi = mcp->mc_esi;
tp->tf_ebp = mcp->mc_ebp;
tp->tf_ebx = mcp->mc_ebx;
tp->tf_edx = mcp->mc_edx;
tp->tf_ecx = mcp->mc_ecx;
tp->tf_eax = mcp->mc_eax;
tp->tf_eip = mcp->mc_eip;
tp->tf_eflags = eflags;
tp->tf_esp = mcp->mc_esp;
tp->tf_ss = mcp->mc_ss;
td->td_pcb->pcb_gs = mcp->mc_gs;
ret = 0;
}
return (ret);
}
static void
get_fpcontext(struct thread *td, mcontext_t *mcp)
{
#ifndef DEV_NPX
mcp->mc_fpformat = _MC_FPFMT_NODEV;
mcp->mc_ownedfp = _MC_FPOWNED_NONE;
bzero(mcp->mc_fpstate, sizeof(mcp->mc_fpstate));
#else
mcp->mc_ownedfp = npxgetregs(td);
bcopy(&td->td_pcb->pcb_user_save, &mcp->mc_fpstate,
sizeof(mcp->mc_fpstate));
mcp->mc_fpformat = npxformat();
#endif
}
static int
set_fpcontext(struct thread *td, const mcontext_t *mcp)
{
if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
return (0);
else if (mcp->mc_fpformat != _MC_FPFMT_387 &&
mcp->mc_fpformat != _MC_FPFMT_XMM)
return (EINVAL);
else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE)
/* We don't care what state is left in the FPU or PCB. */
fpstate_drop(td);
else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
#ifdef DEV_NPX
#ifdef CPU_ENABLE_SSE
if (cpu_fxsr)
((union savefpu *)&mcp->mc_fpstate)->sv_xmm.sv_env.
en_mxcsr &= cpu_mxcsr_mask;
#endif
npxsetregs(td, (union savefpu *)&mcp->mc_fpstate);
#endif
} else
return (EINVAL);
return (0);
}
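/*
 * Discard any FPU state owned by the thread so that fresh state is
 * loaded on its next FPU use.
 */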
static void
fpstate_drop(struct thread *td)
{
critical_enter();
#ifdef DEV_NPX
if (PCPU_GET(fpcurthread) == td)
npxdrop();
#endif
/*
* XXX force a full drop of the npx. The above only drops it if we
* owned it. npxgetregs() has the same bug in the !cpu_fxsr case.
*
* XXX I don't much like npxgetregs()'s semantics of doing a full
* drop. Dropping only to the pcb matches fnsave's behaviour.
* We only need to drop to !PCB_INITDONE in sendsig(). But
* sendsig() is the only caller of npxgetregs()... perhaps we just
* have too many layers.
*/
curthread->td_pcb->pcb_flags &= ~(PCB_NPXINITDONE |
PCB_NPXUSERINITDONE);
critical_exit();
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
if (td == NULL) {
dbregs->dr[0] = rdr0();
dbregs->dr[1] = rdr1();
dbregs->dr[2] = rdr2();
dbregs->dr[3] = rdr3();
dbregs->dr[4] = rdr4();
dbregs->dr[5] = rdr5();
dbregs->dr[6] = rdr6();
dbregs->dr[7] = rdr7();
} else {
pcb = td->td_pcb;
dbregs->dr[0] = pcb->pcb_dr0;
dbregs->dr[1] = pcb->pcb_dr1;
dbregs->dr[2] = pcb->pcb_dr2;
dbregs->dr[3] = pcb->pcb_dr3;
dbregs->dr[4] = 0;
dbregs->dr[5] = 0;
dbregs->dr[6] = pcb->pcb_dr6;
dbregs->dr[7] = pcb->pcb_dr7;
}
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
struct pcb *pcb;
int i;
if (td == NULL) {
load_dr0(dbregs->dr[0]);
load_dr1(dbregs->dr[1]);
load_dr2(dbregs->dr[2]);
load_dr3(dbregs->dr[3]);
load_dr4(dbregs->dr[4]);
load_dr5(dbregs->dr[5]);
load_dr6(dbregs->dr[6]);
load_dr7(dbregs->dr[7]);
} else {
/*
* Don't let an illegal value for dr7 get set. Specifically,
* check for undefined settings. Setting these bit patterns
* results in undefined behaviour and can lead to an unexpected
* TRCTRAP.
*/
for (i = 0; i < 4; i++) {
if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
return (EINVAL);
if (DBREG_DR7_LEN(dbregs->dr[7], i) == 0x02)
return (EINVAL);
}
pcb = td->td_pcb;
/*
* Don't let a process set a breakpoint that is not within the
* process's address space. If a process could do this, it
* could halt the system by setting a breakpoint in the kernel
* (if ddb was enabled). Thus, we need to check to make sure
* that no breakpoints are being enabled for addresses outside
* the process's address space.
*
* XXX - what about when the watched area of the user's
* address space is written into from within the kernel
* ... wouldn't that still cause a breakpoint to be generated
* from within kernel mode?
*/
if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
/* dr0 is enabled */
if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
/* dr1 is enabled */
if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
/* dr2 is enabled */
if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
/* dr3 is enabled */
if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
return (EINVAL);
}
pcb->pcb_dr0 = dbregs->dr[0];
pcb->pcb_dr1 = dbregs->dr[1];
pcb->pcb_dr2 = dbregs->dr[2];
pcb->pcb_dr3 = dbregs->dr[3];
pcb->pcb_dr6 = dbregs->dr[6];
pcb->pcb_dr7 = dbregs->dr[7];
pcb->pcb_flags |= PCB_DBREGS;
}
return (0);
}
/*
* Return > 0 if a hardware breakpoint has been hit, and the
* breakpoint was in user space. Return 0, otherwise.
*/
int
user_dbreg_trap(void)
{
u_int32_t dr7, dr6; /* debug registers dr6 and dr7 */
u_int32_t bp; /* breakpoint bits extracted from dr6 */
int nbp; /* number of breakpoints that triggered */
caddr_t addr[4]; /* breakpoint addresses */
int i;
dr7 = rdr7();
if ((dr7 & 0x000000ff) == 0) {
/*
* all GE and LE bits in the dr7 register are zero,
* thus the trap couldn't have been caused by the
* hardware debug registers
*/
return 0;
}
nbp = 0;
dr6 = rdr6();
bp = dr6 & 0x0000000f;
if (!bp) {
/*
* None of the breakpoint bits are set, meaning this
* trap was not caused by any of the debug registers
*/
return 0;
}
/*
* at least one of the breakpoints was hit, check to see
* which ones and if any of them are user space addresses
*/
if (bp & 0x01) {
addr[nbp++] = (caddr_t)rdr0();
}
if (bp & 0x02) {
addr[nbp++] = (caddr_t)rdr1();
}
if (bp & 0x04) {
addr[nbp++] = (caddr_t)rdr2();
}
if (bp & 0x08) {
addr[nbp++] = (caddr_t)rdr3();
}
for (i = 0; i < nbp; i++) {
if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
/*
* addr[i] is in user space
*/
return nbp;
}
}
/*
* None of the breakpoints are in user space.
*/
return 0;
}
#ifdef KDB
/*
* Provide inb() and outb() as functions. They are normally only available as
* inline functions, thus cannot be called from the debugger.
*/
/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);
u_char
inb_(u_short port)
{
return inb(port);
}
void
outb_(u_short port, u_char data)
{
outb(port, data);
}
#endif /* KDB */
Index: head/sys/powerpc/powerpc/exec_machdep.c
===================================================================
--- head/sys/powerpc/powerpc/exec_machdep.c (revision 225616)
+++ head/sys/powerpc/powerpc/exec_machdep.c (revision 225617)
@@ -1,1044 +1,1044 @@
/*-
* Copyright (C) 1995, 1996 Wolfgang Solfrank.
* Copyright (C) 1995, 1996 TooLs GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by TooLs GmbH.
* 4. The name of TooLs GmbH may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (C) 2001 Benno Rice
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY Benno Rice ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* $NetBSD: machdep.c,v 1.74.2.1 2000/11/01 16:13:48 tv Exp $
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/signalvar.h>
#include <sys/syscallsubr.h>
#include <sys/syscall.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/uio.h>
#include <machine/altivec.h>
#include <machine/cpu.h>
#include <machine/elf.h>
#include <machine/fpu.h>
#include <machine/pcb.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/trap.h>
#include <machine/vmparam.h>
#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_signal.h>
#include <compat/freebsd32/freebsd32_util.h>
#include <compat/freebsd32/freebsd32_proto.h>
typedef struct __ucontext32 {
sigset_t uc_sigmask;
mcontext32_t uc_mcontext;
uint32_t uc_link;
struct sigaltstack32 uc_stack;
uint32_t uc_flags;
uint32_t __spare__[4];
} ucontext32_t;
struct sigframe32 {
ucontext32_t sf_uc;
struct siginfo32 sf_si;
};
static int grab_mcontext32(struct thread *td, mcontext32_t *, int flags);
#endif
static int grab_mcontext(struct thread *, mcontext_t *, int);
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct trapframe *tf;
struct sigacts *psp;
struct sigframe sf;
struct thread *td;
struct proc *p;
#ifdef COMPAT_FREEBSD32
struct siginfo32 siginfo32;
struct sigframe32 sf32;
#endif
size_t sfpsize;
caddr_t sfp, usfp;
int oonstack, rndfsize;
int sig;
int code;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
oonstack = sigonstack(tf->fixreg[1]);
/*
* Fill siginfo structure.
*/
ksi->ksi_info.si_signo = ksi->ksi_signo;
#ifdef AIM
ksi->ksi_info.si_addr = (void *)((tf->exc == EXC_DSI) ?
tf->cpu.aim.dar : tf->srr0);
#else
ksi->ksi_info.si_addr = (void *)((tf->exc == EXC_DSI) ?
tf->cpu.booke.dear : tf->srr0);
#endif
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(p, SV_ILP32)) {
siginfo_to_siginfo32(&ksi->ksi_info, &siginfo32);
sig = siginfo32.si_signo;
code = siginfo32.si_code;
sfp = (caddr_t)&sf32;
sfpsize = sizeof(sf32);
rndfsize = ((sizeof(sf32) + 15) / 16) * 16;
/*
* Save user context
*/
memset(&sf32, 0, sizeof(sf32));
grab_mcontext32(td, &sf32.sf_uc.uc_mcontext, 0);
sf32.sf_uc.uc_sigmask = *mask;
sf32.sf_uc.uc_stack.ss_sp = (uintptr_t)td->td_sigstk.ss_sp;
sf32.sf_uc.uc_stack.ss_size = (uint32_t)td->td_sigstk.ss_size;
sf32.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf32.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
} else {
#endif
sig = ksi->ksi_signo;
code = ksi->ksi_code;
sfp = (caddr_t)&sf;
sfpsize = sizeof(sf);
#ifdef __powerpc64__
/*
* 64-bit PPC defines a 288-byte scratch region
* below the stack.
*/
rndfsize = 288 + ((sizeof(sf) + 47) / 48) * 48;
#else
rndfsize = ((sizeof(sf) + 15) / 16) * 16;
#endif
/*
* Save user context
*/
memset(&sf, 0, sizeof(sf));
grab_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
#ifdef COMPAT_FREEBSD32
}
#endif
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/*
* Allocate and validate space for the signal handler context.
*/
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
usfp = (void *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - rndfsize);
} else {
usfp = (void *)(tf->fixreg[1] - rndfsize);
}
/*
* Translate the signal if appropriate (Linux emu ?)
*/
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/*
* Save the floating-point state, if necessary, then copy it.
*/
/* XXX */
/*
* Set up the registers to return to sigcode.
*
* r1/sp - sigframe ptr
* lr - sig function, dispatched to by blrl in trampoline
* r3 - sig number
* r4 - SIGINFO ? &siginfo : exception code
* r5 - user context
* srr0 - trampoline function addr
*/
tf->lr = (register_t)catcher;
tf->fixreg[1] = (register_t)usfp;
tf->fixreg[FIRSTARG] = sig;
#ifdef COMPAT_FREEBSD32
tf->fixreg[FIRSTARG+2] = (register_t)usfp +
((SV_PROC_FLAG(p, SV_ILP32)) ?
offsetof(struct sigframe32, sf_uc) :
offsetof(struct sigframe, sf_uc));
#else
tf->fixreg[FIRSTARG+2] = (register_t)usfp +
offsetof(struct sigframe, sf_uc);
#endif
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/*
* Signal handler installed with SA_SIGINFO.
*/
#ifdef COMPAT_FREEBSD32
if (SV_PROC_FLAG(p, SV_ILP32)) {
sf32.sf_si = siginfo32;
tf->fixreg[FIRSTARG+1] = (register_t)usfp +
offsetof(struct sigframe32, sf_si);
sf32.sf_si = siginfo32;
} else {
#endif
tf->fixreg[FIRSTARG+1] = (register_t)usfp +
offsetof(struct sigframe, sf_si);
sf.sf_si = ksi->ksi_info;
#ifdef COMPAT_FREEBSD32
}
#endif
} else {
/* Old FreeBSD-style arguments. */
tf->fixreg[FIRSTARG+1] = code;
#ifdef AIM
tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ?
tf->cpu.aim.dar : tf->srr0;
#else
tf->fixreg[FIRSTARG+3] = (tf->exc == EXC_DSI) ?
tf->cpu.booke.dear : tf->srr0;
#endif
}
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
tf->srr0 = (register_t)p->p_sysent->sv_sigcode_base;
/*
* copy the frame out to userland.
*/
if (copyout(sfp, usfp, sfpsize) != 0) {
/*
* Process has trashed its stack. Kill it.
*/
CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
PROC_LOCK(p);
sigexit(td, SIGILL);
}
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#x sp=%#x", td,
tf->srr0, tf->fixreg[1]);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
int
-sigreturn(struct thread *td, struct sigreturn_args *uap)
+sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
ucontext_t uc;
int error;
CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
return (EFAULT);
}
error = set_mcontext(td, &uc.uc_mcontext);
if (error != 0)
return (error);
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]);
return (EJUSTRETURN);
}
#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{
- return sigreturn(td, (struct sigreturn_args *)uap);
+ return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_lr = tf->srr0;
pcb->pcb_sp = tf->fixreg[1];
}
/*
* get_mcontext/sendsig helper routine that doesn't touch the
* proc lock
*/
static int
grab_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
struct pcb *pcb;
pcb = td->td_pcb;
memset(mcp, 0, sizeof(mcontext_t));
mcp->mc_vers = _MC_VERSION;
mcp->mc_flags = 0;
memcpy(&mcp->mc_frame, td->td_frame, sizeof(struct trapframe));
if (flags & GET_MC_CLEAR_RET) {
mcp->mc_gpr[3] = 0;
mcp->mc_gpr[4] = 0;
}
#ifdef AIM
/*
* This assumes that floating-point context is *not* lazy,
* so if the thread has used FP there would have been a
* FP-unavailable exception that would have set things up
* correctly.
*/
if (pcb->pcb_flags & PCB_FPU) {
KASSERT(td == curthread,
("get_mcontext: fp save not curthread"));
critical_enter();
save_fpu(td);
critical_exit();
mcp->mc_flags |= _MC_FP_VALID;
memcpy(&mcp->mc_fpscr, &pcb->pcb_fpu.fpscr, sizeof(double));
memcpy(mcp->mc_fpreg, pcb->pcb_fpu.fpr, 32*sizeof(double));
}
/*
* Repeat for Altivec context
*/
if (pcb->pcb_flags & PCB_VEC) {
KASSERT(td == curthread,
("get_mcontext: fp save not curthread"));
critical_enter();
save_vec(td);
critical_exit();
mcp->mc_flags |= _MC_AV_VALID;
mcp->mc_vscr = pcb->pcb_vec.vscr;
mcp->mc_vrsave = pcb->pcb_vec.vrsave;
memcpy(mcp->mc_avec, pcb->pcb_vec.vr, sizeof(mcp->mc_avec));
}
#endif
mcp->mc_len = sizeof(*mcp);
return (0);
}
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
int error;
error = grab_mcontext(td, mcp, flags);
if (error == 0) {
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
PROC_UNLOCK(curthread->td_proc);
}
return (error);
}
int
set_mcontext(struct thread *td, const mcontext_t *mcp)
{
struct pcb *pcb;
struct trapframe *tf;
pcb = td->td_pcb;
tf = td->td_frame;
if (mcp->mc_vers != _MC_VERSION || mcp->mc_len != sizeof(*mcp))
return (EINVAL);
#ifdef AIM
/*
* Don't let the user set privileged MSR bits
*/
if ((mcp->mc_srr1 & PSL_USERSTATIC) != (tf->srr1 & PSL_USERSTATIC)) {
return (EINVAL);
}
#endif
memcpy(tf, mcp->mc_frame, sizeof(mcp->mc_frame));
#ifdef AIM
if (mcp->mc_flags & _MC_FP_VALID) {
if ((pcb->pcb_flags & PCB_FPU) != PCB_FPU) {
critical_enter();
enable_fpu(td);
critical_exit();
}
memcpy(&pcb->pcb_fpu.fpscr, &mcp->mc_fpscr, sizeof(double));
memcpy(pcb->pcb_fpu.fpr, mcp->mc_fpreg, 32*sizeof(double));
}
if (mcp->mc_flags & _MC_AV_VALID) {
if ((pcb->pcb_flags & PCB_VEC) != PCB_VEC) {
critical_enter();
enable_vec(td);
critical_exit();
}
pcb->pcb_vec.vscr = mcp->mc_vscr;
pcb->pcb_vec.vrsave = mcp->mc_vrsave;
memcpy(pcb->pcb_vec.vr, mcp->mc_avec, sizeof(mcp->mc_avec));
}
#endif
return (0);
}
/*
* Set up registers on exec.
*/
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
register_t argc;
#ifdef __powerpc64__
register_t entry_desc[3];
#endif
tf = trapframe(td);
bzero(tf, sizeof *tf);
#ifdef __powerpc64__
tf->fixreg[1] = -roundup(-stack + 48, 16);
#else
tf->fixreg[1] = -roundup(-stack + 8, 16);
#endif
/*
* Set up arguments for _start():
* _start(argc, argv, envp, obj, cleanup, ps_strings);
*
* Notes:
* - obj and cleanup are the auxiliary and termination
* vectors. They are fixed up by ld.elf_so.
* - ps_strings is a NetBSD extension, and will be
* ignored by executables which are strictly
* compliant with the SVR4 ABI.
*
* XXX We have to set both regs and retval here due to different
* XXX calling convention in trap.c and init_main.c.
*/
/* Collect argc from the user stack */
argc = fuword((void *)stack);
/*
* XXX PG: these get overwritten in the syscall return code.
* execve() should return EJUSTRETURN, like it does on NetBSD.
* Emulate by setting the syscall return value cells. The
* registers still have to be set for init's fork trampoline.
*/
td->td_retval[0] = argc;
td->td_retval[1] = stack + sizeof(register_t);
tf->fixreg[3] = argc;
tf->fixreg[4] = stack + sizeof(register_t);
tf->fixreg[5] = stack + (2 + argc)*sizeof(register_t);
tf->fixreg[6] = 0; /* auxiliary vector */
tf->fixreg[7] = 0; /* termination vector */
tf->fixreg[8] = (register_t)imgp->ps_strings; /* NetBSD extension */
#ifdef __powerpc64__
/*
* For 64-bit, we need to disentangle the function descriptor
*
* 0. entry point
* 1. TOC value (r2)
* 2. Environment pointer (r11)
*/
(void)copyin((void *)imgp->entry_addr, entry_desc, sizeof(entry_desc));
tf->srr0 = entry_desc[0] + imgp->reloc_base;
tf->fixreg[2] = entry_desc[1] + imgp->reloc_base;
tf->fixreg[11] = entry_desc[2] + imgp->reloc_base;
tf->srr1 = PSL_SF | PSL_USERSET | PSL_FE_DFLT;
if (mfmsr() & PSL_HV)
tf->srr1 |= PSL_HV;
#else
tf->srr0 = imgp->entry_addr;
tf->srr1 = PSL_USERSET | PSL_FE_DFLT;
#endif
td->td_pcb->pcb_flags = 0;
}
#ifdef COMPAT_FREEBSD32
void
ppc32_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
uint32_t argc;
tf = trapframe(td);
bzero(tf, sizeof *tf);
tf->fixreg[1] = -roundup(-stack + 8, 16);
argc = fuword32((void *)stack);
td->td_retval[0] = argc;
td->td_retval[1] = stack + sizeof(uint32_t);
tf->fixreg[3] = argc;
tf->fixreg[4] = stack + sizeof(uint32_t);
tf->fixreg[5] = stack + (2 + argc)*sizeof(uint32_t);
tf->fixreg[6] = 0; /* auxiliary vector */
tf->fixreg[7] = 0; /* termination vector */
tf->fixreg[8] = (register_t)imgp->ps_strings; /* NetBSD extension */
tf->srr0 = imgp->entry_addr;
tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
tf->srr1 &= ~PSL_SF;
if (mfmsr() & PSL_HV)
tf->srr1 |= PSL_HV;
td->td_pcb->pcb_flags = 0;
}
#endif
int
fill_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
tf = td->td_frame;
memcpy(regs, tf, sizeof(struct reg));
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
/* No debug registers on PowerPC */
return (ENOSYS);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct pcb *pcb;
pcb = td->td_pcb;
if ((pcb->pcb_flags & PCB_FPU) == 0)
memset(fpregs, 0, sizeof(struct fpreg));
else
memcpy(fpregs, &pcb->pcb_fpu, sizeof(struct fpreg));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
tf = td->td_frame;
memcpy(tf, regs, sizeof(struct reg));
return (0);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
/* No debug registers on PowerPC */
return (ENOSYS);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
#ifdef AIM
struct pcb *pcb;
pcb = td->td_pcb;
if ((pcb->pcb_flags & PCB_FPU) == 0)
enable_fpu(td);
memcpy(&pcb->pcb_fpu, fpregs, sizeof(struct fpreg));
#endif
return (0);
}
#ifdef COMPAT_FREEBSD32
int
set_regs32(struct thread *td, struct reg32 *regs)
{
struct trapframe *tf;
int i;
tf = td->td_frame;
for (i = 0; i < 32; i++)
tf->fixreg[i] = regs->fixreg[i];
tf->lr = regs->lr;
tf->cr = regs->cr;
tf->xer = regs->xer;
tf->ctr = regs->ctr;
tf->srr0 = regs->pc;
return (0);
}
int
fill_regs32(struct thread *td, struct reg32 *regs)
{
struct trapframe *tf;
int i;
tf = td->td_frame;
for (i = 0; i < 32; i++)
regs->fixreg[i] = tf->fixreg[i];
regs->lr = tf->lr;
regs->cr = tf->cr;
regs->xer = tf->xer;
regs->ctr = tf->ctr;
regs->pc = tf->srr0;
return (0);
}
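/*
 * Narrow a native 64-bit mcontext into the 32-bit compat layout.
 */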
static int
grab_mcontext32(struct thread *td, mcontext32_t *mcp, int flags)
{
mcontext_t mcp64;
int i, error;
error = grab_mcontext(td, &mcp64, flags);
if (error != 0)
return (error);
mcp->mc_vers = mcp64.mc_vers;
mcp->mc_flags = mcp64.mc_flags;
mcp->mc_onstack = mcp64.mc_onstack;
mcp->mc_len = mcp64.mc_len;
memcpy(mcp->mc_avec,mcp64.mc_avec,sizeof(mcp64.mc_avec));
memcpy(mcp->mc_av,mcp64.mc_av,sizeof(mcp64.mc_av));
for (i = 0; i < 42; i++)
mcp->mc_frame[i] = mcp64.mc_frame[i];
memcpy(mcp->mc_fpreg,mcp64.mc_fpreg,sizeof(mcp64.mc_fpreg));
return (0);
}
static int
get_mcontext32(struct thread *td, mcontext32_t *mcp, int flags)
{
int error;
error = grab_mcontext32(td, mcp, flags);
if (error == 0) {
PROC_LOCK(curthread->td_proc);
mcp->mc_onstack = sigonstack(td->td_frame->fixreg[1]);
PROC_UNLOCK(curthread->td_proc);
}
return (error);
}
static int
set_mcontext32(struct thread *td, const mcontext32_t *mcp)
{
mcontext_t mcp64;
int i, error;
mcp64.mc_vers = mcp->mc_vers;
mcp64.mc_flags = mcp->mc_flags;
mcp64.mc_onstack = mcp->mc_onstack;
mcp64.mc_len = mcp->mc_len;
memcpy(mcp64.mc_avec,mcp->mc_avec,sizeof(mcp64.mc_avec));
memcpy(mcp64.mc_av,mcp->mc_av,sizeof(mcp64.mc_av));
for (i = 0; i < 42; i++)
mcp64.mc_frame[i] = mcp->mc_frame[i];
memcpy(mcp64.mc_fpreg,mcp->mc_fpreg,sizeof(mcp64.mc_fpreg));
error = set_mcontext(td, &mcp64);
return (error);
}
#endif
#ifdef COMPAT_FREEBSD32
int
freebsd32_sigreturn(struct thread *td, struct freebsd32_sigreturn_args *uap)
{
ucontext32_t uc;
int error;
CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
return (EFAULT);
}
error = set_mcontext32(td, &uc.uc_mcontext);
if (error != 0)
return (error);
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR3(KTR_SIG, "sigreturn: return td=%p pc=%#x sp=%#x",
td, uc.uc_mcontext.mc_srr0, uc.uc_mcontext.mc_gpr[1]);
return (EJUSTRETURN);
}
/*
* The first two fields of a ucontext_t are the signal mask and the machine
* context. The next field is uc_link; we want to avoid destroying the link
* when copying out contexts.
*/
#define UC32_COPY_SIZE offsetof(ucontext32_t, uc_link)
int
freebsd32_getcontext(struct thread *td, struct freebsd32_getcontext_args *uap)
{
ucontext32_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
ret = copyout(&uc, uap->ucp, UC32_COPY_SIZE);
}
return (ret);
}
int
freebsd32_setcontext(struct thread *td, struct freebsd32_setcontext_args *uap)
{
ucontext32_t uc;
int ret;
if (uap->ucp == NULL)
ret = EINVAL;
else {
ret = copyin(uap->ucp, &uc, UC32_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext32(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK,
&uc.uc_sigmask, NULL, 0);
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
int
freebsd32_swapcontext(struct thread *td, struct freebsd32_swapcontext_args *uap)
{
ucontext32_t uc;
int ret;
if (uap->oucp == NULL || uap->ucp == NULL)
ret = EINVAL;
else {
get_mcontext32(td, &uc.uc_mcontext, GET_MC_CLEAR_RET);
PROC_LOCK(td->td_proc);
uc.uc_sigmask = td->td_sigmask;
PROC_UNLOCK(td->td_proc);
ret = copyout(&uc, uap->oucp, UC32_COPY_SIZE);
if (ret == 0) {
ret = copyin(uap->ucp, &uc, UC32_COPY_SIZE);
if (ret == 0) {
ret = set_mcontext32(td, &uc.uc_mcontext);
if (ret == 0) {
kern_sigprocmask(td, SIG_SETMASK,
&uc.uc_sigmask, NULL, 0);
}
}
}
}
return (ret == 0 ? EJUSTRETURN : ret);
}
#endif
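/*
 * Copy the syscall return value, or the translated error, into the
 * trapframe registers following the PowerPC syscall convention.
 */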
void
cpu_set_syscall_retval(struct thread *td, int error)
{
struct proc *p;
struct trapframe *tf;
int fixup;
if (error == EJUSTRETURN)
return;
p = td->td_proc;
tf = td->td_frame;
if (tf->fixreg[0] == SYS___syscall &&
(SV_PROC_FLAG(p, SV_ILP32))) {
int code = tf->fixreg[FIRSTARG + 1];
if (p->p_sysent->sv_mask)
code &= p->p_sysent->sv_mask;
fixup = (code != SYS_freebsd6_lseek && code != SYS_lseek) ?
1 : 0;
} else
fixup = 0;
switch (error) {
case 0:
if (fixup) {
/*
* 64-bit return, 32-bit syscall. Fixup byte order
*/
tf->fixreg[FIRSTARG] = 0;
tf->fixreg[FIRSTARG + 1] = td->td_retval[0];
} else {
tf->fixreg[FIRSTARG] = td->td_retval[0];
tf->fixreg[FIRSTARG + 1] = td->td_retval[1];
}
tf->cr &= ~0x10000000; /* Unset summary overflow */
break;
case ERESTART:
/*
* Set user's pc back to redo the system call.
*/
tf->srr0 -= 4;
break;
default:
if (p->p_sysent->sv_errsize) {
error = (error < p->p_sysent->sv_errsize) ?
p->p_sysent->sv_errtbl[error] : -1;
}
tf->fixreg[FIRSTARG] = error;
tf->cr |= 0x10000000; /* Set summary overflow */
break;
}
}
/*
* Threading functions
*/
void
cpu_thread_exit(struct thread *td)
{
}
void
cpu_thread_clean(struct thread *td)
{
}
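/*
 * Carve the pcb and the initial trapframe out of the top of the
 * thread's kernel stack.
 */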
void
cpu_thread_alloc(struct thread *td)
{
struct pcb *pcb;
pcb = (struct pcb *)((td->td_kstack + td->td_kstack_pages * PAGE_SIZE -
sizeof(struct pcb)) & ~0x2fUL);
td->td_pcb = pcb;
td->td_frame = (struct trapframe *)pcb - 1;
}
void
cpu_thread_free(struct thread *td)
{
}
int
cpu_set_user_tls(struct thread *td, void *tls_base)
{
if (SV_PROC_FLAG(td->td_proc, SV_LP64))
td->td_frame->fixreg[13] = (register_t)tls_base + 0x7010;
else
td->td_frame->fixreg[2] = (register_t)tls_base + 0x7008;
return (0);
}
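/*
 * Initialize a new thread's pcb and trapframe from td0 so that it
 * enters user mode through fork_trampoline()/fork_return().
 */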
void
cpu_set_upcall(struct thread *td, struct thread *td0)
{
struct pcb *pcb2;
struct trapframe *tf;
struct callframe *cf;
pcb2 = td->td_pcb;
/* Copy the upcall pcb */
bcopy(td0->td_pcb, pcb2, sizeof(*pcb2));
/* Create a stack for the new thread */
tf = td->td_frame;
bcopy(td0->td_frame, tf, sizeof(struct trapframe));
tf->fixreg[FIRSTARG] = 0;
tf->fixreg[FIRSTARG + 1] = 0;
tf->cr &= ~0x10000000;
/* Set registers for trampoline to user mode. */
cf = (struct callframe *)tf - 1;
memset(cf, 0, sizeof(struct callframe));
cf->cf_func = (register_t)fork_return;
cf->cf_arg0 = (register_t)td;
cf->cf_arg1 = (register_t)tf;
pcb2->pcb_sp = (register_t)cf;
#ifdef __powerpc64__
pcb2->pcb_lr = ((register_t *)fork_trampoline)[0];
pcb2->pcb_toc = ((register_t *)fork_trampoline)[1];
#else
pcb2->pcb_lr = (register_t)fork_trampoline;
#endif
pcb2->pcb_cpu.aim.usr_vsid = 0;
/* Setup to release spin count in fork_exit(). */
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_msr = PSL_KERNSET;
}
void
cpu_set_upcall_kse(struct thread *td, void (*entry)(void *), void *arg,
stack_t *stack)
{
struct trapframe *tf;
uintptr_t sp;
tf = td->td_frame;
/* align stack and alloc space for frame ptr and saved LR */
#ifdef __powerpc64__
sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 48) &
~0x1f;
#else
sp = ((uintptr_t)stack->ss_sp + stack->ss_size - 8) &
~0x1f;
#endif
bzero(tf, sizeof(struct trapframe));
tf->fixreg[1] = (register_t)sp;
tf->fixreg[3] = (register_t)arg;
if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
tf->srr0 = (register_t)entry;
#ifdef AIM
tf->srr1 = PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
#ifdef __powerpc64__
tf->srr1 &= ~PSL_SF;
#endif
#else
tf->srr1 = PSL_USERSET;
#endif
} else {
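/*
 * On powerpc64 the ELF entry point is a function descriptor;
 * fetch the real entry address, TOC pointer and environment
 * pointer from userspace.
 */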
#ifdef __powerpc64__
register_t entry_desc[3];
(void)copyin((void *)entry, entry_desc, sizeof(entry_desc));
tf->srr0 = entry_desc[0];
tf->fixreg[2] = entry_desc[1];
tf->fixreg[11] = entry_desc[2];
tf->srr1 = PSL_SF | PSL_MBO | PSL_USERSET | PSL_FE_DFLT;
#endif
}
#ifdef __powerpc64__
if (mfmsr() & PSL_HV)
tf->srr1 |= PSL_HV;
#endif
td->td_pcb->pcb_flags = 0;
td->td_retval[0] = (register_t)entry;
td->td_retval[1] = 0;
}
Index: head/sys/security/audit/audit_syscalls.c
===================================================================
--- head/sys/security/audit/audit_syscalls.c (revision 225616)
+++ head/sys/security/audit/audit_syscalls.c (revision 225617)
@@ -1,877 +1,877 @@
/*-
* Copyright (c) 1999-2009 Apple Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of Apple Inc. ("Apple") nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
* IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/jail.h>
#include <bsm/audit.h>
#include <bsm/audit_kevents.h>
#include <security/audit/audit.h>
#include <security/audit/audit_private.h>
#include <security/mac/mac_framework.h>
#ifdef AUDIT
/*
* System call to allow a user space application to submit a BSM audit record
* to the kernel for inclusion in the audit log. This function does little
* verification on the audit record that is submitted.
*
* XXXAUDIT: Audit preselection for user records does not currently work,
* since we pre-select only based on the AUE_audit event type, not the event
* type submitted as part of the user audit data.
*/
/* ARGSUSED */
int
-audit(struct thread *td, struct audit_args *uap)
+sys_audit(struct thread *td, struct audit_args *uap)
{
int error;
void * rec;
struct kaudit_record *ar;
if (jailed(td->td_ucred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_SUBMIT);
if (error)
return (error);
if ((uap->length <= 0) || (uap->length > audit_qctrl.aq_bufsz))
return (EINVAL);
ar = currecord();
/*
* If there's no current audit record (audit() itself not audited)
* commit the user audit record.
*/
if (ar == NULL) {
/*
* This is not very efficient; we're required to allocate a
* complete kernel audit record just so the user record can
* tag along.
*
* XXXAUDIT: Maybe AUE_AUDIT in the system call context and
* special pre-select handling?
*/
td->td_ar = audit_new(AUE_NULL, td);
if (td->td_ar == NULL)
return (ENOTSUP);
td->td_pflags |= TDP_AUDITREC;
ar = td->td_ar;
}
if (uap->length > MAX_AUDIT_RECORD_SIZE)
return (EINVAL);
rec = malloc(uap->length, M_AUDITDATA, M_WAITOK);
error = copyin(uap->record, rec, uap->length);
if (error)
goto free_out;
/* Verify the record. */
if (bsm_rec_verify(rec) == 0) {
error = EINVAL;
goto free_out;
}
#ifdef MAC
error = mac_system_check_audit(td->td_ucred, rec, uap->length);
if (error)
goto free_out;
#endif
/*
* Attach the user audit record to the kernel audit record. Because
* this system call is an auditable event, we will write the user
* record along with the record for this audit event.
*
* XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen,
* k_ar_commit & AR_COMMIT_USER?
*/
ar->k_udata = rec;
ar->k_ulen = uap->length;
ar->k_ar_commit |= AR_COMMIT_USER;
/*
* Currently we assume that all preselection has been performed in
* userspace. We unconditionally set these masks so that the records
* get committed both to the trail and pipe. In the future we will
* want to set up kernel-based preselection.
*/
ar->k_ar_commit |= (AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE);
return (0);
free_out:
/*
* audit_syscall_exit() will free the audit record on the thread even
* if we allocated it above.
*/
free(rec, M_AUDITDATA);
return (error);
}
/*
* System call to manipulate auditing.
*/
/* ARGSUSED */
int
-auditon(struct thread *td, struct auditon_args *uap)
+sys_auditon(struct thread *td, struct auditon_args *uap)
{
struct ucred *cred, *newcred, *oldcred;
int error;
union auditon_udata udata;
struct proc *tp;
if (jailed(td->td_ucred))
return (ENOSYS);
AUDIT_ARG_CMD(uap->cmd);
#ifdef MAC
error = mac_system_check_auditon(td->td_ucred, uap->cmd);
if (error)
return (error);
#endif
error = priv_check(td, PRIV_AUDIT_CONTROL);
if (error)
return (error);
if ((uap->length <= 0) || (uap->length > sizeof(union auditon_udata)))
return (EINVAL);
memset((void *)&udata, 0, sizeof(udata));
/*
* Some of the GET commands use the arguments too.
*/
switch (uap->cmd) {
case A_SETPOLICY:
case A_OLDSETPOLICY:
case A_SETKMASK:
case A_SETQCTRL:
case A_OLDSETQCTRL:
case A_SETSTAT:
case A_SETUMASK:
case A_SETSMASK:
case A_SETCOND:
case A_OLDSETCOND:
case A_SETCLASS:
case A_SETPMASK:
case A_SETFSIZE:
case A_SETKAUDIT:
case A_GETCLASS:
case A_GETPINFO:
case A_GETPINFO_ADDR:
case A_SENDTRIGGER:
error = copyin(uap->data, (void *)&udata, uap->length);
if (error)
return (error);
AUDIT_ARG_AUDITON(&udata);
break;
}
/*
* XXXAUDIT: Locking?
*/
switch (uap->cmd) {
case A_OLDGETPOLICY:
case A_GETPOLICY:
if (uap->length == sizeof(udata.au_policy64)) {
if (!audit_fail_stop)
udata.au_policy64 |= AUDIT_CNT;
if (audit_panic_on_write_fail)
udata.au_policy64 |= AUDIT_AHLT;
if (audit_argv)
udata.au_policy64 |= AUDIT_ARGV;
if (audit_arge)
udata.au_policy64 |= AUDIT_ARGE;
break;
}
if (uap->length != sizeof(udata.au_policy))
return (EINVAL);
if (!audit_fail_stop)
udata.au_policy |= AUDIT_CNT;
if (audit_panic_on_write_fail)
udata.au_policy |= AUDIT_AHLT;
if (audit_argv)
udata.au_policy |= AUDIT_ARGV;
if (audit_arge)
udata.au_policy |= AUDIT_ARGE;
break;
case A_OLDSETPOLICY:
case A_SETPOLICY:
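/*
 * The argument length selects between the 64-bit and the legacy
 * 32-bit form of the policy flags.
 */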
if (uap->length == sizeof(udata.au_policy64)) {
if (udata.au_policy64 & ~(AUDIT_CNT|AUDIT_AHLT|
AUDIT_ARGV|AUDIT_ARGE))
return (EINVAL);
audit_fail_stop = ((udata.au_policy64 & AUDIT_CNT) ==
0);
audit_panic_on_write_fail = (udata.au_policy64 &
AUDIT_AHLT);
audit_argv = (udata.au_policy64 & AUDIT_ARGV);
audit_arge = (udata.au_policy64 & AUDIT_ARGE);
break;
}
if (uap->length != sizeof(udata.au_policy))
return (EINVAL);
if (udata.au_policy & ~(AUDIT_CNT|AUDIT_AHLT|AUDIT_ARGV|
AUDIT_ARGE))
return (EINVAL);
/*
* XXX - Need to wake up waiters if the policy relaxes?
*/
audit_fail_stop = ((udata.au_policy & AUDIT_CNT) == 0);
audit_panic_on_write_fail = (udata.au_policy & AUDIT_AHLT);
audit_argv = (udata.au_policy & AUDIT_ARGV);
audit_arge = (udata.au_policy & AUDIT_ARGE);
break;
case A_GETKMASK:
if (uap->length != sizeof(udata.au_mask))
return (EINVAL);
udata.au_mask = audit_nae_mask;
break;
case A_SETKMASK:
if (uap->length != sizeof(udata.au_mask))
return (EINVAL);
audit_nae_mask = udata.au_mask;
break;
case A_OLDGETQCTRL:
case A_GETQCTRL:
if (uap->length == sizeof(udata.au_qctrl64)) {
udata.au_qctrl64.aq64_hiwater =
(u_int64_t)audit_qctrl.aq_hiwater;
udata.au_qctrl64.aq64_lowater =
(u_int64_t)audit_qctrl.aq_lowater;
udata.au_qctrl64.aq64_bufsz =
(u_int64_t)audit_qctrl.aq_bufsz;
udata.au_qctrl64.aq64_minfree =
(u_int64_t)audit_qctrl.aq_minfree;
break;
}
if (uap->length != sizeof(udata.au_qctrl))
return (EINVAL);
udata.au_qctrl = audit_qctrl;
break;
case A_OLDSETQCTRL:
case A_SETQCTRL:
if (uap->length == sizeof(udata.au_qctrl64)) {
if ((udata.au_qctrl64.aq64_hiwater > AQ_MAXHIGH) ||
(udata.au_qctrl64.aq64_lowater >=
udata.au_qctrl64.aq64_hiwater) ||
(udata.au_qctrl64.aq64_bufsz > AQ_MAXBUFSZ) ||
(udata.au_qctrl64.aq64_minfree < 0) ||
(udata.au_qctrl64.aq64_minfree > 100))
return (EINVAL);
audit_qctrl.aq_hiwater =
(int)udata.au_qctrl64.aq64_hiwater;
audit_qctrl.aq_lowater =
(int)udata.au_qctrl64.aq64_lowater;
audit_qctrl.aq_bufsz =
(int)udata.au_qctrl64.aq64_bufsz;
audit_qctrl.aq_minfree =
(int)udata.au_qctrl64.aq64_minfree;
audit_qctrl.aq_delay = -1; /* Not used. */
break;
}
if (uap->length != sizeof(udata.au_qctrl))
return (EINVAL);
if ((udata.au_qctrl.aq_hiwater > AQ_MAXHIGH) ||
(udata.au_qctrl.aq_lowater >= udata.au_qctrl.aq_hiwater) ||
(udata.au_qctrl.aq_bufsz > AQ_MAXBUFSZ) ||
(udata.au_qctrl.aq_minfree < 0) ||
(udata.au_qctrl.aq_minfree > 100))
return (EINVAL);
audit_qctrl = udata.au_qctrl;
/* XXX The queue delay value isn't used with the kernel. */
audit_qctrl.aq_delay = -1;
break;
case A_GETCWD:
return (ENOSYS);
break;
case A_GETCAR:
return (ENOSYS);
break;
case A_GETSTAT:
return (ENOSYS);
break;
case A_SETSTAT:
return (ENOSYS);
break;
case A_SETUMASK:
return (ENOSYS);
break;
case A_SETSMASK:
return (ENOSYS);
break;
case A_OLDGETCOND:
case A_GETCOND:
if (uap->length == sizeof(udata.au_cond64)) {
if (audit_enabled && !audit_suspended)
udata.au_cond64 = AUC_AUDITING;
else
udata.au_cond64 = AUC_NOAUDIT;
break;
}
if (uap->length != sizeof(udata.au_cond))
return (EINVAL);
if (audit_enabled && !audit_suspended)
udata.au_cond = AUC_AUDITING;
else
udata.au_cond = AUC_NOAUDIT;
break;
case A_OLDSETCOND:
case A_SETCOND:
if (uap->length == sizeof(udata.au_cond64)) {
if (udata.au_cond64 == AUC_NOAUDIT)
audit_suspended = 1;
if (udata.au_cond64 == AUC_AUDITING)
audit_suspended = 0;
if (udata.au_cond64 == AUC_DISABLED) {
audit_suspended = 1;
audit_shutdown(NULL, 0);
}
break;
}
if (uap->length != sizeof(udata.au_cond))
return (EINVAL);
if (udata.au_cond == AUC_NOAUDIT)
audit_suspended = 1;
if (udata.au_cond == AUC_AUDITING)
audit_suspended = 0;
if (udata.au_cond == AUC_DISABLED) {
audit_suspended = 1;
audit_shutdown(NULL, 0);
}
break;
case A_GETCLASS:
if (uap->length != sizeof(udata.au_evclass))
return (EINVAL);
udata.au_evclass.ec_class = au_event_class(
udata.au_evclass.ec_number);
break;
case A_SETCLASS:
if (uap->length != sizeof(udata.au_evclass))
return (EINVAL);
au_evclassmap_insert(udata.au_evclass.ec_number,
udata.au_evclass.ec_class);
break;
case A_GETPINFO:
if (uap->length != sizeof(udata.au_aupinfo))
return (EINVAL);
if (udata.au_aupinfo.ap_pid < 1)
return (ESRCH);
if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL)
return (ESRCH);
if ((error = p_cansee(td, tp)) != 0) {
PROC_UNLOCK(tp);
return (error);
}
cred = tp->p_ucred;
if (cred->cr_audit.ai_termid.at_type == AU_IPv6) {
PROC_UNLOCK(tp);
return (EINVAL);
}
udata.au_aupinfo.ap_auid = cred->cr_audit.ai_auid;
udata.au_aupinfo.ap_mask.am_success =
cred->cr_audit.ai_mask.am_success;
udata.au_aupinfo.ap_mask.am_failure =
cred->cr_audit.ai_mask.am_failure;
udata.au_aupinfo.ap_termid.machine =
cred->cr_audit.ai_termid.at_addr[0];
udata.au_aupinfo.ap_termid.port =
(dev_t)cred->cr_audit.ai_termid.at_port;
udata.au_aupinfo.ap_asid = cred->cr_audit.ai_asid;
PROC_UNLOCK(tp);
break;
case A_SETPMASK:
if (uap->length != sizeof(udata.au_aupinfo))
return (EINVAL);
if (udata.au_aupinfo.ap_pid < 1)
return (ESRCH);
newcred = crget();
if ((tp = pfind(udata.au_aupinfo.ap_pid)) == NULL) {
crfree(newcred);
return (ESRCH);
}
if ((error = p_cansee(td, tp)) != 0) {
PROC_UNLOCK(tp);
crfree(newcred);
return (error);
}
oldcred = tp->p_ucred;
crcopy(newcred, oldcred);
newcred->cr_audit.ai_mask.am_success =
udata.au_aupinfo.ap_mask.am_success;
newcred->cr_audit.ai_mask.am_failure =
udata.au_aupinfo.ap_mask.am_failure;
tp->p_ucred = newcred;
PROC_UNLOCK(tp);
crfree(oldcred);
break;
case A_SETFSIZE:
if (uap->length != sizeof(udata.au_fstat))
return (EINVAL);
if ((udata.au_fstat.af_filesz != 0) &&
(udata.au_fstat.af_filesz < MIN_AUDIT_FILE_SIZE))
return (EINVAL);
audit_fstat.af_filesz = udata.au_fstat.af_filesz;
break;
case A_GETFSIZE:
if (uap->length != sizeof(udata.au_fstat))
return (EINVAL);
udata.au_fstat.af_filesz = audit_fstat.af_filesz;
udata.au_fstat.af_currsz = audit_fstat.af_currsz;
break;
case A_GETPINFO_ADDR:
if (uap->length != sizeof(udata.au_aupinfo_addr))
return (EINVAL);
if (udata.au_aupinfo_addr.ap_pid < 1)
return (ESRCH);
if ((tp = pfind(udata.au_aupinfo_addr.ap_pid)) == NULL)
return (ESRCH);
cred = tp->p_ucred;
udata.au_aupinfo_addr.ap_auid = cred->cr_audit.ai_auid;
udata.au_aupinfo_addr.ap_mask.am_success =
cred->cr_audit.ai_mask.am_success;
udata.au_aupinfo_addr.ap_mask.am_failure =
cred->cr_audit.ai_mask.am_failure;
udata.au_aupinfo_addr.ap_termid = cred->cr_audit.ai_termid;
udata.au_aupinfo_addr.ap_asid = cred->cr_audit.ai_asid;
PROC_UNLOCK(tp);
break;
case A_GETKAUDIT:
if (uap->length != sizeof(udata.au_kau_info))
return (EINVAL);
audit_get_kinfo(&udata.au_kau_info);
break;
case A_SETKAUDIT:
if (uap->length != sizeof(udata.au_kau_info))
return (EINVAL);
if (udata.au_kau_info.ai_termid.at_type != AU_IPv4 &&
udata.au_kau_info.ai_termid.at_type != AU_IPv6)
return (EINVAL);
audit_set_kinfo(&udata.au_kau_info);
break;
case A_SENDTRIGGER:
if (uap->length != sizeof(udata.au_trigger))
return (EINVAL);
if ((udata.au_trigger < AUDIT_TRIGGER_MIN) ||
(udata.au_trigger > AUDIT_TRIGGER_MAX))
return (EINVAL);
return (audit_send_trigger(udata.au_trigger));
default:
return (EINVAL);
}
/*
* Copy data back to userspace for the GET commands.
*/
switch (uap->cmd) {
case A_GETPOLICY:
case A_OLDGETPOLICY:
case A_GETKMASK:
case A_GETQCTRL:
case A_OLDGETQCTRL:
case A_GETCWD:
case A_GETCAR:
case A_GETSTAT:
case A_GETCOND:
case A_OLDGETCOND:
case A_GETCLASS:
case A_GETPINFO:
case A_GETFSIZE:
case A_GETPINFO_ADDR:
case A_GETKAUDIT:
error = copyout((void *)&udata, uap->data, uap->length);
if (error)
return (error);
break;
}
return (0);
}
/*
* System calls to manage the user audit information.
*/
/* ARGSUSED */
int
-getauid(struct thread *td, struct getauid_args *uap)
+sys_getauid(struct thread *td, struct getauid_args *uap)
{
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_GETAUDIT);
if (error)
return (error);
return (copyout(&td->td_ucred->cr_audit.ai_auid, uap->auid,
sizeof(td->td_ucred->cr_audit.ai_auid)));
}
/* ARGSUSED */
int
-setauid(struct thread *td, struct setauid_args *uap)
+sys_setauid(struct thread *td, struct setauid_args *uap)
{
struct ucred *newcred, *oldcred;
au_id_t id;
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = copyin(uap->auid, &id, sizeof(id));
if (error)
return (error);
audit_arg_auid(id);
newcred = crget();
PROC_LOCK(td->td_proc);
oldcred = td->td_proc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
error = mac_cred_check_setauid(oldcred, id);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0);
if (error)
goto fail;
newcred->cr_audit.ai_auid = id;
td->td_proc->p_ucred = newcred;
PROC_UNLOCK(td->td_proc);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(td->td_proc);
crfree(newcred);
return (error);
}
/*
* System calls to get and set process audit information.
*/
/* ARGSUSED */
int
-getaudit(struct thread *td, struct getaudit_args *uap)
+sys_getaudit(struct thread *td, struct getaudit_args *uap)
{
struct auditinfo ai;
struct ucred *cred;
int error;
cred = td->td_ucred;
if (jailed(cred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_GETAUDIT);
if (error)
return (error);
if (cred->cr_audit.ai_termid.at_type == AU_IPv6)
return (E2BIG);
bzero(&ai, sizeof(ai));
ai.ai_auid = cred->cr_audit.ai_auid;
ai.ai_mask = cred->cr_audit.ai_mask;
ai.ai_asid = cred->cr_audit.ai_asid;
ai.ai_termid.machine = cred->cr_audit.ai_termid.at_addr[0];
ai.ai_termid.port = cred->cr_audit.ai_termid.at_port;
return (copyout(&ai, uap->auditinfo, sizeof(ai)));
}
/* ARGSUSED */
int
-setaudit(struct thread *td, struct setaudit_args *uap)
+sys_setaudit(struct thread *td, struct setaudit_args *uap)
{
struct ucred *newcred, *oldcred;
struct auditinfo ai;
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = copyin(uap->auditinfo, &ai, sizeof(ai));
if (error)
return (error);
audit_arg_auditinfo(&ai);
newcred = crget();
PROC_LOCK(td->td_proc);
oldcred = td->td_proc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
error = mac_cred_check_setaudit(oldcred, &ai);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0);
if (error)
goto fail;
bzero(&newcred->cr_audit, sizeof(newcred->cr_audit));
newcred->cr_audit.ai_auid = ai.ai_auid;
newcred->cr_audit.ai_mask = ai.ai_mask;
newcred->cr_audit.ai_asid = ai.ai_asid;
newcred->cr_audit.ai_termid.at_addr[0] = ai.ai_termid.machine;
newcred->cr_audit.ai_termid.at_port = ai.ai_termid.port;
newcred->cr_audit.ai_termid.at_type = AU_IPv4;
td->td_proc->p_ucred = newcred;
PROC_UNLOCK(td->td_proc);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(td->td_proc);
crfree(newcred);
return (error);
}
/* ARGSUSED */
int
-getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
+sys_getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
{
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
if (uap->length < sizeof(*uap->auditinfo_addr))
return (EOVERFLOW);
error = priv_check(td, PRIV_AUDIT_GETAUDIT);
if (error)
return (error);
return (copyout(&td->td_ucred->cr_audit, uap->auditinfo_addr,
sizeof(*uap->auditinfo_addr)));
}
/* ARGSUSED */
int
-setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
+sys_setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
{
struct ucred *newcred, *oldcred;
struct auditinfo_addr aia;
int error;
if (jailed(td->td_ucred))
return (ENOSYS);
error = copyin(uap->auditinfo_addr, &aia, sizeof(aia));
if (error)
return (error);
audit_arg_auditinfo_addr(&aia);
if (aia.ai_termid.at_type != AU_IPv6 &&
aia.ai_termid.at_type != AU_IPv4)
return (EINVAL);
newcred = crget();
PROC_LOCK(td->td_proc);
oldcred = td->td_proc->p_ucred;
crcopy(newcred, oldcred);
#ifdef MAC
error = mac_cred_check_setaudit_addr(oldcred, &aia);
if (error)
goto fail;
#endif
error = priv_check_cred(oldcred, PRIV_AUDIT_SETAUDIT, 0);
if (error)
goto fail;
newcred->cr_audit = aia;
td->td_proc->p_ucred = newcred;
PROC_UNLOCK(td->td_proc);
crfree(oldcred);
return (0);
fail:
PROC_UNLOCK(td->td_proc);
crfree(newcred);
return (error);
}
/*
* Syscall to manage audit files.
*/
/* ARGSUSED */
int
-auditctl(struct thread *td, struct auditctl_args *uap)
+sys_auditctl(struct thread *td, struct auditctl_args *uap)
{
struct nameidata nd;
struct ucred *cred;
struct vnode *vp;
int error = 0;
int flags, vfslocked;
if (jailed(td->td_ucred))
return (ENOSYS);
error = priv_check(td, PRIV_AUDIT_CONTROL);
if (error)
return (error);
vp = NULL;
cred = NULL;
/*
* If a path is specified, open the replacement vnode, perform
* validity checks, and grab another reference to the current
* credential.
*
* On Darwin, a NULL path argument is also used to disable audit.
*/
if (uap->path == NULL)
return (EINVAL);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
flags = AUDIT_OPEN_FLAGS;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
#ifdef MAC
error = mac_system_check_auditctl(td->td_ucred, vp);
VOP_UNLOCK(vp, 0);
if (error) {
vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#else
VOP_UNLOCK(vp, 0);
#endif
NDFREE(&nd, NDF_ONLY_PNBUF);
if (vp->v_type != VREG) {
vn_close(vp, AUDIT_CLOSE_FLAGS, td->td_ucred, td);
VFS_UNLOCK_GIANT(vfslocked);
return (EINVAL);
}
VFS_UNLOCK_GIANT(vfslocked);
cred = td->td_ucred;
crhold(cred);
/*
* XXXAUDIT: Should audit_suspended actually be cleared by
* audit_worker?
*/
audit_suspended = 0;
audit_rotate_vnode(cred, vp);
return (error);
}
#else /* !AUDIT */
int
-audit(struct thread *td, struct audit_args *uap)
+sys_audit(struct thread *td, struct audit_args *uap)
{
return (ENOSYS);
}
int
-auditon(struct thread *td, struct auditon_args *uap)
+sys_auditon(struct thread *td, struct auditon_args *uap)
{
return (ENOSYS);
}
int
-getauid(struct thread *td, struct getauid_args *uap)
+sys_getauid(struct thread *td, struct getauid_args *uap)
{
return (ENOSYS);
}
int
-setauid(struct thread *td, struct setauid_args *uap)
+sys_setauid(struct thread *td, struct setauid_args *uap)
{
return (ENOSYS);
}
int
-getaudit(struct thread *td, struct getaudit_args *uap)
+sys_getaudit(struct thread *td, struct getaudit_args *uap)
{
return (ENOSYS);
}
int
-setaudit(struct thread *td, struct setaudit_args *uap)
+sys_setaudit(struct thread *td, struct setaudit_args *uap)
{
return (ENOSYS);
}
int
-getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
+sys_getaudit_addr(struct thread *td, struct getaudit_addr_args *uap)
{
return (ENOSYS);
}
int
-setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
+sys_setaudit_addr(struct thread *td, struct setaudit_addr_args *uap)
{
return (ENOSYS);
}
int
-auditctl(struct thread *td, struct auditctl_args *uap)
+sys_auditctl(struct thread *td, struct auditctl_args *uap)
{
return (ENOSYS);
}
#endif /* AUDIT */
Index: head/sys/security/mac/mac_syscalls.c
===================================================================
--- head/sys/security/mac/mac_syscalls.c (revision 225616)
+++ head/sys/security/mac/mac_syscalls.c (revision 225617)
@@ -1,731 +1,731 @@
/*-
* Copyright (c) 1999-2002, 2006, 2009 Robert N. M. Watson
* Copyright (c) 2001 Ilmar S. Habibulin
* Copyright (c) 2001-2005 Networks Associates Technology, Inc.
* Copyright (c) 2005-2006 SPARTA, Inc.
* Copyright (c) 2008 Apple Inc.
* All rights reserved.
*
* This software was developed by Robert Watson and Ilmar Habibulin for the
* TrustedBSD Project.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* This software was enhanced by SPARTA ISSO under SPAWAR contract
* N66001-04-C-6019 ("SEFOS").
*
* This software was developed at the University of Cambridge Computer
* Laboratory with support from a grant from Google, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_mac.h"
#include <sys/param.h>
#include <sys/capability.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/mac.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/sysent.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/pipe.h>
#include <sys/socketvar.h>
#include <security/mac/mac_framework.h>
#include <security/mac/mac_internal.h>
#include <security/mac/mac_policy.h>
#ifdef MAC
FEATURE(security_mac, "Mandatory Access Control Framework support");
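/*
 * Fetch the MAC label of another process: copy in the label element
 * request, look up and visibility-check the target process, then
 * externalize the requested label elements of its credential back to
 * the caller.
 */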
int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
+sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
char *elements, *buffer;
struct mac mac;
struct proc *tproc;
struct ucred *tcred;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
tproc = pfind(uap->pid);
if (tproc == NULL)
return (ESRCH);
tcred = NULL; /* Satisfy gcc. */
error = p_cansee(td, tproc);
if (error == 0)
tcred = crhold(tproc->p_ucred);
PROC_UNLOCK(tproc);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(tcred->cr_label, elements,
buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
crfree(tcred);
return (error);
}
int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
+sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
char *elements, *buffer;
struct mac mac;
int error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = mac_cred_externalize_label(td->td_ucred->cr_label,
elements, buffer, mac.m_buflen);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
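/*
 * Relabel the calling process: internalize the user-supplied label,
 * ask the policies whether the relabel is permitted, then install a
 * new credential and revoke now-inaccessible memory mappings.
 */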
int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
+sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
struct ucred *newcred, *oldcred;
struct label *intlabel;
struct proc *p;
struct mac mac;
char *buffer;
int error;
if (!(mac_labeled & MPC_OBJECT_CRED))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_cred_label_alloc();
error = mac_cred_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
newcred = crget();
p = td->td_proc;
PROC_LOCK(p);
oldcred = p->p_ucred;
error = mac_cred_check_relabel(oldcred, intlabel);
if (error) {
PROC_UNLOCK(p);
crfree(newcred);
goto out;
}
setsugid(p);
crcopy(newcred, oldcred);
mac_cred_relabel(newcred, intlabel);
p->p_ucred = newcred;
PROC_UNLOCK(p);
crfree(oldcred);
mac_proc_vm_revoke(td);
out:
mac_cred_label_free(intlabel);
return (error);
}
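/*
 * Fetch the MAC label of an object referenced by a file descriptor.
 * Vnodes, pipes and sockets are supported; the object's label is
 * copied under the appropriate lock and then externalized without
 * that lock held.
 */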
int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
+sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
char *elements, *buffer;
struct label *intlabel;
struct file *fp;
struct mac mac;
struct vnode *vp;
struct pipe *pipe;
struct socket *so;
short label_type;
int vfslocked, error;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
error = fget(td, uap->fd, CAP_MAC_GET, &fp);
if (error)
goto out;
label_type = fp->f_type;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
vp = fp->f_vnode;
intlabel = mac_vnode_label_alloc();
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mac_vnode_copy_label(vp->v_label, intlabel);
VOP_UNLOCK(vp, 0);
VFS_UNLOCK_GIANT(vfslocked);
error = mac_vnode_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE))
return (EINVAL);
pipe = fp->f_data;
intlabel = mac_pipe_label_alloc();
PIPE_LOCK(pipe);
mac_pipe_copy_label(pipe->pipe_pair->pp_label, intlabel);
PIPE_UNLOCK(pipe);
error = mac_pipe_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET))
return (EINVAL);
so = fp->f_data;
intlabel = mac_socket_label_alloc(M_WAITOK);
SOCK_LOCK(so);
mac_socket_copy_label(so->so_label, intlabel);
SOCK_UNLOCK(so);
error = mac_socket_externalize_label(intlabel, elements,
buffer, mac.m_buflen);
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
fdrop(fp, td);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
+sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
vfslocked = NDHASGIANT(&nd);
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
+sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
char *elements, *buffer;
struct nameidata nd;
struct label *intlabel;
struct mac mac;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
if (error) {
free(elements, M_MACTEMP);
return (error);
}
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
if (error)
goto out;
intlabel = mac_vnode_label_alloc();
vfslocked = NDHASGIANT(&nd);
mac_vnode_copy_label(nd.ni_vp->v_label, intlabel);
error = mac_vnode_externalize_label(intlabel, elements, buffer,
mac.m_buflen);
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
if (error == 0)
error = copyout(buffer, mac.m_string, strlen(buffer)+1);
out:
free(buffer, M_MACTEMP);
free(elements, M_MACTEMP);
return (error);
}
int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
+sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
struct label *intlabel;
struct pipe *pipe;
struct socket *so;
struct file *fp;
struct mount *mp;
struct vnode *vp;
struct mac mac;
char *buffer;
int error, vfslocked;
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
error = fget(td, uap->fd, CAP_MAC_SET, &fp);
if (error)
goto out;
switch (fp->f_type) {
case DTYPE_FIFO:
case DTYPE_VNODE:
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
if (error) {
mac_vnode_label_free(intlabel);
break;
}
vp = fp->f_vnode;
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error != 0) {
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
break;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = vn_setlabel(vp, intlabel, td->td_ucred);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
VFS_UNLOCK_GIANT(vfslocked);
mac_vnode_label_free(intlabel);
break;
case DTYPE_PIPE:
if (!(mac_labeled & MPC_OBJECT_PIPE))
return (EINVAL);
intlabel = mac_pipe_label_alloc();
error = mac_pipe_internalize_label(intlabel, buffer);
if (error == 0) {
pipe = fp->f_data;
PIPE_LOCK(pipe);
error = mac_pipe_label_set(td->td_ucred,
pipe->pipe_pair, intlabel);
PIPE_UNLOCK(pipe);
}
mac_pipe_label_free(intlabel);
break;
case DTYPE_SOCKET:
if (!(mac_labeled & MPC_OBJECT_SOCKET))
return (EINVAL);
intlabel = mac_socket_label_alloc(M_WAITOK);
error = mac_socket_internalize_label(intlabel, buffer);
if (error == 0) {
so = fp->f_data;
error = mac_socket_label_set(td->td_ucred, so,
intlabel);
}
mac_socket_label_free(intlabel);
break;
default:
error = EINVAL;
}
fdrop(fp, td);
out:
free(buffer, M_MACTEMP);
return (error);
}
int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
+sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
out:
mac_vnode_label_free(intlabel);
return (error);
}
int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
+sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
struct label *intlabel;
struct nameidata nd;
struct mount *mp;
struct mac mac;
char *buffer;
int vfslocked, error;
if (!(mac_labeled & MPC_OBJECT_VNODE))
return (EINVAL);
error = copyin(uap->mac_p, &mac, sizeof(mac));
if (error)
return (error);
error = mac_check_structmac_consistent(&mac);
if (error)
return (error);
buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
if (error) {
free(buffer, M_MACTEMP);
return (error);
}
intlabel = mac_vnode_label_alloc();
error = mac_vnode_internalize_label(intlabel, buffer);
free(buffer, M_MACTEMP);
if (error)
goto out;
NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
uap->path_p, td);
error = namei(&nd);
vfslocked = NDHASGIANT(&nd);
if (error == 0) {
error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
if (error == 0) {
error = vn_setlabel(nd.ni_vp, intlabel,
td->td_ucred);
vn_finished_write(mp);
}
}
NDFREE(&nd, 0);
VFS_UNLOCK_GIANT(vfslocked);
out:
mac_vnode_label_free(intlabel);
return (error);
}
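/*
 * Multiplexed entry point for policy-specific system calls: find the
 * named policy on the static or dynamic policy list and hand the call
 * off to its mpo_syscall method.
 */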
int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
+sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
struct mac_policy_conf *mpc;
char target[MAC_MAX_POLICY_NAME];
int error;
error = copyinstr(uap->policy, target, sizeof(target), NULL);
if (error)
return (error);
error = ENOSYS;
LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
goto out;
}
}
if (!LIST_EMPTY(&mac_policy_list)) {
mac_policy_slock_sleep();
LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
if (strcmp(mpc->mpc_name, target) == 0 &&
mpc->mpc_ops->mpo_syscall != NULL) {
error = mpc->mpc_ops->mpo_syscall(td,
uap->call, uap->arg);
break;
}
}
mac_policy_sunlock_sleep();
}
out:
return (error);
}
#else /* !MAC */
int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
+sys___mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
+sys___mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
+sys___mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
+sys___mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
+sys___mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
{
return (ENOSYS);
}
int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
+sys___mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
+sys___mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
+sys___mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
{
return (ENOSYS);
}
int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
+sys___mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
{
return (ENOSYS);
}
int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
+sys_mac_syscall(struct thread *td, struct mac_syscall_args *uap)
{
return (ENOSYS);
}
#endif /* !MAC */
Index: head/sys/sparc64/sparc64/machdep.c
===================================================================
--- head/sys/sparc64/sparc64/machdep.c (revision 225616)
+++ head/sys/sparc64/sparc64/machdep.c (revision 225617)
@@ -1,1128 +1,1128 @@
/*-
* Copyright (c) 2001 Jake Burkholder.
* Copyright (c) 1992 Terrence R. Lambert.
* Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
* from: FreeBSD: src/sys/i386/i386/machdep.c,v 1.477 2001/08/27
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_ddb.h"
#include "opt_kstack_pages.h"
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <sys/cons.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/interrupt.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/signalvar.h>
#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/timetc.h>
#include <sys/ucontext.h>
#include <dev/ofw/openfirm.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_param.h>
#include <ddb/ddb.h>
#include <machine/bus.h>
#include <machine/cache.h>
#include <machine/clock.h>
#include <machine/cmt.h>
#include <machine/cpu.h>
#include <machine/fireplane.h>
#include <machine/fp.h>
#include <machine/fsr.h>
#include <machine/intr_machdep.h>
#include <machine/jbus.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/ofw_machdep.h>
#include <machine/ofw_mem.h>
#include <machine/pcb.h>
#include <machine/pmap.h>
#include <machine/pstate.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/smp.h>
#include <machine/tick.h>
#include <machine/tlb.h>
#include <machine/tstate.h>
#include <machine/upa.h>
#include <machine/ver.h>
typedef int ofw_vec_t(void *);
#ifdef DDB
extern vm_offset_t ksym_start, ksym_end;
#endif
int dtlb_slots;
int itlb_slots;
struct tlb_entry *kernel_tlbs;
int kernel_tlb_slots;
int cold = 1;
long Maxmem;
long realmem;
void *dpcpu0;
char pcpu0[PCPU_PAGES * PAGE_SIZE];
struct trapframe frame0;
vm_offset_t kstack0;
vm_paddr_t kstack0_phys;
struct kva_md_info kmi;
u_long ofw_vec;
u_long ofw_tba;
u_int tba_taken_over;
char sparc64_model[32];
static int cpu_use_vis = 1;
cpu_block_copy_t *cpu_block_copy;
cpu_block_zero_t *cpu_block_zero;
static phandle_t find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl);
void sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3,
ofw_vec_t *vec);
static void sparc64_shutdown_final(void *dummy, int howto);
static void cpu_startup(void *arg);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
CTASSERT((1 << INT_SHIFT) == sizeof(int));
CTASSERT((1 << PTR_SHIFT) == sizeof(char *));
CTASSERT(sizeof(struct reg) == 256);
CTASSERT(sizeof(struct fpreg) == 272);
CTASSERT(sizeof(struct __mcontext) == 512);
CTASSERT((sizeof(struct pcb) & (64 - 1)) == 0);
CTASSERT((offsetof(struct pcb, pcb_kfp) & (64 - 1)) == 0);
CTASSERT((offsetof(struct pcb, pcb_ufp) & (64 - 1)) == 0);
CTASSERT(sizeof(struct pcb) <= ((KSTACK_PAGES * PAGE_SIZE) / 8));
CTASSERT(sizeof(struct pcpu) <= ((PCPU_PAGES * PAGE_SIZE) / 2));
static void
cpu_startup(void *arg)
{
vm_paddr_t physsz;
int i;
physsz = 0;
for (i = 0; i < sparc64_nmemreg; i++)
physsz += sparc64_memreg[i].mr_size;
printf("real memory = %lu (%lu MB)\n", physsz,
physsz / (1024 * 1024));
realmem = (long)physsz / PAGE_SIZE;
vm_ksubmap_init(&kmi);
bufinit();
vm_pager_bufferinit();
EVENTHANDLER_REGISTER(shutdown_final, sparc64_shutdown_final, NULL,
SHUTDOWN_PRI_LAST);
printf("avail memory = %lu (%lu MB)\n", cnt.v_free_count * PAGE_SIZE,
cnt.v_free_count / ((1024 * 1024) / PAGE_SIZE));
if (bootverbose)
printf("machine: %s\n", sparc64_model);
cpu_identify(rdpr(ver), PCPU_GET(clock), curcpu);
}
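/*
 * Per-CPU initialization: thread the preallocated interrupt request
 * structures onto this CPU's free list.
 */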
void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{
struct intr_request *ir;
int i;
pcpu->pc_irtail = &pcpu->pc_irhead;
for (i = 0; i < IR_FREE; i++) {
ir = &pcpu->pc_irpool[i];
ir->ir_next = pcpu->pc_irfree;
pcpu->pc_irfree = ir;
}
}
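/*
 * Spinlock enter/exit for sparc64: the first spinlock acquired by a
 * thread raises PIL to block interrupts and records the previous
 * level; nested acquisitions only bump the per-thread count.
 */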
void
spinlock_enter(void)
{
struct thread *td;
register_t pil;
td = curthread;
if (td->td_md.md_spinlock_count == 0) {
pil = rdpr(pil);
wrpr(pil, 0, PIL_TICK);
td->td_md.md_spinlock_count = 1;
td->td_md.md_saved_pil = pil;
} else
td->td_md.md_spinlock_count++;
critical_enter();
}
void
spinlock_exit(void)
{
struct thread *td;
register_t pil;
td = curthread;
critical_exit();
pil = td->td_md.md_saved_pil;
td->td_md.md_spinlock_count--;
if (td->td_md.md_spinlock_count == 0)
wrpr(pil, pil, 0);
}
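/*
 * Recursively search the OFW device tree for the CPU node whose ID
 * property matches the boot processor's module ID.
 */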
static phandle_t
find_bsp(phandle_t node, uint32_t bspid, u_int cpu_impl)
{
char type[sizeof("cpu")];
phandle_t child;
uint32_t cpuid;
for (; node != 0; node = OF_peer(node)) {
child = OF_child(node);
if (child > 0) {
child = find_bsp(child, bspid, cpu_impl);
if (child > 0)
return (child);
} else {
if (OF_getprop(node, "device_type", type,
sizeof(type)) <= 0)
continue;
if (strcmp(type, "cpu") != 0)
continue;
if (OF_getprop(node, cpu_cpuid_prop(cpu_impl), &cpuid,
sizeof(cpuid)) <= 0)
continue;
if (cpuid == bspid)
return (node);
}
}
return (0);
}
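/*
 * Name of the OFW property that holds the CPU ID; it differs between
 * UPA, Fireplane/Safari and JBus based implementations.
 */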
const char *
cpu_cpuid_prop(u_int cpu_impl)
{
switch (cpu_impl) {
case CPU_IMPL_SPARC64:
case CPU_IMPL_SPARC64V:
case CPU_IMPL_ULTRASPARCI:
case CPU_IMPL_ULTRASPARCII:
case CPU_IMPL_ULTRASPARCIIi:
case CPU_IMPL_ULTRASPARCIIe:
return ("upa-portid");
case CPU_IMPL_ULTRASPARCIII:
case CPU_IMPL_ULTRASPARCIIIp:
case CPU_IMPL_ULTRASPARCIIIi:
case CPU_IMPL_ULTRASPARCIIIip:
return ("portid");
case CPU_IMPL_ULTRASPARCIV:
case CPU_IMPL_ULTRASPARCIVp:
return ("cpuid");
default:
return ("");
}
}
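/*
 * Read the module/agent ID of the current CPU from the bus
 * configuration register appropriate for its implementation.
 */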
uint32_t
cpu_get_mid(u_int cpu_impl)
{
switch (cpu_impl) {
case CPU_IMPL_SPARC64:
case CPU_IMPL_SPARC64V:
case CPU_IMPL_ULTRASPARCI:
case CPU_IMPL_ULTRASPARCII:
case CPU_IMPL_ULTRASPARCIIi:
case CPU_IMPL_ULTRASPARCIIe:
return (UPA_CR_GET_MID(ldxa(0, ASI_UPA_CONFIG_REG)));
case CPU_IMPL_ULTRASPARCIII:
case CPU_IMPL_ULTRASPARCIIIp:
return (FIREPLANE_CR_GET_AID(ldxa(AA_FIREPLANE_CONFIG,
ASI_FIREPLANE_CONFIG_REG)));
case CPU_IMPL_ULTRASPARCIIIi:
case CPU_IMPL_ULTRASPARCIIIip:
return (JBUS_CR_GET_JID(ldxa(0, ASI_JBUS_CONFIG_REG)));
case CPU_IMPL_ULTRASPARCIV:
case CPU_IMPL_ULTRASPARCIVp:
return (INTR_ID_GET_ID(ldxa(AA_INTR_ID, ASI_INTR_ID)));
default:
return (0);
}
}
void
sparc64_init(caddr_t mdp, u_long o1, u_long o2, u_long o3, ofw_vec_t *vec)
{
char *env;
struct pcpu *pc;
vm_offset_t end;
vm_offset_t va;
caddr_t kmdp;
phandle_t root;
u_int cpu_impl;
end = 0;
kmdp = NULL;
/*
* Find out what kind of CPU we have first, for anything that changes
* behaviour.
*/
cpu_impl = VER_IMPL(rdpr(ver));
/*
* Do CPU-specific initialization.
*/
if (cpu_impl >= CPU_IMPL_ULTRASPARCIII)
cheetah_init(cpu_impl);
else if (cpu_impl == CPU_IMPL_SPARC64V)
zeus_init(cpu_impl);
/*
* Clear (S)TICK timer (including NPT).
*/
tick_clear(cpu_impl);
/*
* UltraSparc II[e,i] based systems come up with the tick interrupt
* enabled and a handler that resets the tick counter, causing DELAY()
* to not work properly when used early in boot.
* UltraSPARC III based systems come up with the system tick interrupt
* enabled, causing an interrupt storm on startup since they are not
* handled.
*/
tick_stop(cpu_impl);
/*
* Set up Open Firmware entry points.
*/
ofw_tba = rdpr(tba);
ofw_vec = (u_long)vec;
/*
* Parse metadata if present and fetch parameters. Must be before the
* console is inited so cninit gets the right value of boothowto.
*/
if (mdp != NULL) {
preload_metadata = mdp;
kmdp = preload_search_by_type("elf kernel");
if (kmdp != NULL) {
boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
end = MD_FETCH(kmdp, MODINFOMD_KERNEND, vm_offset_t);
kernel_tlb_slots = MD_FETCH(kmdp, MODINFOMD_DTLB_SLOTS,
int);
kernel_tlbs = (void *)preload_search_info(kmdp,
MODINFO_METADATA | MODINFOMD_DTLB);
}
}
init_param1();
/*
* Initialize Open Firmware (needed for console).
*/
OF_install(OFW_STD_DIRECT, 0);
OF_init(ofw_entry);
/*
* Prime our per-CPU data page for use. Note, we are using it for
* our stack, so don't pass the real size (PAGE_SIZE) to pcpu_init
* or it'll zero it out from under us.
*/
pc = (struct pcpu *)(pcpu0 + (PCPU_PAGES * PAGE_SIZE)) - 1;
pcpu_init(pc, 0, sizeof(struct pcpu));
pc->pc_addr = (vm_offset_t)pcpu0;
pc->pc_impl = cpu_impl;
pc->pc_mid = cpu_get_mid(cpu_impl);
pc->pc_tlb_ctx = TLB_CTX_USER_MIN;
pc->pc_tlb_ctx_min = TLB_CTX_USER_MIN;
pc->pc_tlb_ctx_max = TLB_CTX_USER_MAX;
/*
* Determine the OFW node and frequency of the BSP (and ensure the
* BSP is in the device tree in the first place).
*/
root = OF_peer(0);
pc->pc_node = find_bsp(root, pc->pc_mid, cpu_impl);
if (pc->pc_node == 0)
OF_exit();
if (OF_getprop(pc->pc_node, "clock-frequency", &pc->pc_clock,
sizeof(pc->pc_clock)) <= 0)
OF_exit();
/*
* Provide a DELAY() that works before PCPU_REG is set. We can't
* set PCPU_REG without also taking over the trap table or the
* firmware will overwrite it. Unfortunately, it's way too early
* to also take over the trap table at this point.
*/
clock_boot = pc->pc_clock;
delay_func = delay_boot;
/*
* Initialize the console before printing anything.
* NB: the low-level console drivers require a working DELAY() at
* this point.
*/
cninit();
/*
* Panic if there is no metadata. Most likely the kernel was booted
* directly, instead of through loader(8).
*/
if (mdp == NULL || kmdp == NULL || end == 0 ||
kernel_tlb_slots == 0 || kernel_tlbs == NULL) {
printf("sparc64_init: missing loader metadata.\n"
"This probably means you are not using loader(8).\n");
panic("sparc64_init");
}
/*
* Work around the broken loader behavior of not demapping
* no-longer-used kernel TLB slots when unloading the kernel or
* modules.
*/
for (va = KERNBASE + (kernel_tlb_slots - 1) * PAGE_SIZE_4M;
va >= roundup2(end, PAGE_SIZE_4M); va -= PAGE_SIZE_4M) {
if (bootverbose)
printf("demapping unused kernel TLB slot "
"(va %#lx - %#lx)\n", va, va + PAGE_SIZE_4M - 1);
stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE,
ASI_DMMU_DEMAP, 0);
stxa(TLB_DEMAP_VA(va) | TLB_DEMAP_PRIMARY | TLB_DEMAP_PAGE,
ASI_IMMU_DEMAP, 0);
flush(KERNBASE);
kernel_tlb_slots--;
}
/*
* Determine the TLB slot maxima, which are expected to be
* equal across all CPUs.
* NB: for cheetah-class CPUs, these properties only refer
* to the t16s.
*/
if (OF_getprop(pc->pc_node, "#dtlb-entries", &dtlb_slots,
sizeof(dtlb_slots)) == -1)
panic("sparc64_init: cannot determine number of dTLB slots");
if (OF_getprop(pc->pc_node, "#itlb-entries", &itlb_slots,
sizeof(itlb_slots)) == -1)
panic("sparc64_init: cannot determine number of iTLB slots");
/*
* Initialize and enable the caches. Note that this may include
* applying workarounds.
*/
cache_init(pc);
cache_enable(cpu_impl);
uma_set_align(pc->pc_cache.dc_linesize - 1);
cpu_block_copy = bcopy;
cpu_block_zero = bzero;
getenv_int("machdep.use_vis", &cpu_use_vis);
if (cpu_use_vis) {
switch (cpu_impl) {
case CPU_IMPL_SPARC64:
case CPU_IMPL_ULTRASPARCI:
case CPU_IMPL_ULTRASPARCII:
case CPU_IMPL_ULTRASPARCIIi:
case CPU_IMPL_ULTRASPARCIIe:
case CPU_IMPL_ULTRASPARCIII: /* NB: we've disabled P$. */
case CPU_IMPL_ULTRASPARCIIIp:
case CPU_IMPL_ULTRASPARCIIIi:
case CPU_IMPL_ULTRASPARCIV:
case CPU_IMPL_ULTRASPARCIVp:
case CPU_IMPL_ULTRASPARCIIIip:
cpu_block_copy = spitfire_block_copy;
cpu_block_zero = spitfire_block_zero;
break;
case CPU_IMPL_SPARC64V:
cpu_block_copy = zeus_block_copy;
cpu_block_zero = zeus_block_zero;
break;
}
}
#ifdef SMP
mp_init(cpu_impl);
#endif
/*
* Initialize virtual memory and calculate physmem.
*/
pmap_bootstrap(cpu_impl);
/*
* Initialize tunables.
*/
init_param2(physmem);
env = getenv("kernelname");
if (env != NULL) {
strlcpy(kernelname, env, sizeof(kernelname));
freeenv(env);
}
/*
* Initialize the interrupt tables.
*/
intr_init1();
/*
* Initialize proc0, set kstack0, frame0, curthread and curpcb.
*/
proc_linkup0(&proc0, &thread0);
proc0.p_md.md_sigtramp = NULL;
proc0.p_md.md_utrap = NULL;
thread0.td_kstack = kstack0;
thread0.td_kstack_pages = KSTACK_PAGES;
thread0.td_pcb = (struct pcb *)
(thread0.td_kstack + KSTACK_PAGES * PAGE_SIZE) - 1;
frame0.tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_PRIV;
thread0.td_frame = &frame0;
pc->pc_curthread = &thread0;
pc->pc_curpcb = thread0.td_pcb;
/*
* Initialize global registers.
*/
cpu_setregs(pc);
/*
* Take over the trap table via the PROM. Using the PROM for this
* is necessary in order to set obp-control-relinquished to true
* within the PROM so obtaining /virtual-memory/translations doesn't
* trigger a fatal reset error or worse things further down the road.
* XXX it should be possible to use this solely instead of writing
* %tba in cpu_setregs(). Doing so causes a hang however.
*/
sun4u_set_traptable(tl0_base);
/*
* It's now safe to use the real DELAY().
*/
delay_func = delay_tick;
/*
* Initialize the dynamic per-CPU area for the BSP and the message
* buffer (after setting the trap table).
*/
dpcpu_init(dpcpu0, 0);
msgbufinit(msgbufp, msgbufsize);
/*
* Initialize mutexes.
*/
mutex_init();
/*
* Finish the interrupt initialization now that mutexes work and
* enable them.
*/
intr_init2();
wrpr(pil, 0, 0);
wrpr(pstate, 0, PSTATE_KERNEL);
/*
* Finish pmap initialization now that we're ready for mutexes.
*/
PMAP_LOCK_INIT(kernel_pmap);
OF_getprop(root, "name", sparc64_model, sizeof(sparc64_model) - 1);
kdb_init();
#ifdef KDB
if (boothowto & RB_KDB)
kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
struct trapframe *tf;
struct sigframe *sfp;
struct sigacts *psp;
struct sigframe sf;
struct thread *td;
struct frame *fp;
struct proc *p;
u_long sp;
int oonstack;
int sig;
oonstack = 0;
td = curthread;
p = td->td_proc;
PROC_LOCK_ASSERT(p, MA_OWNED);
sig = ksi->ksi_signo;
psp = p->p_sigacts;
mtx_assert(&psp->ps_mtx, MA_OWNED);
tf = td->td_frame;
sp = tf->tf_sp + SPOFF;
oonstack = sigonstack(sp);
CTR4(KTR_SIG, "sendsig: td=%p (%s) catcher=%p sig=%d", td, p->p_comm,
catcher, sig);
/* Make sure we have a signal trampoline to return to. */
if (p->p_md.md_sigtramp == NULL) {
/*
* No signal trampoline... kill the process.
*/
CTR0(KTR_SIG, "sendsig: no sigtramp");
printf("sendsig: %s is too old, rebuild it\n", p->p_comm);
sigexit(td, sig);
/* NOTREACHED */
}
/* Save user context. */
bzero(&sf, sizeof(sf));
get_mcontext(td, &sf.sf_uc.uc_mcontext, 0);
sf.sf_uc.uc_sigmask = *mask;
sf.sf_uc.uc_stack = td->td_sigstk;
sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) ?
((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
/* Allocate and validate space for the signal handler context. */
if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
SIGISMEMBER(psp->ps_sigonstack, sig)) {
sfp = (struct sigframe *)(td->td_sigstk.ss_sp +
td->td_sigstk.ss_size - sizeof(struct sigframe));
} else
sfp = (struct sigframe *)sp - 1;
mtx_unlock(&psp->ps_mtx);
PROC_UNLOCK(p);
fp = (struct frame *)sfp - 1;
/* Translate the signal if appropriate. */
if (p->p_sysent->sv_sigtbl && sig <= p->p_sysent->sv_sigsize)
sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
/* Build the argument list for the signal handler. */
tf->tf_out[0] = sig;
tf->tf_out[2] = (register_t)&sfp->sf_uc;
tf->tf_out[4] = (register_t)catcher;
if (SIGISMEMBER(psp->ps_siginfo, sig)) {
/* Signal handler installed with SA_SIGINFO. */
tf->tf_out[1] = (register_t)&sfp->sf_si;
/* Fill in POSIX parts. */
sf.sf_si = ksi->ksi_info;
sf.sf_si.si_signo = sig; /* maybe a translated signal */
} else {
/* Old FreeBSD-style arguments. */
tf->tf_out[1] = ksi->ksi_code;
tf->tf_out[3] = (register_t)ksi->ksi_addr;
}
/* Copy the sigframe out to the user's stack. */
if (rwindow_save(td) != 0 || copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
suword(&fp->fr_in[6], tf->tf_out[6]) != 0) {
/*
* Something is wrong with the stack pointer; kill the process.
*/
CTR2(KTR_SIG, "sendsig: sigexit td=%p sfp=%p", td, sfp);
PROC_LOCK(p);
sigexit(td, SIGILL);
/* NOTREACHED */
}
tf->tf_tpc = (u_long)p->p_md.md_sigtramp;
tf->tf_tnpc = tf->tf_tpc + 4;
tf->tf_sp = (u_long)fp - SPOFF;
CTR3(KTR_SIG, "sendsig: return td=%p pc=%#lx sp=%#lx", td, tf->tf_tpc,
tf->tf_sp);
PROC_LOCK(p);
mtx_lock(&psp->ps_mtx);
}
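/*
 * For reference (illustrative sketch, not part of this revision): with the
 * %o registers set up as above, the userland trampoline at md_sigtramp is
 * entered with tf_out[0..4] in %o0-%o4 and invokes the handler roughly as
 *
 *	(*catcher)(sig, siginfo_or_code, &sfp->sf_uc);
 *
 * where the second argument is &sfp->sf_si for SA_SIGINFO handlers and the
 * plain signal code otherwise, and %o4 carries the catcher address for the
 * trampoline itself.
 */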
#ifndef _SYS_SYSPROTO_H_
struct sigreturn_args {
ucontext_t *ucp;
};
#endif
/*
* MPSAFE
*/
int
-sigreturn(struct thread *td, struct sigreturn_args *uap)
+sys_sigreturn(struct thread *td, struct sigreturn_args *uap)
{
struct proc *p;
mcontext_t *mc;
ucontext_t uc;
int error;
p = td->td_proc;
if (rwindow_save(td)) {
PROC_LOCK(p);
sigexit(td, SIGILL);
}
CTR2(KTR_SIG, "sigreturn: td=%p ucp=%p", td, uap->sigcntxp);
if (copyin(uap->sigcntxp, &uc, sizeof(uc)) != 0) {
CTR1(KTR_SIG, "sigreturn: efault td=%p", td);
return (EFAULT);
}
mc = &uc.uc_mcontext;
error = set_mcontext(td, mc);
if (error != 0)
return (error);
kern_sigprocmask(td, SIG_SETMASK, &uc.uc_sigmask, NULL, 0);
CTR4(KTR_SIG, "sigreturn: return td=%p pc=%#lx sp=%#lx tstate=%#lx",
td, mc->mc_tpc, mc->mc_sp, mc->mc_tstate);
return (EJUSTRETURN);
}
/*
* Construct a PCB from a trapframe. This is called from kdb_trap() where
* we want to start a backtrace from the function that caused us to enter
* the debugger. We have the context in the trapframe, but base the trace
* on the PCB. The PCB doesn't have to be perfect, as long as it contains
* enough for a backtrace.
*/
void
makectx(struct trapframe *tf, struct pcb *pcb)
{
pcb->pcb_pc = tf->tf_tpc;
pcb->pcb_sp = tf->tf_sp;
}
int
get_mcontext(struct thread *td, mcontext_t *mc, int flags)
{
struct trapframe *tf;
struct pcb *pcb;
tf = td->td_frame;
pcb = td->td_pcb;
/*
* Copy the registers which will be restored by tl0_ret() from the
* trapframe.
* Note that we skip %g7, which is used as the userland TLS register,
* and %wstate.
*/
mc->mc_flags = _MC_VERSION;
mc->mc_global[1] = tf->tf_global[1];
mc->mc_global[2] = tf->tf_global[2];
mc->mc_global[3] = tf->tf_global[3];
mc->mc_global[4] = tf->tf_global[4];
mc->mc_global[5] = tf->tf_global[5];
mc->mc_global[6] = tf->tf_global[6];
if (flags & GET_MC_CLEAR_RET) {
mc->mc_out[0] = 0;
mc->mc_out[1] = 0;
} else {
mc->mc_out[0] = tf->tf_out[0];
mc->mc_out[1] = tf->tf_out[1];
}
mc->mc_out[2] = tf->tf_out[2];
mc->mc_out[3] = tf->tf_out[3];
mc->mc_out[4] = tf->tf_out[4];
mc->mc_out[5] = tf->tf_out[5];
mc->mc_out[6] = tf->tf_out[6];
mc->mc_out[7] = tf->tf_out[7];
mc->mc_fprs = tf->tf_fprs;
mc->mc_fsr = tf->tf_fsr;
mc->mc_gsr = tf->tf_gsr;
mc->mc_tnpc = tf->tf_tnpc;
mc->mc_tpc = tf->tf_tpc;
mc->mc_tstate = tf->tf_tstate;
mc->mc_y = tf->tf_y;
critical_enter();
if ((tf->tf_fprs & FPRS_FEF) != 0) {
savefpctx(pcb->pcb_ufp);
tf->tf_fprs &= ~FPRS_FEF;
pcb->pcb_flags |= PCB_FEF;
}
if ((pcb->pcb_flags & PCB_FEF) != 0) {
bcopy(pcb->pcb_ufp, mc->mc_fp, sizeof(mc->mc_fp));
mc->mc_fprs |= FPRS_FEF;
}
critical_exit();
return (0);
}
int
set_mcontext(struct thread *td, const mcontext_t *mc)
{
struct trapframe *tf;
struct pcb *pcb;
if (!TSTATE_SECURE(mc->mc_tstate) ||
(mc->mc_flags & ((1L << _MC_VERSION_BITS) - 1)) != _MC_VERSION)
return (EINVAL);
tf = td->td_frame;
pcb = td->td_pcb;
/* Make sure the windows are spilled first. */
flushw();
/*
* Copy the registers which will be restored by tl0_ret() to the
* trapframe.
* Note that we skip %g7, which is used as the userland TLS register,
* and %wstate.
*/
tf->tf_global[1] = mc->mc_global[1];
tf->tf_global[2] = mc->mc_global[2];
tf->tf_global[3] = mc->mc_global[3];
tf->tf_global[4] = mc->mc_global[4];
tf->tf_global[5] = mc->mc_global[5];
tf->tf_global[6] = mc->mc_global[6];
tf->tf_out[0] = mc->mc_out[0];
tf->tf_out[1] = mc->mc_out[1];
tf->tf_out[2] = mc->mc_out[2];
tf->tf_out[3] = mc->mc_out[3];
tf->tf_out[4] = mc->mc_out[4];
tf->tf_out[5] = mc->mc_out[5];
tf->tf_out[6] = mc->mc_out[6];
tf->tf_out[7] = mc->mc_out[7];
tf->tf_fprs = mc->mc_fprs;
tf->tf_fsr = mc->mc_fsr;
tf->tf_gsr = mc->mc_gsr;
tf->tf_tnpc = mc->mc_tnpc;
tf->tf_tpc = mc->mc_tpc;
tf->tf_tstate = mc->mc_tstate;
tf->tf_y = mc->mc_y;
if ((mc->mc_fprs & FPRS_FEF) != 0) {
tf->tf_fprs = 0;
bcopy(mc->mc_fp, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
pcb->pcb_flags |= PCB_FEF;
}
return (0);
}
/*
* Exit the kernel and execute a firmware call that will not return, as
* specified by the arguments.
*/
void
cpu_shutdown(void *args)
{
#ifdef SMP
cpu_mp_shutdown();
#endif
ofw_exit(args);
}
/*
* Flush the D-cache for non-DMA I/O so that the I-cache can
* be made coherent later.
*/
void
cpu_flush_dcache(void *ptr, size_t len)
{
/* TBD */
}
/* Get current clock frequency for the given CPU ID. */
int
cpu_est_clockrate(int cpu_id, uint64_t *rate)
{
struct pcpu *pc;
pc = pcpu_find(cpu_id);
if (pc == NULL || rate == NULL)
return (EINVAL);
*rate = pc->pc_clock;
return (0);
}
/*
* Duplicate OF_exit() with a different firmware call function that restores
* the trap table; otherwise a RED state exception is triggered in at least
* some firmware versions.
*/
void
cpu_halt(void)
{
static struct {
cell_t name;
cell_t nargs;
cell_t nreturns;
} args = {
(cell_t)"exit",
0,
0
};
cpu_shutdown(&args);
}
static void
sparc64_shutdown_final(void *dummy, int howto)
{
static struct {
cell_t name;
cell_t nargs;
cell_t nreturns;
} args = {
(cell_t)"SUNW,power-off",
0,
0
};
/* Turn the power off? */
if ((howto & RB_POWEROFF) != 0)
cpu_shutdown(&args);
/* In case of halt, return to the firmware. */
if ((howto & RB_HALT) != 0)
cpu_halt();
}
void
cpu_idle(int busy)
{
/* Insert code to halt (until next interrupt) for the idle loop. */
}
int
cpu_idle_wakeup(int cpu)
{
return (1);
}
int
ptrace_set_pc(struct thread *td, u_long addr)
{
td->td_frame->tf_tpc = addr;
td->td_frame->tf_tnpc = addr + 4;
return (0);
}
int
ptrace_single_step(struct thread *td)
{
/* TODO. */
return (0);
}
int
ptrace_clear_single_step(struct thread *td)
{
/* TODO. */
return (0);
}
void
exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
{
struct trapframe *tf;
struct pcb *pcb;
struct proc *p;
u_long sp;
/* XXX no cpu_exec */
p = td->td_proc;
p->p_md.md_sigtramp = NULL;
if (p->p_md.md_utrap != NULL) {
utrap_free(p->p_md.md_utrap);
p->p_md.md_utrap = NULL;
}
pcb = td->td_pcb;
tf = td->td_frame;
sp = rounddown(stack, 16);
bzero(pcb, sizeof(*pcb));
bzero(tf, sizeof(*tf));
tf->tf_out[0] = stack;
tf->tf_out[3] = p->p_sysent->sv_psstrings;
tf->tf_out[6] = sp - SPOFF - sizeof(struct frame);
tf->tf_tnpc = imgp->entry_addr + 4;
tf->tf_tpc = imgp->entry_addr;
tf->tf_tstate = TSTATE_IE | TSTATE_PEF | TSTATE_MM_TSO;
td->td_retval[0] = tf->tf_out[0];
td->td_retval[1] = tf->tf_out[1];
}
int
fill_regs(struct thread *td, struct reg *regs)
{
bcopy(td->td_frame, regs, sizeof(*regs));
return (0);
}
int
set_regs(struct thread *td, struct reg *regs)
{
struct trapframe *tf;
if (!TSTATE_SECURE(regs->r_tstate))
return (EINVAL);
tf = td->td_frame;
regs->r_wstate = tf->tf_wstate;
bcopy(regs, tf, sizeof(*regs));
return (0);
}
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
return (ENOSYS);
}
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *tf;
struct pcb *pcb;
pcb = td->td_pcb;
tf = td->td_frame;
bcopy(pcb->pcb_ufp, fpregs->fr_regs, sizeof(fpregs->fr_regs));
fpregs->fr_fsr = tf->tf_fsr;
fpregs->fr_gsr = tf->tf_gsr;
return (0);
}
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
struct trapframe *tf;
struct pcb *pcb;
pcb = td->td_pcb;
tf = td->td_frame;
tf->tf_fprs &= ~FPRS_FEF;
bcopy(fpregs->fr_regs, pcb->pcb_ufp, sizeof(pcb->pcb_ufp));
tf->tf_fsr = fpregs->fr_fsr;
tf->tf_gsr = fpregs->fr_gsr;
return (0);
}
struct md_utrap *
utrap_alloc(void)
{
struct md_utrap *ut;
ut = malloc(sizeof(struct md_utrap), M_SUBPROC, M_WAITOK | M_ZERO);
ut->ut_refcnt = 1;
return (ut);
}
void
utrap_free(struct md_utrap *ut)
{
int refcnt;
if (ut == NULL)
return;
mtx_pool_lock(mtxpool_sleep, ut);
ut->ut_refcnt--;
refcnt = ut->ut_refcnt;
mtx_pool_unlock(mtxpool_sleep, ut);
if (refcnt == 0)
free(ut, M_SUBPROC);
}
struct md_utrap *
utrap_hold(struct md_utrap *ut)
{
if (ut == NULL)
return (NULL);
mtx_pool_lock(mtxpool_sleep, ut);
ut->ut_refcnt++;
mtx_pool_unlock(mtxpool_sleep, ut);
return (ut);
}
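/*
 * A minimal sketch of the intended reference counting (illustrative only,
 * assuming the usual fork path): a new process that shares its parent's
 * user trap table takes an extra reference and drops it on exec or exit,
 * e.g.
 *
 *	p2->p_md.md_utrap = utrap_hold(p1->p_md.md_utrap);
 *	...
 *	utrap_free(p2->p_md.md_utrap);
 *	p2->p_md.md_utrap = NULL;
 *
 * exec_setregs() above performs exactly this release when a new image is
 * activated.
 */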
Index: head/sys/sys/posix4.h
===================================================================
--- head/sys/sys/posix4.h (revision 225616)
+++ head/sys/sys/posix4.h (revision 225617)
@@ -1,117 +1,117 @@
#ifndef _P1003_1B_P1003_1B_H_
#define _P1003_1B_P1003_1B_H_
/*-
* Copyright (c) 1996, 1997, 1998
* HD Associates, Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by HD Associates, Inc
* 4. Neither the name of the author nor the names of any co-contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <sys/param.h>
#include <sys/ioccom.h>
#include <sys/malloc.h>
#include <sys/sched.h>
/* Generate syscall stubs for when something is optionally
* loadable as a module. References "syscall_not_present".
* XXX Good candidate for sys/syscall.h
*/
struct proc;
struct nosys_args;
extern int syscall_not_present(struct thread *, const char *, struct nosys_args *);
#define SYSCALL_NOT_PRESENT_GEN(SC) \
-int SC (struct thread *td, struct SC##_args *uap) \
+int sys_ ## SC (struct thread *td, struct SC##_args *uap) \
{ \
return syscall_not_present(td, #SC , (struct nosys_args *)uap); \
}
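/*
 * As an illustration of the sys_ prefix introduced above (expansion sketch,
 * not part of this header): SYSCALL_NOT_PRESENT_GEN(sched_yield) now emits
 *
 *	int sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
 *	{
 *		return syscall_not_present(td, "sched_yield",
 *		    (struct nosys_args *)uap);
 *	}
 *
 * so the stub matches the sys_-prefixed names the syscall tables expect.
 */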
MALLOC_DECLARE(M_P31B);
#define p31b_malloc(SIZE) malloc((SIZE), M_P31B, M_WAITOK)
#define p31b_free(P) free((P), M_P31B)
int p31b_proc(struct proc *, pid_t, struct proc **);
void p31b_setcfg(int, int);
int p31b_getcfg(int);
int p31b_iscfg(int);
void p31b_unsetcfg(int);
#ifdef _KPOSIX_PRIORITY_SCHEDULING
/*
* KSCHED_OP_RW is a vector of read/write flags for each entry indexed
* by the enum ksched_op.
*
* 1 means you need write access, 0 means read is sufficient.
*/
enum ksched_op {
#define KSCHED_OP_RW { 1, 0, 1, 0, 0, 0, 0, 0 }
SCHED_SETPARAM,
SCHED_GETPARAM,
SCHED_SETSCHEDULER,
SCHED_GETSCHEDULER,
SCHED_YIELD,
SCHED_GET_PRIORITY_MAX,
SCHED_GET_PRIORITY_MIN,
SCHED_RR_GET_INTERVAL,
SCHED_OP_MAX
};
struct ksched;
int ksched_attach(struct ksched **);
int ksched_detach(struct ksched *);
int ksched_setparam(struct ksched *,
struct thread *, const struct sched_param *);
int ksched_getparam(struct ksched *,
struct thread *, struct sched_param *);
int ksched_setscheduler(struct ksched *,
struct thread *, int, const struct sched_param *);
int ksched_getscheduler(struct ksched *, struct thread *, int *);
int ksched_yield(struct ksched *);
int ksched_get_priority_max(struct ksched *, int, int *);
int ksched_get_priority_min(struct ksched *, int, int *);
int ksched_rr_get_interval(struct ksched *,
struct thread *, struct timespec *);
#endif /* _KPOSIX_PRIORITY_SCHEDULING */
#endif /* _P1003_1B_P1003_1B_H_ */
Index: head/sys/sys/signalvar.h
===================================================================
--- head/sys/sys/signalvar.h (revision 225616)
+++ head/sys/sys/signalvar.h (revision 225617)
@@ -1,367 +1,367 @@
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)signalvar.h 8.6 (Berkeley) 2/19/95
* $FreeBSD$
*/
#ifndef _SYS_SIGNALVAR_H_
#define _SYS_SIGNALVAR_H_
#include <sys/queue.h>
#include <sys/_lock.h>
#include <sys/_mutex.h>
#include <sys/signal.h>
/*
* Kernel signal definitions and data structures.
*/
/*
* Logical process signal actions and state, needed only within the process.
* The mapping between sigacts and proc structures is 1:1 except for rfork()
* processes masquerading as threads which use one structure for the whole
* group. All members are locked by the included mutex. The reference count
* and mutex must be last for the bcopy in sigacts_copy() to work.
*/
struct sigacts {
sig_t ps_sigact[_SIG_MAXSIG]; /* Disposition of signals. */
sigset_t ps_catchmask[_SIG_MAXSIG]; /* Signals to be blocked. */
sigset_t ps_sigonstack; /* Signals to take on sigstack. */
sigset_t ps_sigintr; /* Signals that interrupt syscalls. */
sigset_t ps_sigreset; /* Signals that reset when caught. */
sigset_t ps_signodefer; /* Signals not masked while handled. */
sigset_t ps_siginfo; /* Signals that want SA_SIGINFO args. */
sigset_t ps_sigignore; /* Signals being ignored. */
sigset_t ps_sigcatch; /* Signals being caught by user. */
sigset_t ps_freebsd4; /* Signals using freebsd4 ucontext. */
sigset_t ps_osigset; /* Signals using <= 3.x osigset_t. */
sigset_t ps_usertramp; /* SunOS compat; libc sigtramp. XXX */
int ps_flag;
int ps_refcnt;
struct mtx ps_mtx;
};
#define PS_NOCLDWAIT 0x0001 /* No zombies if child dies */
#define PS_NOCLDSTOP 0x0002 /* No SIGCHLD when children stop. */
#define PS_CLDSIGIGN 0x0004 /* The SIGCHLD handler is SIG_IGN. */
#ifdef _KERNEL
#ifdef COMPAT_43
typedef struct {
struct osigcontext si_sc;
int si_signo;
int si_code;
union sigval si_value;
} osiginfo_t;
struct osigaction {
union {
void (*__sa_handler)(int);
void (*__sa_sigaction)(int, osiginfo_t *, void *);
} __sigaction_u; /* signal handler */
osigset_t sa_mask; /* signal mask to apply */
int sa_flags; /* see signal options below */
};
typedef void __osiginfohandler_t(int, osiginfo_t *, void *);
#endif /* COMPAT_43 */
/* additional signal action values, used only temporarily/internally */
#define SIG_CATCH ((__sighandler_t *)2)
/* #define SIG_HOLD ((__sighandler_t *)3) See signal.h */
/*
* get signal action for process and signal; currently only for current process
*/
#define SIGACTION(p, sig) (p->p_sigacts->ps_sigact[_SIG_IDX(sig)])
#endif /* _KERNEL */
/*
* sigset_t manipulation macros.
*/
#define SIGADDSET(set, signo) \
((set).__bits[_SIG_WORD(signo)] |= _SIG_BIT(signo))
#define SIGDELSET(set, signo) \
((set).__bits[_SIG_WORD(signo)] &= ~_SIG_BIT(signo))
#define SIGEMPTYSET(set) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set).__bits[__i] = 0; \
} while (0)
#define SIGFILLSET(set) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set).__bits[__i] = ~0U; \
} while (0)
#define SIGISMEMBER(set, signo) \
((set).__bits[_SIG_WORD(signo)] & _SIG_BIT(signo))
#define SIGISEMPTY(set) (__sigisempty(&(set)))
#define SIGNOTEMPTY(set) (!__sigisempty(&(set)))
#define SIGSETEQ(set1, set2) (__sigseteq(&(set1), &(set2)))
#define SIGSETNEQ(set1, set2) (!__sigseteq(&(set1), &(set2)))
#define SIGSETOR(set1, set2) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set1).__bits[__i] |= (set2).__bits[__i]; \
} while (0)
#define SIGSETAND(set1, set2) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set1).__bits[__i] &= (set2).__bits[__i]; \
} while (0)
#define SIGSETNAND(set1, set2) \
do { \
int __i; \
for (__i = 0; __i < _SIG_WORDS; __i++) \
(set1).__bits[__i] &= ~(set2).__bits[__i]; \
} while (0)
#define SIGSETLO(set1, set2) ((set1).__bits[0] = (set2).__bits[0])
#define SIGSETOLD(set, oset) ((set).__bits[0] = (oset))
#define SIG_CANTMASK(set) \
SIGDELSET(set, SIGKILL), SIGDELSET(set, SIGSTOP)
#define SIG_STOPSIGMASK(set) \
SIGDELSET(set, SIGSTOP), SIGDELSET(set, SIGTSTP), \
SIGDELSET(set, SIGTTIN), SIGDELSET(set, SIGTTOU)
#define SIG_CONTSIGMASK(set) \
SIGDELSET(set, SIGCONT)
#define sigcantmask (sigmask(SIGKILL) | sigmask(SIGSTOP))
#define SIG2OSIG(sig, osig) (osig = (sig).__bits[0])
#define OSIG2SIG(osig, sig) SIGEMPTYSET(sig); (sig).__bits[0] = osig
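/*
 * A minimal usage sketch of the set macros above (illustrative only):
 *
 *	sigset_t set;
 *
 *	SIGEMPTYSET(set);
 *	SIGADDSET(set, SIGUSR1);
 *	if (SIGISMEMBER(set, SIGUSR1))
 *		...;
 *	SIG_CANTMASK(set);
 *
 * SIG_CANTMASK() always strips SIGKILL and SIGSTOP, mirroring the rule that
 * those two signals can never be blocked.
 */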
static __inline int
__sigisempty(sigset_t *set)
{
int i;
for (i = 0; i < _SIG_WORDS; i++) {
if (set->__bits[i])
return (0);
}
return (1);
}
static __inline int
__sigseteq(sigset_t *set1, sigset_t *set2)
{
int i;
for (i = 0; i < _SIG_WORDS; i++) {
if (set1->__bits[i] != set2->__bits[i])
return (0);
}
return (1);
}
struct osigevent {
int sigev_notify; /* Notification type */
union {
int __sigev_signo; /* Signal number */
int __sigev_notify_kqueue;
} __sigev_u;
union sigval sigev_value; /* Signal value */
};
typedef struct ksiginfo {
TAILQ_ENTRY(ksiginfo) ksi_link;
siginfo_t ksi_info;
int ksi_flags;
struct sigqueue *ksi_sigq;
} ksiginfo_t;
#define ksi_signo ksi_info.si_signo
#define ksi_errno ksi_info.si_errno
#define ksi_code ksi_info.si_code
#define ksi_pid ksi_info.si_pid
#define ksi_uid ksi_info.si_uid
#define ksi_status ksi_info.si_status
#define ksi_addr ksi_info.si_addr
#define ksi_value ksi_info.si_value
#define ksi_band ksi_info.si_band
#define ksi_trapno ksi_info.si_trapno
#define ksi_overrun ksi_info.si_overrun
#define ksi_timerid ksi_info.si_timerid
#define ksi_mqd ksi_info.si_mqd
/* bits for ksi_flags */
#define KSI_TRAP 0x01 /* Generated by trap. */
#define KSI_EXT 0x02 /* Externally managed ksi. */
#define KSI_INS 0x04 /* Directly insert ksi, not the copy */
#define KSI_SIGQ 0x08 /* Generated by sigqueue, might return EAGAIN. */
#define KSI_HEAD 0x10 /* Insert into head, not tail. */
#define KSI_COPYMASK (KSI_TRAP|KSI_SIGQ)
#define KSI_ONQ(ksi) ((ksi)->ksi_sigq != NULL)
typedef struct sigqueue {
sigset_t sq_signals; /* All pending signals. */
sigset_t sq_kill; /* Legacy depth 1 queue. */
TAILQ_HEAD(, ksiginfo) sq_list;/* Queued signal info. */
struct proc *sq_proc;
int sq_flags;
} sigqueue_t;
/* Flags for sq_flags */
#define SQ_INIT 0x01
#ifdef _KERNEL
/* Return nonzero if thread td has an unmasked pending signal. */
#define SIGPENDING(td) \
((!SIGISEMPTY((td)->td_siglist) && \
!sigsetmasked(&(td)->td_siglist, &(td)->td_sigmask)) || \
(!SIGISEMPTY((td)->td_proc->p_siglist) && \
!sigsetmasked(&(td)->td_proc->p_siglist, &(td)->td_sigmask)))
/*
* Return the value of the pseudo-expression ((*set & ~*mask) == 0). This
* is an optimized version of SIGISEMPTY() on a temporary variable
* containing SIGSETNAND(*set, *mask).
*/
static __inline int
sigsetmasked(sigset_t *set, sigset_t *mask)
{
int i;
for (i = 0; i < _SIG_WORDS; i++) {
if (set->__bits[i] & ~mask->__bits[i])
return (0);
}
return (1);
}
#define ksiginfo_init(ksi) \
do { \
bzero(ksi, sizeof(ksiginfo_t)); \
} while(0)
#define ksiginfo_init_trap(ksi) \
do { \
ksiginfo_t *kp = ksi; \
bzero(kp, sizeof(ksiginfo_t)); \
kp->ksi_flags |= KSI_TRAP; \
} while(0)
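/*
 * Typical use of ksiginfo_init_trap() (sketch with hypothetical values, not
 * part of this header): a machine-dependent trap handler fills a stack
 * ksiginfo and hands it to trapsignal(), e.g.
 *
 *	ksiginfo_t ksi;
 *
 *	ksiginfo_init_trap(&ksi);
 *	ksi.ksi_signo = SIGSEGV;
 *	ksi.ksi_code = SEGV_MAPERR;
 *	ksi.ksi_addr = (void *)va;
 *	trapsignal(td, &ksi);
 *
 * The KSI_TRAP flag set by the macro marks the info as trap-generated for
 * the signal delivery code.
 */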
static __inline void
ksiginfo_copy(ksiginfo_t *src, ksiginfo_t *dst)
{
(dst)->ksi_info = src->ksi_info;
(dst)->ksi_flags = (src->ksi_flags & KSI_COPYMASK);
}
static __inline void
ksiginfo_set_sigev(ksiginfo_t *dst, struct sigevent *sigev)
{
dst->ksi_signo = sigev->sigev_signo;
dst->ksi_value = sigev->sigev_value;
}
struct pgrp;
struct proc;
struct sigio;
struct thread;
/*
* Lock the pointers for a sigio object in the underlying objects of
* a file descriptor.
*/
#define SIGIO_LOCK() mtx_lock(&sigio_lock)
#define SIGIO_TRYLOCK() mtx_trylock(&sigio_lock)
#define SIGIO_UNLOCK() mtx_unlock(&sigio_lock)
#define SIGIO_LOCKED() mtx_owned(&sigio_lock)
#define SIGIO_ASSERT(type) mtx_assert(&sigio_lock, type)
extern struct mtx sigio_lock;
/* Values for stop_allowed parameter for cursig(). */
#define SIG_STOP_ALLOWED 100
#define SIG_STOP_NOT_ALLOWED 101
/* Flags for kern_sigprocmask(). */
#define SIGPROCMASK_OLD 0x0001
#define SIGPROCMASK_PROC_LOCKED 0x0002
#define SIGPROCMASK_PS_LOCKED 0x0004
int cursig(struct thread *td, int stop_allowed);
void execsigs(struct proc *p);
void gsignal(int pgid, int sig, ksiginfo_t *ksi);
void killproc(struct proc *p, char *why);
ksiginfo_t * ksiginfo_alloc(int wait);
void ksiginfo_free(ksiginfo_t *ksi);
int pksignal(struct proc *p, int sig, ksiginfo_t *ksi);
void pgsigio(struct sigio **sigiop, int sig, int checkctty);
void pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi);
int postsig(int sig);
-void psignal(struct proc *p, int sig);
+void kern_psignal(struct proc *p, int sig);
int ptracestop(struct thread *td, int sig);
void sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *retmask);
struct sigacts *sigacts_alloc(void);
void sigacts_copy(struct sigacts *dest, struct sigacts *src);
void sigacts_free(struct sigacts *ps);
struct sigacts *sigacts_hold(struct sigacts *ps);
int sigacts_shared(struct sigacts *ps);
void sigexit(struct thread *td, int sig) __dead2;
int sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **);
int sig_ffs(sigset_t *set);
void siginit(struct proc *p);
void signotify(struct thread *td);
void sigqueue_delete(struct sigqueue *queue, int sig);
void sigqueue_delete_proc(struct proc *p, int sig);
void sigqueue_flush(struct sigqueue *queue);
void sigqueue_init(struct sigqueue *queue, struct proc *p);
void sigqueue_take(ksiginfo_t *ksi);
void tdksignal(struct thread *td, int sig, ksiginfo_t *ksi);
int tdsendsignal(struct proc *p, struct thread *td, int sig,
ksiginfo_t *ksi);
void tdsigcleanup(struct thread *td);
void tdsignal(struct thread *td, int sig);
void trapsignal(struct thread *td, ksiginfo_t *ksi);
#endif /* _KERNEL */
#endif /* !_SYS_SIGNALVAR_H_ */
Index: head/sys/sys/sysent.h
===================================================================
--- head/sys/sys/sysent.h (revision 225616)
+++ head/sys/sys/sysent.h (revision 225617)
@@ -1,247 +1,264 @@
/*-
* Copyright (c) 1982, 1988, 1991 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _SYS_SYSENT_H_
#define _SYS_SYSENT_H_
#include <bsm/audit.h>
struct rlimit;
struct sysent;
struct thread;
struct ksiginfo;
typedef int sy_call_t(struct thread *, void *);
/* Used by the machine dependent syscall() code. */
typedef void (*systrace_probe_func_t)(u_int32_t, int, struct sysent *, void *,
int);
/*
* Used by loaded syscalls to convert arguments to a DTrace array
* of 64-bit arguments.
*/
typedef void (*systrace_args_func_t)(int, void *, u_int64_t *, int *);
extern systrace_probe_func_t systrace_probe_func;
struct sysent { /* system call table */
int sy_narg; /* number of arguments */
sy_call_t *sy_call; /* implementing function */
au_event_t sy_auevent; /* audit event associated with syscall */
systrace_args_func_t sy_systrace_args_func;
/* optional argument conversion function. */
u_int32_t sy_entry; /* DTrace entry ID for systrace. */
u_int32_t sy_return; /* DTrace return ID for systrace. */
u_int32_t sy_flags; /* General flags for system calls. */
u_int32_t sy_thrcnt;
};
/*
* A system call is permitted in capability mode.
*/
#define SYF_CAPENABLED 0x00000001
#define SY_THR_FLAGMASK 0x7
#define SY_THR_STATIC 0x1
#define SY_THR_DRAINING 0x2
#define SY_THR_ABSENT 0x4
#define SY_THR_INCR 0x8
struct image_params;
struct __sigset;
struct syscall_args;
struct trapframe;
struct vnode;
struct sysentvec {
int sv_size; /* number of entries */
struct sysent *sv_table; /* pointer to sysent */
u_int sv_mask; /* optional mask to index */
int sv_sigsize; /* size of signal translation table */
int *sv_sigtbl; /* signal translation table */
int sv_errsize; /* size of errno translation table */
int *sv_errtbl; /* errno translation table */
int (*sv_transtrap)(int, int);
/* translate trap-to-signal mapping */
int (*sv_fixup)(register_t **, struct image_params *);
/* stack fixup function */
void (*sv_sendsig)(void (*)(int), struct ksiginfo *, struct __sigset *);
/* send signal */
char *sv_sigcode; /* start of sigtramp code */
int *sv_szsigcode; /* size of sigtramp code */
void (*sv_prepsyscall)(struct trapframe *, int *, u_int *,
caddr_t *);
char *sv_name; /* name of binary type */
int (*sv_coredump)(struct thread *, struct vnode *, off_t, int);
/* function to dump core, or NULL */
int (*sv_imgact_try)(struct image_params *);
int sv_minsigstksz; /* minimum signal stack size */
int sv_pagesize; /* pagesize */
vm_offset_t sv_minuser; /* VM_MIN_ADDRESS */
vm_offset_t sv_maxuser; /* VM_MAXUSER_ADDRESS */
vm_offset_t sv_usrstack; /* USRSTACK */
vm_offset_t sv_psstrings; /* PS_STRINGS */
int sv_stackprot; /* vm protection for stack */
register_t *(*sv_copyout_strings)(struct image_params *);
void (*sv_setregs)(struct thread *, struct image_params *,
u_long);
void (*sv_fixlimit)(struct rlimit *, int);
u_long *sv_maxssiz;
u_int sv_flags;
void (*sv_set_syscall_retval)(struct thread *, int);
int (*sv_fetch_syscall_args)(struct thread *, struct
syscall_args *);
const char **sv_syscallnames;
vm_offset_t sv_shared_page_base;
vm_offset_t sv_shared_page_len;
vm_offset_t sv_sigcode_base;
void *sv_shared_page_obj;
void (*sv_schedtail)(struct thread *);
};
#define SV_ILP32 0x000100
#define SV_LP64 0x000200
#define SV_IA32 0x004000
#define SV_AOUT 0x008000
#define SV_SHP 0x010000
#define SV_ABI_MASK 0xff
#define SV_PROC_FLAG(p, x) ((p)->p_sysent->sv_flags & (x))
#define SV_PROC_ABI(p) ((p)->p_sysent->sv_flags & SV_ABI_MASK)
#define SV_CURPROC_FLAG(x) SV_PROC_FLAG(curproc, x)
#define SV_CURPROC_ABI() SV_PROC_ABI(curproc)
/* same as ELFOSABI_XXX, to prevent header pollution */
#define SV_ABI_LINUX 3
#define SV_ABI_FREEBSD 9
#define SV_ABI_UNDEF 255
#ifdef _KERNEL
extern struct sysentvec aout_sysvec;
extern struct sysentvec elf_freebsd_sysvec;
extern struct sysentvec null_sysvec;
extern struct sysent sysent[];
extern const char *syscallnames[];
#define NO_SYSCALL (-1)
struct module;
struct syscall_module_data {
int (*chainevh)(struct module *, int, void *); /* next handler */
void *chainarg; /* arg for next event handler */
int *offset; /* offset into sysent */
struct sysent *new_sysent; /* new sysent */
struct sysent old_sysent; /* old sysent */
};
#define MAKE_SYSENT(syscallname) \
static struct sysent syscallname##_sysent = { \
(sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
+ (sy_call_t *)& sys_##syscallname, \
+ SYS_AUE_##syscallname \
+}
+
+#define MAKE_SYSENT_COMPAT(syscallname) \
+static struct sysent syscallname##_sysent = { \
+ (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
(sy_call_t *)& syscallname, \
SYS_AUE_##syscallname \
}
#define SYSCALL_MODULE(name, offset, new_sysent, evh, arg) \
static struct syscall_module_data name##_syscall_mod = { \
evh, arg, offset, new_sysent, { 0, NULL, AUE_NULL } \
}; \
\
static moduledata_t name##_mod = { \
"sys/" #name, \
syscall_module_handler, \
&name##_syscall_mod \
}; \
DECLARE_MODULE(name, name##_mod, SI_SUB_SYSCALLS, SI_ORDER_MIDDLE)
#define SYSCALL_MODULE_HELPER(syscallname) \
static int syscallname##_syscall = SYS_##syscallname; \
MAKE_SYSENT(syscallname); \
SYSCALL_MODULE(syscallname, \
& syscallname##_syscall, & syscallname##_sysent, \
NULL, NULL)
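/*
 * Illustrative use (hypothetical syscall name, not part of this header):
 * a module whose handler follows the new sys_ naming registers itself with
 *
 *	SYSCALL_MODULE_HELPER(mycall);
 *
 * which expands via MAKE_SYSENT() above and therefore resolves the handler
 * as sys_mycall. Code whose handlers keep their historical unprefixed names
 * builds the sysent with MAKE_SYSENT_COMPAT() instead.
 */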
#define SYSCALL_MODULE_PRESENT(syscallname) \
(sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmnosys && \
sysent[SYS_##syscallname].sy_call != (sy_call_t *)lkmressys)
/*
* Syscall registration helpers with resource allocation handling.
*/
struct syscall_helper_data {
struct sysent new_sysent;
struct sysent old_sysent;
int syscall_no;
int registered;
};
#define SYSCALL_INIT_HELPER(syscallname) { \
+ .new_sysent = { \
+ .sy_narg = (sizeof(struct syscallname ## _args ) \
+ / sizeof(register_t)), \
+ .sy_call = (sy_call_t *)& sys_ ## syscallname, \
+ .sy_auevent = SYS_AUE_##syscallname \
+ }, \
+ .syscall_no = SYS_##syscallname \
+}
+#define SYSCALL_INIT_HELPER_COMPAT(syscallname) { \
.new_sysent = { \
.sy_narg = (sizeof(struct syscallname ## _args ) \
/ sizeof(register_t)), \
.sy_call = (sy_call_t *)& syscallname, \
.sy_auevent = SYS_AUE_##syscallname \
}, \
.syscall_no = SYS_##syscallname \
}
#define SYSCALL_INIT_LAST { \
.syscall_no = NO_SYSCALL \
}
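/*
 * A rough sketch of how the initializers above are combined (hypothetical
 * syscalls, for illustration only):
 *
 *	static struct syscall_helper_data mymod_syscalls[] = {
 *		SYSCALL_INIT_HELPER(mynewcall),
 *		SYSCALL_INIT_HELPER_COMPAT(myoldcall),
 *		SYSCALL_INIT_LAST
 *	};
 *
 * The array is handed to syscall_helper_register() at module load time and
 * to syscall_helper_unregister() on unload; the first entry expects a
 * handler named sys_mynewcall, the second an unprefixed myoldcall.
 */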
int syscall_register(int *offset, struct sysent *new_sysent,
struct sysent *old_sysent);
int syscall_deregister(int *offset, struct sysent *old_sysent);
int syscall_module_handler(struct module *mod, int what, void *arg);
int syscall_helper_register(struct syscall_helper_data *sd);
int syscall_helper_unregister(struct syscall_helper_data *sd);
struct proc;
const char *syscallname(struct proc *p, u_int code);
/* Special purpose system call functions. */
struct nosys_args;
int lkmnosys(struct thread *, struct nosys_args *);
int lkmressys(struct thread *, struct nosys_args *);
int syscall_thread_enter(struct thread *td, struct sysent *se);
void syscall_thread_exit(struct thread *td, struct sysent *se);
int shared_page_fill(int size, int align, const char *data);
void exec_sysvec_init(void *param);
#define INIT_SYSENTVEC(name, sv) \
SYSINIT(name, SI_SUB_EXEC, SI_ORDER_ANY, \
(sysinit_cfunc_t)exec_sysvec_init, sv);
#endif /* _KERNEL */
#endif /* !_SYS_SYSENT_H_ */
Index: head/sys/vm/swap_pager.c
===================================================================
--- head/sys/vm/swap_pager.c (revision 225616)
+++ head/sys/vm/swap_pager.c (revision 225617)
@@ -1,2694 +1,2694 @@
/*-
* Copyright (c) 1998 Matthew Dillon,
* Copyright (c) 1994 John S. Dyson
* Copyright (c) 1990 University of Utah.
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* New Swap System
* Matthew Dillon
*
* Radix Bitmap 'blists'.
*
* - The new swapper uses the new radix bitmap code. This should scale
* to arbitrarily small or arbitrarily large swap spaces and an almost
* arbitrary degree of fragmentation.
*
* Features:
*
* - on the fly reallocation of swap during putpages. The new system
* does not try to keep previously allocated swap blocks for dirty
* pages.
*
* - on the fly deallocation of swap
*
* - No more garbage collection required. Unnecessarily allocated swap
* blocks only exist for dirty vm_page_t's now and these are already
* cycled (in a high-load system) by the pager. We also do on-the-fly
* removal of invalidated swap blocks when a page is destroyed
* or renamed.
*
* from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
*
* @(#)swap_pager.c 8.9 (Berkeley) 3/21/94
* @(#)vm_swap.c 8.5 (Berkeley) 2/17/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_swap.h"
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/disk.h>
#include <sys/fcntl.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/blist.h>
#include <sys/lock.h>
#include <sys/sx.h>
#include <sys/vmmeter.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_param.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>
#include <vm/uma.h>
#include <geom/geom.h>
/*
* SWB_NPAGES must be a power of 2. It may be set to 1, 2, 4, 8, 16
* or 32 pages per allocation.
* The 32-page limit is due to the radix code (kern/subr_blist.c).
*/
#ifndef MAX_PAGEOUT_CLUSTER
#define MAX_PAGEOUT_CLUSTER 16
#endif
#if !defined(SWB_NPAGES)
#define SWB_NPAGES MAX_PAGEOUT_CLUSTER
#endif
/*
* The swblock structure maps an object and a small, fixed-size range
* of page indices to disk addresses within a swap area.
* The collection of these mappings is implemented as a hash table.
* Unused disk addresses within a swap area are allocated and managed
* using a blist.
*/
#define SWCORRECT(n) (sizeof(void *) * (n) / sizeof(daddr_t))
#define SWAP_META_PAGES (SWB_NPAGES * 2)
#define SWAP_META_MASK (SWAP_META_PAGES - 1)
struct swblock {
struct swblock *swb_hnext;
vm_object_t swb_object;
vm_pindex_t swb_index;
int swb_count;
daddr_t swb_pages[SWAP_META_PAGES];
};
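/*
 * As a concrete reading of the mapping above (illustrative numbers): with
 * SWB_NPAGES at its default of 16, SWAP_META_PAGES is 32, so page index 70
 * of an object is covered by the swblock whose swb_index is 70 & ~31 == 64,
 * and its disk address is stored in swb_pages[70 & 31], i.e. swb_pages[6].
 */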
static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
static struct mtx sw_dev_mtx;
static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
static struct swdevt *swdevhd; /* Allocate from here next */
static int nswapdev; /* Number of swap devices */
int swap_pager_avail;
static int swdev_syscall_active = 0; /* serialize swap(on|off) */
static vm_ooffset_t swap_total;
SYSCTL_QUAD(_vm, OID_AUTO, swap_total, CTLFLAG_RD, &swap_total, 0,
"Total amount of available swap storage.");
static vm_ooffset_t swap_reserved;
SYSCTL_QUAD(_vm, OID_AUTO, swap_reserved, CTLFLAG_RD, &swap_reserved, 0,
"Amount of swap storage needed to back all allocated anonymous memory.");
static int overcommit = 0;
SYSCTL_INT(_vm, OID_AUTO, overcommit, CTLFLAG_RW, &overcommit, 0,
"Configure virtual memory overcommit behavior. See tuning(7) "
"for details.");
/* bits from overcommit */
#define SWAP_RESERVE_FORCE_ON (1 << 0)
#define SWAP_RESERVE_RLIMIT_ON (1 << 1)
#define SWAP_RESERVE_ALLOW_NONWIRED (1 << 2)
int
swap_reserve(vm_ooffset_t incr)
{
return (swap_reserve_by_cred(incr, curthread->td_ucred));
}
int
swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
{
vm_ooffset_t r, s;
int res, error;
static int curfail;
static struct timeval lastfail;
struct uidinfo *uip;
uip = cred->cr_ruidinfo;
if (incr & PAGE_MASK)
panic("swap_reserve: & PAGE_MASK");
#ifdef RACCT
PROC_LOCK(curproc);
error = racct_add(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
if (error != 0)
return (0);
#endif
res = 0;
mtx_lock(&sw_dev_mtx);
r = swap_reserved + incr;
if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
s = cnt.v_page_count - cnt.v_free_reserved - cnt.v_wire_count;
s *= PAGE_SIZE;
} else
s = 0;
s += swap_total;
if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
(error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) {
res = 1;
swap_reserved = r;
}
mtx_unlock(&sw_dev_mtx);
if (res) {
PROC_LOCK(curproc);
UIDINFO_VMSIZE_LOCK(uip);
if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
uip->ui_vmsize + incr > lim_cur(curproc, RLIMIT_SWAP) &&
priv_check(curthread, PRIV_VM_SWAP_NORLIMIT))
res = 0;
else
uip->ui_vmsize += incr;
UIDINFO_VMSIZE_UNLOCK(uip);
PROC_UNLOCK(curproc);
if (!res) {
mtx_lock(&sw_dev_mtx);
swap_reserved -= incr;
mtx_unlock(&sw_dev_mtx);
}
}
if (!res && ppsratecheck(&lastfail, &curfail, 1)) {
printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
curproc->p_pid, uip->ui_uid, incr);
}
#ifdef RACCT
if (!res) {
PROC_LOCK(curproc);
racct_sub(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
}
#endif
return (res);
}
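/*
 * For illustration (sketch, not part of this revision): the vm.overcommit
 * bits declared above combine, e.g. setting vm.overcommit to
 * SWAP_RESERVE_FORCE_ON | SWAP_RESERVE_RLIMIT_ON (== 3) makes the check
 * above refuse unprivileged reservations that would exceed swap_total and
 * additionally enforces the per-uid RLIMIT_SWAP limit, while
 * SWAP_RESERVE_ALLOW_NONWIRED (== 4) also counts non-wired physical memory
 * toward the permitted total.
 */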
void
swap_reserve_force(vm_ooffset_t incr)
{
struct uidinfo *uip;
mtx_lock(&sw_dev_mtx);
swap_reserved += incr;
mtx_unlock(&sw_dev_mtx);
#ifdef RACCT
PROC_LOCK(curproc);
racct_add_force(curproc, RACCT_SWAP, incr);
PROC_UNLOCK(curproc);
#endif
uip = curthread->td_ucred->cr_ruidinfo;
PROC_LOCK(curproc);
UIDINFO_VMSIZE_LOCK(uip);
uip->ui_vmsize += incr;
UIDINFO_VMSIZE_UNLOCK(uip);
PROC_UNLOCK(curproc);
}
void
swap_release(vm_ooffset_t decr)
{
struct ucred *cred;
PROC_LOCK(curproc);
cred = curthread->td_ucred;
swap_release_by_cred(decr, cred);
PROC_UNLOCK(curproc);
}
void
swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
{
struct uidinfo *uip;
uip = cred->cr_ruidinfo;
if (decr & PAGE_MASK)
panic("swap_release: & PAGE_MASK");
mtx_lock(&sw_dev_mtx);
if (swap_reserved < decr)
panic("swap_reserved < decr");
swap_reserved -= decr;
mtx_unlock(&sw_dev_mtx);
UIDINFO_VMSIZE_LOCK(uip);
if (uip->ui_vmsize < decr)
printf("negative vmsize for uid = %d\n", uip->ui_uid);
uip->ui_vmsize -= decr;
UIDINFO_VMSIZE_UNLOCK(uip);
racct_sub_cred(cred, RACCT_SWAP, decr);
}
static void swapdev_strategy(struct buf *, struct swdevt *sw);
#define SWM_FREE 0x02 /* free, period */
#define SWM_POP 0x04 /* pop out */
int swap_pager_full = 2; /* swap space exhaustion (task killing) */
static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
static int nsw_rcount; /* free read buffers */
static int nsw_wcount_sync; /* limit write buffers / synchronous */
static int nsw_wcount_async; /* limit write buffers / asynchronous */
static int nsw_wcount_async_max;/* assigned maximum */
static int nsw_cluster_max; /* maximum VOP I/O allowed */
static struct swblock **swhash;
static int swhash_mask;
static struct mtx swhash_mtx;
static int swap_async_max = 4; /* maximum in-progress async I/O's */
static struct sx sw_alloc_sx;
SYSCTL_INT(_vm, OID_AUTO, swap_async_max,
CTLFLAG_RW, &swap_async_max, 0, "Maximum running async swap ops");
/*
* "named" and "unnamed" anon region objects. Try to reduce the overhead
* of searching a named list by hashing it just a little.
*/
#define NOBJLISTS 8
#define NOBJLIST(handle) \
(&swap_pager_object_list[((int)(intptr_t)handle >> 4) & (NOBJLISTS-1)])
static struct mtx sw_alloc_mtx; /* protect list manipulation */
static struct pagerlst swap_pager_object_list[NOBJLISTS];
static uma_zone_t swap_zone;
static struct vm_object swap_zone_obj;
/*
* pagerops for OBJT_SWAP - "swap pager". Some ops are also global procedure
* calls hooked from other parts of the VM system and do not appear here.
* (see vm/swap_pager.h).
*/
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size,
vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
static void swap_pager_dealloc(vm_object_t object);
static int swap_pager_getpages(vm_object_t, vm_page_t *, int, int);
static void swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
static void swap_pager_init(void);
static void swap_pager_unswapped(vm_page_t);
static void swap_pager_swapoff(struct swdevt *sp);
struct pagerops swappagerops = {
.pgo_init = swap_pager_init, /* early system initialization of pager */
.pgo_alloc = swap_pager_alloc, /* allocate an OBJT_SWAP object */
.pgo_dealloc = swap_pager_dealloc, /* deallocate an OBJT_SWAP object */
.pgo_getpages = swap_pager_getpages, /* pagein */
.pgo_putpages = swap_pager_putpages, /* pageout */
.pgo_haspage = swap_pager_haspage, /* get backing store status for page */
.pgo_pageunswapped = swap_pager_unswapped, /* remove swap related to page */
};
/*
* dmmax is in page-sized chunks with the new swap system. It was
* dev-bsized chunks in the old. dmmax is always a power of 2.
*
* swap_*() routines are externally accessible. swp_*() routines are
* internal.
*/
static int dmmax;
static int nswap_lowat = 128; /* in pages, swap_pager_almost_full warn */
static int nswap_hiwat = 512; /* in pages, swap_pager_almost_full warn */
SYSCTL_INT(_vm, OID_AUTO, dmmax,
CTLFLAG_RD, &dmmax, 0, "Maximum size of a swap block");
static void swp_sizecheck(void);
static void swp_pager_async_iodone(struct buf *bp);
static int swapongeom(struct thread *, struct vnode *);
static int swaponvp(struct thread *, struct vnode *, u_long);
static int swapoff_one(struct swdevt *sp, struct ucred *cred);
/*
* Swap bitmap functions
*/
static void swp_pager_freeswapspace(daddr_t blk, int npages);
static daddr_t swp_pager_getswapspace(int npages);
/*
* Metadata functions
*/
static struct swblock **swp_pager_hash(vm_object_t object, vm_pindex_t index);
static void swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free(vm_object_t, vm_pindex_t, daddr_t);
static void swp_pager_meta_free_all(vm_object_t);
static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
static void
swp_pager_free_nrpage(vm_page_t m)
{
vm_page_lock(m);
if (m->wire_count == 0)
vm_page_free(m);
vm_page_unlock(m);
}
/*
* SWP_SIZECHECK() - update swap_pager_full indication
*
* update the swap_pager_almost_full indication and warn when we are
* about to run out of swap space, using lowat/hiwat hysteresis.
*
* Clear the swap_pager_full (task killing) indication when lowat is met.
*
* No restrictions on call.
* This routine may not block.
*/
static void
swp_sizecheck(void)
{
if (swap_pager_avail < nswap_lowat) {
if (swap_pager_almost_full == 0) {
printf("swap_pager: out of swap space\n");
swap_pager_almost_full = 1;
}
} else {
swap_pager_full = 0;
if (swap_pager_avail > nswap_hiwat)
swap_pager_almost_full = 0;
}
}
/*
* SWP_PAGER_HASH() - hash swap meta data
*
* This is a helper function which hashes the swapblk given
* the object and page index. It returns a pointer to a pointer
* to the swblock, or a pointer to a NULL pointer if it could not
* find a swapblk.
*/
static struct swblock **
swp_pager_hash(vm_object_t object, vm_pindex_t index)
{
struct swblock **pswap;
struct swblock *swap;
index &= ~(vm_pindex_t)SWAP_META_MASK;
pswap = &swhash[(index ^ (int)(intptr_t)object) & swhash_mask];
while ((swap = *pswap) != NULL) {
if (swap->swb_object == object &&
swap->swb_index == index
) {
break;
}
pswap = &swap->swb_hnext;
}
return (pswap);
}
/*
* SWAP_PAGER_INIT() - initialize the swap pager!
*
* Expected to be started from system init. NOTE: This code is run
* before much else so be careful what you depend on. Most of the VM
* system has yet to be initialized at this point.
*/
static void
swap_pager_init(void)
{
/*
* Initialize object lists
*/
int i;
for (i = 0; i < NOBJLISTS; ++i)
TAILQ_INIT(&swap_pager_object_list[i]);
mtx_init(&sw_alloc_mtx, "swap_pager list", NULL, MTX_DEF);
mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
/*
* Device Stripe, in PAGE_SIZE'd blocks
*/
dmmax = SWB_NPAGES * 2;
}
/*
* SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
*
* Expected to be started from pageout process once, prior to entering
* its main loop.
*/
void
swap_pager_swap_init(void)
{
int n, n2;
/*
* Number of in-transit swap bp operations. Don't
* exhaust the pbufs completely. Make sure we
* initialize workable values (0 will work for hysteresis
* but it isn't very efficient).
*
* The nsw_cluster_max is constrained by the bp->b_pages[]
* array (MAXPHYS/PAGE_SIZE) and our locally defined
* MAX_PAGEOUT_CLUSTER. Also be aware that swap ops are
* constrained by the swap device interleave stripe size.
*
* Currently we hardwire nsw_wcount_async to 4. This limit is
* designed to prevent other I/O from having high latencies due to
* our pageout I/O. The value 4 works well for one or two active swap
* devices but is probably a little low if you have more. Even so,
* a higher value would probably generate only a limited improvement
* with three or four active swap devices since the system does not
* typically have to pageout at extreme bandwidths. We will want
* at least 2 per swap device, and 4 is a pretty good value if you
* have one NFS swap device due to the command/ack latency over NFS.
* So it all works out pretty well.
*/
nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER);
mtx_lock(&pbuf_mtx);
nsw_rcount = (nswbuf + 1) / 2;
nsw_wcount_sync = (nswbuf + 3) / 4;
nsw_wcount_async = 4;
nsw_wcount_async_max = nsw_wcount_async;
mtx_unlock(&pbuf_mtx);
/*
* Initialize our zone. Right now I'm just guessing on the number
* we need based on the number of pages in the system. Each swblock
* can hold SWAP_META_PAGES (32 by default) pages, so this is probably
* overkill. This reservation
* is typically limited to around 32MB by default.
*/
n = cnt.v_page_count / 2;
if (maxswzone && n > maxswzone / sizeof(struct swblock))
n = maxswzone / sizeof(struct swblock);
n2 = n;
swap_zone = uma_zcreate("SWAPMETA", sizeof(struct swblock), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE | UMA_ZONE_VM);
if (swap_zone == NULL)
panic("failed to create swap_zone.");
do {
if (uma_zone_set_obj(swap_zone, &swap_zone_obj, n))
break;
/*
* if the allocation failed, try a zone two thirds the
* size of the previous attempt.
*/
n -= ((n + 2) / 3);
} while (n > 0);
if (n2 != n)
printf("Swap zone entries reduced from %d to %d.\n", n2, n);
n2 = n;
/*
* Initialize our meta-data hash table. The swapper does not need to
* be quite as efficient as the VM system, so we do not use an
* oversized hash table.
*
* n: size of hash table, must be power of 2
* swhash_mask: hash table index mask
*/
for (n = 1; n < n2 / 8; n *= 2)
;
swhash = malloc(sizeof(struct swblock *) * n, M_VMPGDATA, M_WAITOK | M_ZERO);
swhash_mask = n - 1;
mtx_init(&swhash_mtx, "swap_pager swhash", NULL, MTX_DEF);
}
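/*
 * Sizing sketch for the hash loop above (illustrative numbers): with
 * n2 == 50000 swblock entries, n doubles 1, 2, 4, ... until it is no longer
 * below n2 / 8 == 6250, ending at 8192; the hash table then has 8192
 * buckets and swhash_mask == 8191.
 */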
/*
* SWAP_PAGER_ALLOC() - allocate a new OBJT_SWAP VM object and instantiate
* its metadata structures.
*
* This routine is called from the mmap and fork code to create a new
* OBJT_SWAP object. We do this by creating an OBJT_DEFAULT object
* and then converting it with swp_pager_meta_build().
*
* This routine may block in vm_object_allocate() and create a named
* object lookup race, so we must interlock.
*
* MPSAFE
*/
static vm_object_t
swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t offset, struct ucred *cred)
{
vm_object_t object;
vm_pindex_t pindex;
pindex = OFF_TO_IDX(offset + PAGE_MASK + size);
if (handle) {
mtx_lock(&Giant);
/*
* Reference existing named region or allocate new one. There
* should not be a race here against swp_pager_meta_build()
* as called from vm_page_remove() in regards to the lookup
* of the handle.
*/
sx_xlock(&sw_alloc_sx);
object = vm_pager_object_lookup(NOBJLIST(handle), handle);
if (object == NULL) {
if (cred != NULL) {
if (!swap_reserve_by_cred(size, cred)) {
sx_xunlock(&sw_alloc_sx);
mtx_unlock(&Giant);
return (NULL);
}
crhold(cred);
}
object = vm_object_allocate(OBJT_DEFAULT, pindex);
VM_OBJECT_LOCK(object);
object->handle = handle;
if (cred != NULL) {
object->cred = cred;
object->charge = size;
}
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
VM_OBJECT_UNLOCK(object);
}
sx_xunlock(&sw_alloc_sx);
mtx_unlock(&Giant);
} else {
if (cred != NULL) {
if (!swap_reserve_by_cred(size, cred))
return (NULL);
crhold(cred);
}
object = vm_object_allocate(OBJT_DEFAULT, pindex);
VM_OBJECT_LOCK(object);
if (cred != NULL) {
object->cred = cred;
object->charge = size;
}
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
VM_OBJECT_UNLOCK(object);
}
return (object);
}
/*
* SWAP_PAGER_DEALLOC() - remove swap metadata from object
*
* The swap backing for the object is destroyed. The code is
* designed such that we can reinstantiate it later, but this
* routine is typically called only when the entire object is
* about to be destroyed.
*
* The object must be locked.
*/
static void
swap_pager_dealloc(vm_object_t object)
{
/*
* Remove from list right away so lookups will fail if we block for
* pageout completion.
*/
if (object->handle != NULL) {
mtx_lock(&sw_alloc_mtx);
TAILQ_REMOVE(NOBJLIST(object->handle), object, pager_object_list);
mtx_unlock(&sw_alloc_mtx);
}
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
vm_object_pip_wait(object, "swpdea");
/*
* Free all remaining metadata. We only bother to free it from
* the swap meta data. We do not attempt to free swapblk's still
* associated with vm_page_t's for this object. We do not care
* if paging is still in progress on some objects.
*/
swp_pager_meta_free_all(object);
}
/************************************************************************
* SWAP PAGER BITMAP ROUTINES *
************************************************************************/
/*
* SWP_PAGER_GETSWAPSPACE() - allocate raw swap space
*
* Allocate swap for the requested number of pages. The starting
* swap block number (a page index) is returned or SWAPBLK_NONE
* if the allocation failed.
*
* Also has the side effect of advising that somebody made a mistake
* when they configured swap and didn't configure enough.
*
* This routine may not sleep.
*
* We allocate in round-robin fashion from the configured devices.
*/
static daddr_t
swp_pager_getswapspace(int npages)
{
daddr_t blk;
struct swdevt *sp;
int i;
blk = SWAPBLK_NONE;
mtx_lock(&sw_dev_mtx);
sp = swdevhd;
for (i = 0; i < nswapdev; i++) {
if (sp == NULL)
sp = TAILQ_FIRST(&swtailq);
if (!(sp->sw_flags & SW_CLOSING)) {
blk = blist_alloc(sp->sw_blist, npages);
if (blk != SWAPBLK_NONE) {
blk += sp->sw_first;
sp->sw_used += npages;
swap_pager_avail -= npages;
swp_sizecheck();
swdevhd = TAILQ_NEXT(sp, sw_list);
goto done;
}
}
sp = TAILQ_NEXT(sp, sw_list);
}
if (swap_pager_full != 2) {
printf("swap_pager_getswapspace(%d): failed\n", npages);
swap_pager_full = 2;
swap_pager_almost_full = 1;
}
swdevhd = NULL;
done:
mtx_unlock(&sw_dev_mtx);
return (blk);
}
static int
swp_pager_isondev(daddr_t blk, struct swdevt *sp)
{
return (blk >= sp->sw_first && blk < sp->sw_end);
}
static void
swp_pager_strategy(struct buf *bp)
{
struct swdevt *sp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (bp->b_blkno >= sp->sw_first && bp->b_blkno < sp->sw_end) {
mtx_unlock(&sw_dev_mtx);
sp->sw_strategy(bp, sp);
return;
}
}
panic("Swapdev not found");
}
/*
* SWP_PAGER_FREESWAPSPACE() - free raw swap space
*
* This routine returns the specified swap blocks back to the bitmap.
*
* This routine may not sleep.
*/
static void
swp_pager_freeswapspace(daddr_t blk, int npages)
{
struct swdevt *sp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (blk >= sp->sw_first && blk < sp->sw_end) {
sp->sw_used -= npages;
/*
* If we are attempting to stop swapping on
* this device, we don't want to mark any
* blocks free lest they be reused.
*/
if ((sp->sw_flags & SW_CLOSING) == 0) {
blist_free(sp->sw_blist, blk - sp->sw_first,
npages);
swap_pager_avail += npages;
swp_sizecheck();
}
mtx_unlock(&sw_dev_mtx);
return;
}
}
panic("Swapdev not found");
}
/*
* SWAP_PAGER_FREESPACE() - frees swap blocks associated with a page
* range within an object.
*
* This is a globally accessible routine.
*
* This routine removes swapblk assignments from swap metadata.
*
* The external callers of this routine typically have already destroyed
* or renamed vm_page_t's associated with this range in the object so
* we should be ok.
*/
void
swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
swp_pager_meta_free(object, start, size);
}
/*
* SWAP_PAGER_RESERVE() - reserve swap blocks in object
*
* Assigns swap blocks to the specified range within the object. The
* swap blocks are not zeroed. Any previous swap assignment is destroyed.
*
* Returns 0 on success, -1 on failure.
*/
int
swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
{
int n = 0;
daddr_t blk = SWAPBLK_NONE;
vm_pindex_t beg = start; /* save start index */
VM_OBJECT_LOCK(object);
while (size) {
if (n == 0) {
n = BLIST_MAX_ALLOC;
while ((blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE) {
n >>= 1;
if (n == 0) {
swp_pager_meta_free(object, beg, start - beg);
VM_OBJECT_UNLOCK(object);
return (-1);
}
}
}
swp_pager_meta_build(object, start, blk);
--size;
++start;
++blk;
--n;
}
swp_pager_meta_free(object, start, n);
VM_OBJECT_UNLOCK(object);
return (0);
}
/*
* SWAP_PAGER_COPY() - copy blocks from source pager to destination pager
* and destroy the source.
*
* Copy any valid swapblks from the source to the destination. In
* cases where both the source and destination have a valid swapblk,
* we keep the destination's.
*
* This routine is allowed to sleep. It may sleep allocating metadata
* indirectly through swp_pager_meta_build() or if paging is still in
* progress on the source.
*
* The source object contains no vm_page_t's (which is just as well)
*
* The source object is of type OBJT_SWAP.
*
* The source and destination objects must be locked.
* Both object locks may temporarily be released.
*/
void
swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
vm_pindex_t offset, int destroysource)
{
vm_pindex_t i;
VM_OBJECT_LOCK_ASSERT(srcobject, MA_OWNED);
VM_OBJECT_LOCK_ASSERT(dstobject, MA_OWNED);
/*
* If destroysource is set, we remove the source object from the
* swap_pager internal queue now.
*/
if (destroysource) {
if (srcobject->handle != NULL) {
mtx_lock(&sw_alloc_mtx);
TAILQ_REMOVE(
NOBJLIST(srcobject->handle),
srcobject,
pager_object_list
);
mtx_unlock(&sw_alloc_mtx);
}
}
/*
* transfer source to destination.
*/
for (i = 0; i < dstobject->size; ++i) {
daddr_t dstaddr;
/*
* Locate (without changing) the swapblk on the destination,
* unless it is invalid in which case free it silently, or
* if the destination is a resident page, in which case the
* source is thrown away.
*/
dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
if (dstaddr == SWAPBLK_NONE) {
/*
* Destination has no swapblk and is not resident,
* copy source.
*/
daddr_t srcaddr;
srcaddr = swp_pager_meta_ctl(
srcobject,
i + offset,
SWM_POP
);
if (srcaddr != SWAPBLK_NONE) {
/*
* swp_pager_meta_build() can sleep.
*/
vm_object_pip_add(srcobject, 1);
VM_OBJECT_UNLOCK(srcobject);
vm_object_pip_add(dstobject, 1);
swp_pager_meta_build(dstobject, i, srcaddr);
vm_object_pip_wakeup(dstobject);
VM_OBJECT_LOCK(srcobject);
vm_object_pip_wakeup(srcobject);
}
} else {
/*
* Destination has valid swapblk or it is represented
* by a resident page. We destroy the sourceblock.
*/
swp_pager_meta_ctl(srcobject, i + offset, SWM_FREE);
}
}
/*
* Free left over swap blocks in source.
*
* We have to revert the type to OBJT_DEFAULT so we do not accidentally
* double-remove the object from the swap queues.
*/
if (destroysource) {
swp_pager_meta_free_all(srcobject);
/*
* Reverting the type is not necessary, the caller is going
* to destroy srcobject directly, but I'm doing it here
* for consistency since we've removed the object from its
* queues.
*/
srcobject->type = OBJT_DEFAULT;
}
}
/*
* SWAP_PAGER_HASPAGE() - determine if we have good backing store for
* the requested page.
*
* We determine whether good backing store exists for the requested
* page and return TRUE if it does, FALSE if it doesn't.
*
* If TRUE, we also try to determine how much valid, contiguous backing
* store exists before and after the requested page within a reasonable
* distance. We do not try to restrict it to the swap device stripe
* (that is handled in getpages/putpages). It probably isn't worth
* doing here.
*/
static boolean_t
swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after)
{
daddr_t blk0;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
/*
* do we have good backing store at the requested index ?
*/
blk0 = swp_pager_meta_ctl(object, pindex, 0);
if (blk0 == SWAPBLK_NONE) {
if (before)
*before = 0;
if (after)
*after = 0;
return (FALSE);
}
/*
* find backwards-looking contiguous good backing store
*/
if (before != NULL) {
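/*
 * Count how many immediately preceding pages have swap blocks
 * numerically contiguous with blk0; the loop exits with i one past
 * the last match.
 */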
int i;
for (i = 1; i < (SWB_NPAGES/2); ++i) {
daddr_t blk;
if (i > pindex)
break;
blk = swp_pager_meta_ctl(object, pindex - i, 0);
if (blk != blk0 - i)
break;
}
*before = (i - 1);
}
/*
* find forward-looking contiguous good backing store
*/
if (after != NULL) {
int i;
for (i = 1; i < (SWB_NPAGES/2); ++i) {
daddr_t blk;
blk = swp_pager_meta_ctl(object, pindex + i, 0);
if (blk != blk0 + i)
break;
}
*after = (i - 1);
}
return (TRUE);
}
/*
* SWAP_PAGER_PAGE_UNSWAPPED() - remove swap backing store related to page
*
* This removes any associated swap backing store, whether valid or
* not, from the page.
*
* This routine is typically called when a page is made dirty, at
* which point any associated swap can be freed. MADV_FREE also
* calls us in a special-case situation.
*
* NOTE!!! If the page is clean and the swap was valid, the caller
* should make the page dirty before calling this routine. This routine
* does NOT change the m->dirty status of the page. Also: MADV_FREE
* depends on it.
*
* This routine may not sleep.
*/
static void
swap_pager_unswapped(vm_page_t m)
{
VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
swp_pager_meta_ctl(m->object, m->pindex, SWM_FREE);
}
/*
* SWAP_PAGER_GETPAGES() - bring pages in from swap
*
* Attempt to retrieve (m, count) pages from backing store, but make
* sure we retrieve at least m[reqpage]. We try to load in as large
* a chunk surrounding m[reqpage] as is contiguous in swap and which
* belongs to the same object.
*
* The code is designed for asynchronous operation and
* immediate-notification of 'reqpage' but tends not to be
* used that way. Please do not optimize-out this algorithmic
* feature, I intend to improve on it in the future.
*
* The parent has a single vm_object_pip_add() reference prior to
* calling us and we should return with the same.
*
* The parent has BUSY'd the pages. We should return with 'm'
* left busy, but the others adjusted.
*/
static int
swap_pager_getpages(vm_object_t object, vm_page_t *m, int count, int reqpage)
{
struct buf *bp;
vm_page_t mreq;
int i;
int j;
daddr_t blk;
mreq = m[reqpage];
KASSERT(mreq->object == object,
("swap_pager_getpages: object mismatch %p/%p",
object, mreq->object));
/*
* Calculate range to retrieve. The pages have already been assigned
* their swapblks. We require a *contiguous* range but we know it to
* not span devices. If we do not supply it, bad things
* happen. Note that blk, iblk & jblk can be SWAPBLK_NONE, but the
* loops are set up such that the case(s) are handled implicitly.
*
* The swp_*() calls must be made with the object locked.
*/
blk = swp_pager_meta_ctl(mreq->object, mreq->pindex, 0);
for (i = reqpage - 1; i >= 0; --i) {
daddr_t iblk;
iblk = swp_pager_meta_ctl(m[i]->object, m[i]->pindex, 0);
if (blk != iblk + (reqpage - i))
break;
}
++i;
for (j = reqpage + 1; j < count; ++j) {
daddr_t jblk;
jblk = swp_pager_meta_ctl(m[j]->object, m[j]->pindex, 0);
if (blk != jblk - (j - reqpage))
break;
}
/*
* free pages outside our collection range. Note: we never free
* mreq, it must remain busy throughout.
*/
if (0 < i || j < count) {
int k;
for (k = 0; k < i; ++k)
swp_pager_free_nrpage(m[k]);
for (k = j; k < count; ++k)
swp_pager_free_nrpage(m[k]);
}
/*
* Return VM_PAGER_FAIL if we have nothing to do. Return mreq
* still busy, but the others unbusied.
*/
if (blk == SWAPBLK_NONE)
return (VM_PAGER_FAIL);
/*
* Getpbuf() can sleep.
*/
VM_OBJECT_UNLOCK(object);
/*
* Get a swap buffer header to perform the IO
*/
bp = getpbuf(&nsw_rcount);
bp->b_flags |= B_PAGING;
/*
* map our page(s) into kva for input
*/
pmap_qenter((vm_offset_t)bp->b_data, m + i, j - i);
bp->b_iocmd = BIO_READ;
bp->b_iodone = swp_pager_async_iodone;
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
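/*
 * blk is the swap block of the requested page; back it up to the
 * block of the first page in the cluster being read.
 */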
bp->b_blkno = blk - (reqpage - i);
bp->b_bcount = PAGE_SIZE * (j - i);
bp->b_bufsize = PAGE_SIZE * (j - i);
bp->b_pager.pg_reqpage = reqpage - i;
VM_OBJECT_LOCK(object);
{
int k;
for (k = i; k < j; ++k) {
bp->b_pages[k - i] = m[k];
m[k]->oflags |= VPO_SWAPINPROG;
}
}
bp->b_npages = j - i;
PCPU_INC(cnt.v_swapin);
PCPU_ADD(cnt.v_swappgsin, bp->b_npages);
/*
* We still hold the lock on mreq, and our automatic completion routine
* does not remove it.
*/
vm_object_pip_add(object, bp->b_npages);
VM_OBJECT_UNLOCK(object);
/*
* perform the I/O. NOTE!!! bp cannot be considered valid after
* this point because we automatically release it on completion.
* Instead, we look at the one page we are interested in which we
* still hold a lock on even through the I/O completion.
*
* The other pages in our m[] array are also released on completion,
* so we cannot assume they are valid anymore either.
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
*/
BUF_KERNPROC(bp);
swp_pager_strategy(bp);
/*
* wait for the page we want to complete. VPO_SWAPINPROG is always
* cleared on completion. If an I/O error occurs, SWAPBLK_NONE
* is set in the meta-data.
*/
VM_OBJECT_LOCK(object);
while ((mreq->oflags & VPO_SWAPINPROG) != 0) {
mreq->oflags |= VPO_WANTED;
PCPU_INC(cnt.v_intrans);
if (msleep(mreq, VM_OBJECT_MTX(object), PSWP, "swread", hz*20)) {
printf(
"swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
}
}
/*
* mreq is left busied after completion, but all the other pages
* are freed. If we had an unrecoverable read error the page will
* not be valid.
*/
if (mreq->valid != VM_PAGE_BITS_ALL) {
return (VM_PAGER_ERROR);
} else {
return (VM_PAGER_OK);
}
/*
* A final note: in a low swap situation, we cannot deallocate swap
* and mark a page dirty here because the caller is likely to mark
* the page clean when we return, causing the page to possibly revert
* to all-zero's later.
*/
}
/*
* swap_pager_putpages:
*
* Assign swap (if necessary) and initiate I/O on the specified pages.
*
* We support both OBJT_DEFAULT and OBJT_SWAP objects. DEFAULT objects
* are automatically converted to SWAP objects.
*
* In a low memory situation we may block in VOP_STRATEGY(), but the new
* vm_page reservation system coupled with properly written VFS devices
* should ensure that no low-memory deadlock occurs. This is an area
* which needs work.
*
* The parent has N vm_object_pip_add() references prior to
* calling us and will remove references for rtvals[] that are
* not set to VM_PAGER_PEND. We need to remove the rest on I/O
* completion.
*
* The parent has soft-busy'd the pages it passes us and will unbusy
* those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
* We need to unbusy the rest on I/O completion.
*/
void
swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
boolean_t sync, int *rtvals)
{
int i;
int n = 0;
if (count && m[0]->object != object) {
panic("swap_pager_putpages: object mismatch %p/%p",
object,
m[0]->object
);
}
/*
* Step 1
*
* Turn object into OBJT_SWAP
* check for bogus sysops
* force sync if not pageout process
*/
if (object->type != OBJT_SWAP)
swp_pager_meta_build(object, 0, SWAPBLK_NONE);
VM_OBJECT_UNLOCK(object);
if (curproc != pageproc)
sync = TRUE;
/*
* Step 2
*
* Update nsw parameters from swap_async_max sysctl values.
* Do not let the sysop crash the machine with bogus numbers.
*/
mtx_lock(&pbuf_mtx);
if (swap_async_max != nsw_wcount_async_max) {
int n;
/*
* limit range
*/
if ((n = swap_async_max) > nswbuf / 2)
n = nswbuf / 2;
if (n < 1)
n = 1;
swap_async_max = n;
/*
* Adjust difference ( if possible ). If the current async
* count is too low, we may not be able to make the adjustment
* at this time.
*/
n -= nsw_wcount_async_max;
if (nsw_wcount_async + n >= 0) {
nsw_wcount_async += n;
nsw_wcount_async_max += n;
wakeup(&nsw_wcount_async);
}
}
mtx_unlock(&pbuf_mtx);
/*
* Step 3
*
* Assign swap blocks and issue I/O. We reallocate swap on the fly.
* The page is left dirty until the pageout operation completes
* successfully.
*/
for (i = 0; i < count; i += n) {
int j;
struct buf *bp;
daddr_t blk;
/*
* Maximum I/O size is limited by a number of factors.
*/
n = min(BLIST_MAX_ALLOC, count - i);
n = min(n, nsw_cluster_max);
/*
* Get biggest block of swap we can. If we fail, fall
* back and try to allocate a smaller block. Don't go
* overboard trying to allocate space if it would overly
* fragment swap.
*/
while (
(blk = swp_pager_getswapspace(n)) == SWAPBLK_NONE &&
n > 4
) {
n >>= 1;
}
if (blk == SWAPBLK_NONE) {
for (j = 0; j < n; ++j)
rtvals[i+j] = VM_PAGER_FAIL;
continue;
}
/*
* All I/O parameters have been satisfied, build the I/O
* request and assign the swap space.
*/
if (sync == TRUE) {
bp = getpbuf(&nsw_wcount_sync);
} else {
bp = getpbuf(&nsw_wcount_async);
bp->b_flags = B_ASYNC;
}
bp->b_flags |= B_PAGING;
bp->b_iocmd = BIO_WRITE;
pmap_qenter((vm_offset_t)bp->b_data, &m[i], n);
bp->b_rcred = crhold(thread0.td_ucred);
bp->b_wcred = crhold(thread0.td_ucred);
bp->b_bcount = PAGE_SIZE * n;
bp->b_bufsize = PAGE_SIZE * n;
bp->b_blkno = blk;
VM_OBJECT_LOCK(object);
for (j = 0; j < n; ++j) {
vm_page_t mreq = m[i+j];
swp_pager_meta_build(
mreq->object,
mreq->pindex,
blk + j
);
vm_page_dirty(mreq);
rtvals[i+j] = VM_PAGER_OK;
mreq->oflags |= VPO_SWAPINPROG;
bp->b_pages[j] = mreq;
}
VM_OBJECT_UNLOCK(object);
bp->b_npages = n;
/*
* Must set dirty range for NFS to work.
*/
bp->b_dirtyoff = 0;
bp->b_dirtyend = bp->b_bcount;
PCPU_INC(cnt.v_swapout);
PCPU_ADD(cnt.v_swappgsout, bp->b_npages);
/*
* asynchronous
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
*/
if (sync == FALSE) {
bp->b_iodone = swp_pager_async_iodone;
BUF_KERNPROC(bp);
swp_pager_strategy(bp);
for (j = 0; j < n; ++j)
rtvals[i+j] = VM_PAGER_PEND;
/* restart outer loop */
continue;
}
/*
* synchronous
*
* NOTE: b_blkno is destroyed by the call to swapdev_strategy
*/
bp->b_iodone = bdone;
swp_pager_strategy(bp);
/*
* Wait for the sync I/O to complete, then update rtvals.
* We just set the rtvals[] to VM_PAGER_PEND so we can call
* our async completion routine at the end, thus avoiding a
* double-free.
*/
bwait(bp, PVM, "swwrt");
for (j = 0; j < n; ++j)
rtvals[i+j] = VM_PAGER_PEND;
/*
* Now that we are through with the bp, we can call the
* normal async completion, which frees everything up.
*/
swp_pager_async_iodone(bp);
}
VM_OBJECT_LOCK(object);
}
/*
* swp_pager_async_iodone:
*
* Completion routine for asynchronous reads and writes from/to swap.
* Also called manually by synchronous code to finish up a bp.
*
* For READ operations, the pages are VPO_BUSY'd. For WRITE operations,
* the pages are vm_page_t->busy'd. For READ operations, we VPO_BUSY
* unbusy all pages except the 'main' request page. For WRITE
* operations, we vm_page_t->busy unbusy all pages ( we can do this
* because we marked them all VM_PAGER_PEND on return from putpages ).
*
* This routine may not sleep.
*/
static void
swp_pager_async_iodone(struct buf *bp)
{
int i;
vm_object_t object = NULL;
/*
* report error
*/
if (bp->b_ioflags & BIO_ERROR) {
printf(
"swap_pager: I/O error - %s failed; blkno %ld,"
"size %ld, error %d\n",
((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
(long)bp->b_blkno,
(long)bp->b_bcount,
bp->b_error
);
}
/*
* remove the mapping for kernel virtual
*/
pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
if (bp->b_npages) {
object = bp->b_pages[0]->object;
VM_OBJECT_LOCK(object);
}
/*
* cleanup pages. If an error occurs writing to swap, we are in
* very serious trouble. If it happens to be a disk error, though,
* we may be able to recover by reassigning the swap later on. So
* in this case we remove the m->swapblk assignment for the page
* but do not free it in the rlist. The erroneous block(s) are thus
* never reallocated as swap. Redirty the page and continue.
*/
for (i = 0; i < bp->b_npages; ++i) {
vm_page_t m = bp->b_pages[i];
m->oflags &= ~VPO_SWAPINPROG;
if (bp->b_ioflags & BIO_ERROR) {
/*
* If an error occurs I'd love to throw the swapblk
* away without freeing it back to swapspace, so it
* can never be used again. But I can't from an
* interrupt.
*/
if (bp->b_iocmd == BIO_READ) {
/*
* When reading, reqpage needs to stay
* locked for the parent, but all other
* pages can be freed. We still want to
* wakeup the parent waiting on the page,
* though. ( also: pg_reqpage can be -1 and
* not match anything ).
*
* We have to wake specifically requested pages
* up too because we cleared VPO_SWAPINPROG and
* someone may be waiting for that.
*
* NOTE: for reads, m->dirty will probably
* be overridden by the original caller of
* getpages so don't play cute tricks here.
*/
m->valid = 0;
if (i != bp->b_pager.pg_reqpage)
swp_pager_free_nrpage(m);
else
vm_page_flash(m);
/*
* If i == bp->b_pager.pg_reqpage, do not wake
* the page up. The caller needs to.
*/
} else {
/*
* If a write error occurs, reactivate page
* so it doesn't clog the inactive list,
* then finish the I/O.
*/
vm_page_dirty(m);
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
vm_page_io_finish(m);
}
} else if (bp->b_iocmd == BIO_READ) {
/*
* NOTE: for reads, m->dirty will probably be
* overridden by the original caller of getpages so
* we cannot set them in order to free the underlying
* swap in a low-swap situation. I don't think we'd
* want to do that anyway, but it was an optimization
* that existed in the old swapper for a time before
* it got ripped out due to precisely this problem.
*
* If not the requested page then deactivate it.
*
* Note that the requested page, reqpage, is left
* busied, but we still have to wake it up. The
* other pages are released (unbusied) by
* vm_page_wakeup().
*/
KASSERT(!pmap_page_is_mapped(m),
("swp_pager_async_iodone: page %p is mapped", m));
m->valid = VM_PAGE_BITS_ALL;
KASSERT(m->dirty == 0,
("swp_pager_async_iodone: page %p is dirty", m));
/*
* We have to wake specifically requested pages
* up too because we cleared VPO_SWAPINPROG and
* getpages could be waiting for it. However,
* be sure not to unbusy the page specifically
* requested by getpages - getpages expects it to be
* left busy.
*/
if (i != bp->b_pager.pg_reqpage) {
vm_page_lock(m);
vm_page_deactivate(m);
vm_page_unlock(m);
vm_page_wakeup(m);
} else
vm_page_flash(m);
} else {
/*
* For write success, clear the dirty
* status, then finish the I/O ( which decrements the
* busy count and possibly wakes waiters up ).
*/
KASSERT((m->aflags & PGA_WRITEABLE) == 0,
("swp_pager_async_iodone: page %p is not write"
" protected", m));
vm_page_undirty(m);
vm_page_io_finish(m);
if (vm_page_count_severe()) {
vm_page_lock(m);
vm_page_try_to_cache(m);
vm_page_unlock(m);
}
}
}
/*
* adjust pip. NOTE: the original parent may still have its own
* pip refs on the object.
*/
if (object != NULL) {
vm_object_pip_wakeupn(object, bp->b_npages);
VM_OBJECT_UNLOCK(object);
}
/*
* swapdev_strategy() manually sets b_vp and b_bufobj before calling
* bstrategy(). Set them back to NULL now we're done with it, or we'll
* trigger a KASSERT in relpbuf().
*/
if (bp->b_vp) {
bp->b_vp = NULL;
bp->b_bufobj = NULL;
}
/*
* release the physical I/O buffer
*/
relpbuf(
bp,
((bp->b_iocmd == BIO_READ) ? &nsw_rcount :
((bp->b_flags & B_ASYNC) ?
&nsw_wcount_async :
&nsw_wcount_sync
)
)
);
}
/*
* swap_pager_isswapped:
*
* Return 1 if at least one page in the given object is paged
* out to the given swap device.
*
* This routine may not sleep.
*/
int
swap_pager_isswapped(vm_object_t object, struct swdevt *sp)
{
daddr_t index = 0;
int bcount;
int i;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->type != OBJT_SWAP)
return (0);
mtx_lock(&swhash_mtx);
for (bcount = 0; bcount < object->un_pager.swp.swp_bcount; bcount++) {
struct swblock *swap;
if ((swap = *swp_pager_hash(object, index)) != NULL) {
for (i = 0; i < SWAP_META_PAGES; ++i) {
if (swp_pager_isondev(swap->swb_pages[i], sp)) {
mtx_unlock(&swhash_mtx);
return (1);
}
}
}
index += SWAP_META_PAGES;
}
mtx_unlock(&swhash_mtx);
return (0);
}
/*
* SWP_PAGER_FORCE_PAGEIN() - force a swap block to be paged in
*
* This routine dissociates the page at the given index within a
* swap block from its backing store, paging it in if necessary.
* If the page is paged in, it is placed in the inactive queue,
* since it had its backing store ripped out from under it.
* We also attempt to swap in all other pages in the swap block,
* but we only guarantee that the one at the specified index is
* paged in.
*
* XXX - The code to page the whole block in doesn't work, so we
* revert to the one-by-one behavior for now. Sigh.
*/
static inline void
swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex)
{
vm_page_t m;
vm_object_pip_add(object, 1);
m = vm_page_grab(object, pindex, VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
if (m->valid == VM_PAGE_BITS_ALL) {
vm_object_pip_subtract(object, 1);
vm_page_dirty(m);
vm_page_lock(m);
vm_page_activate(m);
vm_page_unlock(m);
vm_page_wakeup(m);
vm_pager_page_unswapped(m);
return;
}
if (swap_pager_getpages(object, &m, 1, 0) != VM_PAGER_OK)
panic("swap_pager_force_pagein: read from swap failed");/*XXX*/
vm_object_pip_subtract(object, 1);
vm_page_dirty(m);
vm_page_lock(m);
vm_page_deactivate(m);
vm_page_unlock(m);
vm_page_wakeup(m);
vm_pager_page_unswapped(m);
}
/*
* swap_pager_swapoff:
*
* Page in all of the pages that have been paged out to the
* given device. The corresponding blocks in the bitmap must be
* marked as allocated and the device must be flagged SW_CLOSING.
* There must be no processes swapped out to the device.
*
* This routine may block.
*/
static void
swap_pager_swapoff(struct swdevt *sp)
{
struct swblock *swap;
int i, j, retries;
GIANT_REQUIRED;
retries = 0;
full_rescan:
mtx_lock(&swhash_mtx);
for (i = 0; i <= swhash_mask; i++) { /* '<=' is correct here */
restart:
for (swap = swhash[i]; swap != NULL; swap = swap->swb_hnext) {
vm_object_t object = swap->swb_object;
vm_pindex_t pindex = swap->swb_index;
for (j = 0; j < SWAP_META_PAGES; ++j) {
if (swp_pager_isondev(swap->swb_pages[j], sp)) {
/* avoid deadlock */
if (!VM_OBJECT_TRYLOCK(object)) {
break;
} else {
mtx_unlock(&swhash_mtx);
swp_pager_force_pagein(object,
pindex + j);
VM_OBJECT_UNLOCK(object);
mtx_lock(&swhash_mtx);
goto restart;
}
}
}
}
}
mtx_unlock(&swhash_mtx);
if (sp->sw_used) {
/*
* Objects may be locked or paging to the device being
* removed, so we will miss their pages and need to
* make another pass. We have marked this device as
* SW_CLOSING, so the activity should finish soon.
*/
retries++;
if (retries > 100) {
panic("swapoff: failed to locate %d swap blocks",
sp->sw_used);
}
pause("swpoff", hz / 20);
goto full_rescan;
}
}
/************************************************************************
* SWAP META DATA *
************************************************************************
*
* These routines manipulate the swap metadata stored in the
* OBJT_SWAP object.
*
* Swap metadata is implemented with a global hash and not directly
* linked into the object. Instead the object simply contains
* appropriate tracking counters.
*/
/*
* SWP_PAGER_META_BUILD() - add swap block to swap meta data for object
*
* We first convert the object to a swap object if it is a default
* object.
*
* The specified swapblk is added to the object's swap metadata. If
* the swapblk is not valid, it is freed instead. Any previously
* assigned swapblk is freed.
*/
static void
swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
{
struct swblock *swap;
struct swblock **pswap;
int idx;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
/*
* Convert default object to swap object if necessary
*/
if (object->type != OBJT_SWAP) {
object->type = OBJT_SWAP;
object->un_pager.swp.swp_bcount = 0;
if (object->handle != NULL) {
mtx_lock(&sw_alloc_mtx);
TAILQ_INSERT_TAIL(
NOBJLIST(object->handle),
object,
pager_object_list
);
mtx_unlock(&sw_alloc_mtx);
}
}
/*
* Locate the hash entry. If not found, create one, but if we aren't adding
* anything just return. If we run out of space in the map we wait
* and, since the hash table may have changed, retry.
*/
retry:
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, pindex);
if ((swap = *pswap) == NULL) {
int i;
if (swapblk == SWAPBLK_NONE)
goto done;
swap = *pswap = uma_zalloc(swap_zone, M_NOWAIT);
if (swap == NULL) {
mtx_unlock(&swhash_mtx);
VM_OBJECT_UNLOCK(object);
if (uma_zone_exhausted(swap_zone)) {
printf("swap zone exhausted, increase kern.maxswzone\n");
vm_pageout_oom(VM_OOM_SWAPZ);
pause("swzonex", 10);
} else
VM_WAIT;
VM_OBJECT_LOCK(object);
goto retry;
}
swap->swb_hnext = NULL;
swap->swb_object = object;
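/* swb_index is the chunk-aligned base page index covered by this swblock. */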
swap->swb_index = pindex & ~(vm_pindex_t)SWAP_META_MASK;
swap->swb_count = 0;
++object->un_pager.swp.swp_bcount;
for (i = 0; i < SWAP_META_PAGES; ++i)
swap->swb_pages[i] = SWAPBLK_NONE;
}
/*
* Delete prior contents of metadata
*/
idx = pindex & SWAP_META_MASK;
if (swap->swb_pages[idx] != SWAPBLK_NONE) {
swp_pager_freeswapspace(swap->swb_pages[idx], 1);
--swap->swb_count;
}
/*
* Enter block into metadata
*/
swap->swb_pages[idx] = swapblk;
if (swapblk != SWAPBLK_NONE)
++swap->swb_count;
done:
mtx_unlock(&swhash_mtx);
}
/*
* SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
*
* The requested range of blocks is freed, with any associated swap
* returned to the swap bitmap.
*
* This routine will free swap metadata structures as they are cleaned
* out. This routine does *NOT* operate on swap metadata associated
* with resident pages.
*/
static void
swp_pager_meta_free(vm_object_t object, vm_pindex_t index, daddr_t count)
{
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->type != OBJT_SWAP)
return;
while (count > 0) {
struct swblock **pswap;
struct swblock *swap;
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, index);
if ((swap = *pswap) != NULL) {
daddr_t v = swap->swb_pages[index & SWAP_META_MASK];
if (v != SWAPBLK_NONE) {
swp_pager_freeswapspace(v, 1);
swap->swb_pages[index & SWAP_META_MASK] =
SWAPBLK_NONE;
if (--swap->swb_count == 0) {
*pswap = swap->swb_hnext;
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
}
}
--count;
++index;
} else {
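/*
 * No swblock covers this index; skip ahead to the start of the
 * next SWAP_META_PAGES-sized chunk.
 */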
int n = SWAP_META_PAGES - (index & SWAP_META_MASK);
count -= n;
index += n;
}
mtx_unlock(&swhash_mtx);
}
}
/*
* SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
*
* This routine locates and destroys all swap metadata associated with
* an object.
*/
static void
swp_pager_meta_free_all(vm_object_t object)
{
daddr_t index = 0;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
if (object->type != OBJT_SWAP)
return;
while (object->un_pager.swp.swp_bcount) {
struct swblock **pswap;
struct swblock *swap;
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, index);
if ((swap = *pswap) != NULL) {
int i;
for (i = 0; i < SWAP_META_PAGES; ++i) {
daddr_t v = swap->swb_pages[i];
if (v != SWAPBLK_NONE) {
--swap->swb_count;
swp_pager_freeswapspace(v, 1);
}
}
if (swap->swb_count != 0)
panic("swap_pager_meta_free_all: swb_count != 0");
*pswap = swap->swb_hnext;
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
}
mtx_unlock(&swhash_mtx);
index += SWAP_META_PAGES;
}
}
/*
* SWP_PAGER_METACTL() - misc control of swap and vm_page_t meta data.
*
* This routine is capable of looking up, popping, or freeing
* swapblk assignments in the swap meta data or in the vm_page_t.
* The routine typically returns the swapblk being looked-up, or popped,
* or SWAPBLK_NONE if the block was freed, or SWAPBLK_NONE if the block
* was invalid. This routine will automatically free any invalid
* meta-data swapblks.
*
* It is not possible to store invalid swapblks in the swap meta data
* (other than a literal 'SWAPBLK_NONE'), so we don't bother checking.
*
* When acting on a busy resident page and paging is in progress, we
* have to wait until paging is complete but otherwise can act on the
* busy page.
*
* SWM_FREE remove and free swap block from metadata
* SWM_POP remove from meta data but do not free.. pop it out
*/
static daddr_t
swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
{
struct swblock **pswap;
struct swblock *swap;
daddr_t r1;
int idx;
VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
/*
* The meta data only exists if the object is OBJT_SWAP
* and even then might not be allocated yet.
*/
if (object->type != OBJT_SWAP)
return (SWAPBLK_NONE);
r1 = SWAPBLK_NONE;
mtx_lock(&swhash_mtx);
pswap = swp_pager_hash(object, pindex);
if ((swap = *pswap) != NULL) {
idx = pindex & SWAP_META_MASK;
r1 = swap->swb_pages[idx];
if (r1 != SWAPBLK_NONE) {
if (flags & SWM_FREE) {
swp_pager_freeswapspace(r1, 1);
r1 = SWAPBLK_NONE;
}
if (flags & (SWM_FREE|SWM_POP)) {
swap->swb_pages[idx] = SWAPBLK_NONE;
if (--swap->swb_count == 0) {
*pswap = swap->swb_hnext;
uma_zfree(swap_zone, swap);
--object->un_pager.swp.swp_bcount;
}
}
}
}
mtx_unlock(&swhash_mtx);
return (r1);
}
/*
* System call swapon(name) enables swapping on device name,
* which must be in the swdevsw. Return EBUSY
* if already swapping on this device.
*/
#ifndef _SYS_SYSPROTO_H_
struct swapon_args {
char *name;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-swapon(struct thread *td, struct swapon_args *uap)
+sys_swapon(struct thread *td, struct swapon_args *uap)
{
struct vattr attr;
struct vnode *vp;
struct nameidata nd;
int error;
error = priv_check(td, PRIV_SWAPON);
if (error)
return (error);
mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpon", 0);
swdev_syscall_active = 1;
/*
* Swap metadata may not fit in the KVM if we have physical
* memory of >1GB.
*/
if (swap_zone == NULL) {
error = ENOMEM;
goto done;
}
NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->name, td);
error = namei(&nd);
if (error)
goto done;
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if (vn_isdisk(vp, &error)) {
error = swapongeom(td, vp);
} else if (vp->v_type == VREG &&
(vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
(error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
/*
* Allow direct swapping to NFS regular files in the same
* way that nfs_mountroot() sets up diskless swapping.
*/
error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
}
if (error)
vrele(vp);
done:
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
mtx_unlock(&Giant);
return (error);
}
static void
swaponsomething(struct vnode *vp, void *id, u_long nblks, sw_strategy_t *strategy, sw_close_t *close, dev_t dev)
{
struct swdevt *sp, *tsp;
swblk_t dvbase;
u_long mblocks;
/*
* nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
* First chop nblks off to page-align it, then convert.
*
* sw->sw_nblks is in page-sized chunks now too.
*/
nblks &= ~(ctodb(1) - 1);
nblks = dbtoc(nblks);
/*
* If we go beyond this, we get overflows in the radix
* tree bitmap code.
*/
mblocks = 0x40000000 / BLIST_META_RADIX;
if (nblks > mblocks) {
printf(
"WARNING: reducing swap size to maximum of %luMB per unit\n",
mblocks / 1024 / 1024 * PAGE_SIZE);
nblks = mblocks;
}
sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
sp->sw_vp = vp;
sp->sw_id = id;
sp->sw_dev = dev;
sp->sw_flags = 0;
sp->sw_nblks = nblks;
sp->sw_used = 0;
sp->sw_strategy = strategy;
sp->sw_close = close;
sp->sw_blist = blist_create(nblks, M_WAITOK);
/*
* Do not free the first two blocks in order to avoid overwriting
* any BSD label at the front of the partition.
*/
blist_free(sp->sw_blist, 2, nblks - 2);
dvbase = 0;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(tsp, &swtailq, sw_list) {
if (tsp->sw_end >= dvbase) {
/*
* We put one uncovered page between the devices
* in order to definitively prevent any cross-device
* I/O requests
*/
dvbase = tsp->sw_end + 1;
}
}
sp->sw_first = dvbase;
sp->sw_end = dvbase + nblks;
TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
nswapdev++;
swap_pager_avail += nblks;
swap_total += (vm_ooffset_t)nblks * PAGE_SIZE;
swp_sizecheck();
mtx_unlock(&sw_dev_mtx);
}
/*
* SYSCALL: swapoff(devname)
*
* Disable swapping on the given device.
*
* XXX: Badly designed system call: it should use a device index
* rather than filename as specification. We keep sw_vp around
* only to make this work.
*/
#ifndef _SYS_SYSPROTO_H_
struct swapoff_args {
char *name;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-swapoff(struct thread *td, struct swapoff_args *uap)
+sys_swapoff(struct thread *td, struct swapoff_args *uap)
{
struct vnode *vp;
struct nameidata nd;
struct swdevt *sp;
int error;
error = priv_check(td, PRIV_SWAPOFF);
if (error)
return (error);
mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
swdev_syscall_active = 1;
NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
td);
error = namei(&nd);
if (error)
goto done;
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_vp == vp)
break;
}
mtx_unlock(&sw_dev_mtx);
if (sp == NULL) {
error = EINVAL;
goto done;
}
error = swapoff_one(sp, td->td_ucred);
done:
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
mtx_unlock(&Giant);
return (error);
}
static int
swapoff_one(struct swdevt *sp, struct ucred *cred)
{
u_long nblks, dvbase;
#ifdef MAC
int error;
#endif
mtx_assert(&Giant, MA_OWNED);
#ifdef MAC
(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
error = mac_system_check_swapoff(cred, sp->sw_vp);
(void) VOP_UNLOCK(sp->sw_vp, 0);
if (error != 0)
return (error);
#endif
nblks = sp->sw_nblks;
/*
* We can turn off this swap device safely only if the
* available virtual memory in the system will fit the amount
* of data we will have to page back in, plus an epsilon so
* the system doesn't become critically low on swap space.
*/
if (cnt.v_free_count + cnt.v_cache_count + swap_pager_avail <
nblks + nswap_lowat) {
return (ENOMEM);
}
/*
* Prevent further allocations on this device.
*/
mtx_lock(&sw_dev_mtx);
sp->sw_flags |= SW_CLOSING;
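/*
 * Mark every remaining free block on the device as allocated so no
 * new allocations land on it; blist_fill() returns the number of
 * previously free blocks it claimed, which we subtract from the
 * global free count.
 */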
for (dvbase = 0; dvbase < sp->sw_end; dvbase += dmmax) {
swap_pager_avail -= blist_fill(sp->sw_blist,
dvbase, dmmax);
}
swap_total -= (vm_ooffset_t)nblks * PAGE_SIZE;
mtx_unlock(&sw_dev_mtx);
/*
* Page in the contents of the device and close it.
*/
swap_pager_swapoff(sp);
sp->sw_close(curthread, sp);
sp->sw_id = NULL;
mtx_lock(&sw_dev_mtx);
TAILQ_REMOVE(&swtailq, sp, sw_list);
nswapdev--;
if (nswapdev == 0) {
swap_pager_full = 2;
swap_pager_almost_full = 1;
}
if (swdevhd == sp)
swdevhd = NULL;
mtx_unlock(&sw_dev_mtx);
blist_destroy(sp->sw_blist);
free(sp, M_VMPGDATA);
return (0);
}
void
swapoff_all(void)
{
struct swdevt *sp, *spt;
const char *devname;
int error;
mtx_lock(&Giant);
while (swdev_syscall_active)
tsleep(&swdev_syscall_active, PUSER - 1, "swpoff", 0);
swdev_syscall_active = 1;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
mtx_unlock(&sw_dev_mtx);
if (vn_isdisk(sp->sw_vp, NULL))
devname = sp->sw_vp->v_rdev->si_name;
else
devname = "[file]";
error = swapoff_one(sp, thread0.td_ucred);
if (error != 0) {
printf("Cannot remove swap device %s (error=%d), "
"skipping.\n", devname, error);
} else if (bootverbose) {
printf("Swap device %s removed.\n", devname);
}
mtx_lock(&sw_dev_mtx);
}
mtx_unlock(&sw_dev_mtx);
swdev_syscall_active = 0;
wakeup_one(&swdev_syscall_active);
mtx_unlock(&Giant);
}
void
swap_pager_status(int *total, int *used)
{
struct swdevt *sp;
*total = 0;
*used = 0;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
*total += sp->sw_nblks;
*used += sp->sw_used;
}
mtx_unlock(&sw_dev_mtx);
}
int
swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
{
struct swdevt *sp;
char *tmp_devname;
int error, n;
n = 0;
error = ENOENT;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (n != name) {
n++;
continue;
}
xs->xsw_version = XSWDEV_VERSION;
xs->xsw_dev = sp->sw_dev;
xs->xsw_flags = sp->sw_flags;
xs->xsw_nblks = sp->sw_nblks;
xs->xsw_used = sp->sw_used;
if (devname != NULL) {
if (vn_isdisk(sp->sw_vp, NULL))
tmp_devname = sp->sw_vp->v_rdev->si_name;
else
tmp_devname = "[file]";
strncpy(devname, tmp_devname, len);
}
error = 0;
break;
}
mtx_unlock(&sw_dev_mtx);
return (error);
}
static int
sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
{
struct xswdev xs;
int error;
if (arg2 != 1) /* name length */
return (EINVAL);
error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
if (error != 0)
return (error);
error = SYSCTL_OUT(req, &xs, sizeof(xs));
return (error);
}
SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
"Number of swap devices");
SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD, sysctl_vm_swap_info,
"Swap statistics by device");
/*
* vmspace_swap_count() - count the approximate swap usage in pages for a
* vmspace.
*
* The map must be locked.
*
* Swap usage is determined by taking the proportional swap used by
* VM objects backing the VM map. To make up for fractional losses,
* if the VM object has any swap use at all the associated map entries
* count for at least 1 swap page.
*/
long
vmspace_swap_count(struct vmspace *vmspace)
{
vm_map_t map;
vm_map_entry_t cur;
vm_object_t object;
long count, n;
map = &vmspace->vm_map;
count = 0;
for (cur = map->header.next; cur != &map->header; cur = cur->next) {
if ((cur->eflags & MAP_ENTRY_IS_SUB_MAP) == 0 &&
(object = cur->object.vm_object) != NULL) {
VM_OBJECT_LOCK(object);
if (object->type == OBJT_SWAP &&
object->un_pager.swp.swp_bcount != 0) {
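/*
 * Charge this entry its proportional share of the object's swap:
 * swp_bcount chunks of SWAP_META_PAGES pages, scaled by the
 * fraction of the object this entry maps, plus one page to
 * absorb rounding.
 */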
n = (cur->end - cur->start) / PAGE_SIZE;
count += object->un_pager.swp.swp_bcount *
SWAP_META_PAGES * n / object->size + 1;
}
VM_OBJECT_UNLOCK(object);
}
}
return (count);
}
/*
* GEOM backend
*
* Swapping onto disk devices.
*
*/
static g_orphan_t swapgeom_orphan;
static struct g_class g_swap_class = {
.name = "SWAP",
.version = G_VERSION,
.orphan = swapgeom_orphan,
};
DECLARE_GEOM_CLASS(g_swap_class, g_class);
static void
swapgeom_done(struct bio *bp2)
{
struct buf *bp;
bp = bp2->bio_caller2;
bp->b_ioflags = bp2->bio_flags;
if (bp2->bio_error)
bp->b_ioflags |= BIO_ERROR;
bp->b_resid = bp->b_bcount - bp2->bio_completed;
bp->b_error = bp2->bio_error;
bufdone(bp);
g_destroy_bio(bp2);
}
static void
swapgeom_strategy(struct buf *bp, struct swdevt *sp)
{
struct bio *bio;
struct g_consumer *cp;
cp = sp->sw_id;
if (cp == NULL) {
bp->b_error = ENXIO;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
return;
}
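/*
 * g_new_bio() uses M_NOWAIT and may fail, while g_alloc_bio() sleeps
 * until memory is available; presumably a failed pageout is
 * recoverable (the pages stay dirty and are retried), so only writes
 * take the fallible path.
 */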
if (bp->b_iocmd == BIO_WRITE)
bio = g_new_bio();
else
bio = g_alloc_bio();
if (bio == NULL) {
bp->b_error = ENOMEM;
bp->b_ioflags |= BIO_ERROR;
bufdone(bp);
return;
}
bio->bio_caller2 = bp;
bio->bio_cmd = bp->b_iocmd;
bio->bio_data = bp->b_data;
bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
bio->bio_length = bp->b_bcount;
bio->bio_done = swapgeom_done;
g_io_request(bio, cp);
return;
}
static void
swapgeom_orphan(struct g_consumer *cp)
{
struct swdevt *sp;
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list)
if (sp->sw_id == cp)
sp->sw_id = NULL;
mtx_unlock(&sw_dev_mtx);
}
static void
swapgeom_close_ev(void *arg, int flags)
{
struct g_consumer *cp;
cp = arg;
g_access(cp, -1, -1, 0);
g_detach(cp);
g_destroy_consumer(cp);
}
static void
swapgeom_close(struct thread *td, struct swdevt *sw)
{
/* XXX: direct call when Giant untangled */
g_waitfor_event(swapgeom_close_ev, sw->sw_id, M_WAITOK, NULL);
}
struct swh0h0 {
struct cdev *dev;
struct vnode *vp;
int error;
};
static void
swapongeom_ev(void *arg, int flags)
{
struct swh0h0 *swh;
struct g_provider *pp;
struct g_consumer *cp;
static struct g_geom *gp;
struct swdevt *sp;
u_long nblks;
int error;
swh = arg;
swh->error = 0;
pp = g_dev_getprovider(swh->dev);
if (pp == NULL) {
swh->error = ENODEV;
return;
}
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
cp = sp->sw_id;
if (cp != NULL && cp->provider == pp) {
mtx_unlock(&sw_dev_mtx);
swh->error = EBUSY;
return;
}
}
mtx_unlock(&sw_dev_mtx);
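/*
 * All swap devices share a single, lazily created "swap" geom; each
 * device gets its own consumer attached below.
 */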
if (gp == NULL)
gp = g_new_geomf(&g_swap_class, "swap", NULL);
cp = g_new_consumer(gp);
g_attach(cp, pp);
/*
* XXX: Every time you think you can improve the margin for
* footshooting, somebody depends on the ability to do so:
* savecore(8) wants to write to our swapdev so we cannot
* set an exclusive count :-(
*/
error = g_access(cp, 1, 1, 0);
if (error) {
g_detach(cp);
g_destroy_consumer(cp);
swh->error = error;
return;
}
nblks = pp->mediasize / DEV_BSIZE;
swaponsomething(swh->vp, cp, nblks, swapgeom_strategy,
swapgeom_close, dev2udev(swh->dev));
swh->error = 0;
return;
}
static int
swapongeom(struct thread *td, struct vnode *vp)
{
int error;
struct swh0h0 swh;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
swh.dev = vp->v_rdev;
swh.vp = vp;
swh.error = 0;
/* XXX: direct call when Giant untangled */
error = g_waitfor_event(swapongeom_ev, &swh, M_WAITOK, NULL);
if (!error)
error = swh.error;
VOP_UNLOCK(vp, 0);
return (error);
}
/*
* VNODE backend
*
* This is used mainly for network filesystem (read: probably only tested
* with NFS) swapfiles.
*
*/
static void
swapdev_strategy(struct buf *bp, struct swdevt *sp)
{
struct vnode *vp2;
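/*
 * Make the block number device-relative and convert it from
 * page-sized units to DEV_BSIZE units for the vnode I/O below.
 */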
bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
vp2 = sp->sw_id;
vhold(vp2);
if (bp->b_iocmd == BIO_WRITE) {
if (bp->b_bufobj)
bufobj_wdrop(bp->b_bufobj);
bufobj_wref(&vp2->v_bufobj);
}
if (bp->b_bufobj != &vp2->v_bufobj)
bp->b_bufobj = &vp2->v_bufobj;
bp->b_vp = vp2;
bp->b_iooffset = dbtob(bp->b_blkno);
bstrategy(bp);
return;
}
static void
swapdev_close(struct thread *td, struct swdevt *sp)
{
VOP_CLOSE(sp->sw_vp, FREAD | FWRITE, td->td_ucred, td);
vrele(sp->sw_vp);
}
static int
swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
{
struct swdevt *sp;
int error;
if (nblks == 0)
return (ENXIO);
mtx_lock(&sw_dev_mtx);
TAILQ_FOREACH(sp, &swtailq, sw_list) {
if (sp->sw_id == vp) {
mtx_unlock(&sw_dev_mtx);
return (EBUSY);
}
}
mtx_unlock(&sw_dev_mtx);
(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_system_check_swapon(td->td_ucred, vp);
if (error == 0)
#endif
error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
(void) VOP_UNLOCK(vp, 0);
if (error)
return (error);
swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
NODEV);
return (0);
}
Index: head/sys/vm/vm_mmap.c
===================================================================
--- head/sys/vm/vm_mmap.c (revision 225616)
+++ head/sys/vm/vm_mmap.c (revision 225617)
@@ -1,1574 +1,1574 @@
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
*
* @(#)vm_mmap.c 8.4 (Berkeley) 1/12/94
*/
/*
* Mapped file (mmap) interface to VM
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include "opt_compat.h"
#include "opt_hwpmc_hooks.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/capability.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/sysent.h>
#include <sys/vmmeter.h>
#include <security/mac/mac_framework.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>
#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif
#ifndef _SYS_SYSPROTO_H_
struct sbrk_args {
int incr;
};
#endif
static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
int *, struct shmfd *, vm_ooffset_t, vm_object_t *);
/*
* MPSAFE
*/
/* ARGSUSED */
int
-sbrk(td, uap)
+sys_sbrk(td, uap)
struct thread *td;
struct sbrk_args *uap;
{
/* Not yet implemented */
return (EOPNOTSUPP);
}
#ifndef _SYS_SYSPROTO_H_
struct sstk_args {
int incr;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-sstk(td, uap)
+sys_sstk(td, uap)
struct thread *td;
struct sstk_args *uap;
{
/* Not yet implemented */
return (EOPNOTSUPP);
}
#if defined(COMPAT_43)
#ifndef _SYS_SYSPROTO_H_
struct getpagesize_args {
int dummy;
};
#endif
/* ARGSUSED */
int
ogetpagesize(td, uap)
struct thread *td;
struct getpagesize_args *uap;
{
/* MP SAFE */
td->td_retval[0] = PAGE_SIZE;
return (0);
}
#endif /* COMPAT_43 */
/*
* Memory Map (mmap) system call. Note that the file offset
* and address are allowed to be NOT page aligned, though if
* the MAP_FIXED flag is set, both must have the same remainder
* modulo the PAGE_SIZE (POSIX 1003.1b). If the address is not
* page-aligned, the actual mapping starts at trunc_page(addr)
* and the return value is adjusted up by the page offset.
*
* Generally speaking, only character devices which are themselves
* memory-based, such as a video framebuffer, can be mmap'd. Otherwise
* there would be no cache coherency between a descriptor and a VM mapping
* both to the same character device.
*/
#ifndef _SYS_SYSPROTO_H_
struct mmap_args {
void *addr;
size_t len;
int prot;
int flags;
int fd;
long pad;
off_t pos;
};
#endif
/*
* MPSAFE
*/
int
-mmap(td, uap)
+sys_mmap(td, uap)
struct thread *td;
struct mmap_args *uap;
{
#ifdef HWPMC_HOOKS
struct pmckern_map_in pkm;
#endif
struct file *fp;
struct vnode *vp;
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t cap_maxprot, prot, maxprot;
void *handle;
objtype_t handle_type;
int flags, error;
off_t pos;
struct vmspace *vms = td->td_proc->p_vmspace;
cap_rights_t rights;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
flags = uap->flags;
pos = uap->pos;
fp = NULL;
/* Make sure mapping fits into numeric range, etc. */
if ((uap->len == 0 && !SV_CURPROC_FLAG(SV_AOUT) &&
curproc->p_osrel >= P_OSREL_MAP_ANON) ||
((flags & MAP_ANON) && (uap->fd != -1 || pos != 0)))
return (EINVAL);
if (flags & MAP_STACK) {
if ((uap->fd != -1) ||
((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
return (EINVAL);
flags |= MAP_ANON;
pos = 0;
}
/*
* Align the file position to a page boundary,
* and save its page offset component.
*/
pageoff = (pos & PAGE_MASK);
pos -= pageoff;
/* Adjust size for rounding (on both ends). */
size += pageoff; /* low end... */
size = (vm_size_t) round_page(size); /* hi end */
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
if (flags & MAP_FIXED) {
/*
* The specified address must have the same remainder
* as the file offset taken modulo PAGE_SIZE, so it
* should be aligned after adjustment by pageoff.
*/
addr -= pageoff;
if (addr & PAGE_MASK)
return (EINVAL);
/* Address range must be all in user VM space. */
if (addr < vm_map_min(&vms->vm_map) ||
addr + size > vm_map_max(&vms->vm_map))
return (EINVAL);
if (addr + size < addr)
return (EINVAL);
} else {
/*
* XXX for non-fixed mappings where no hint is provided or
* the hint would fall in the potential heap space,
* place it after the end of the largest possible heap.
*
* There should really be a pmap call to determine a reasonable
* location.
*/
PROC_LOCK(td->td_proc);
if (addr == 0 ||
(addr >= round_page((vm_offset_t)vms->vm_taddr) &&
addr < round_page((vm_offset_t)vms->vm_daddr +
lim_max(td->td_proc, RLIMIT_DATA))))
addr = round_page((vm_offset_t)vms->vm_daddr +
lim_max(td->td_proc, RLIMIT_DATA));
PROC_UNLOCK(td->td_proc);
}
if (flags & MAP_ANON) {
/*
* Mapping blank space is trivial.
*/
handle = NULL;
handle_type = OBJT_DEFAULT;
maxprot = VM_PROT_ALL;
cap_maxprot = VM_PROT_ALL;
} else {
/*
* Mapping file, get fp for validation and don't let the
* descriptor disappear on us if we block. Check capability
* rights, but also return the maximum rights to be combined
* with maxprot later.
*/
rights = CAP_MMAP;
if (prot & PROT_READ)
rights |= CAP_READ;
if ((flags & MAP_SHARED) != 0) {
if (prot & PROT_WRITE)
rights |= CAP_WRITE;
}
if (prot & PROT_EXEC)
rights |= CAP_MAPEXEC;
if ((error = fget_mmap(td, uap->fd, rights, &cap_maxprot,
&fp)) != 0)
goto done;
if (fp->f_type == DTYPE_SHM) {
handle = fp->f_data;
handle_type = OBJT_SWAP;
maxprot = VM_PROT_NONE;
/* FREAD should always be set. */
if (fp->f_flag & FREAD)
maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
if (fp->f_flag & FWRITE)
maxprot |= VM_PROT_WRITE;
goto map;
}
if (fp->f_type != DTYPE_VNODE) {
error = ENODEV;
goto done;
}
#if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
/*
* POSIX shared-memory objects are defined to have
* kernel persistence, and are not defined to support
* read(2)/write(2) -- or even open(2). Thus, we can
* use MAP_ASYNC to trade on-disk coherence for speed.
* The shm_open(3) library routine turns on the FPOSIXSHM
* flag to request this behavior.
*/
if (fp->f_flag & FPOSIXSHM)
flags |= MAP_NOSYNC;
#endif
vp = fp->f_vnode;
/*
* Ensure that file and memory protections are
* compatible. Note that we only worry about
* writability if mapping is shared; in this case,
* current and max prot are dictated by the open file.
* XXX use the vnode instead? Problem is: what
* credentials do we use for determination? What if
* proc does a setuid?
*/
if (vp->v_mount != NULL && vp->v_mount->mnt_flag & MNT_NOEXEC)
maxprot = VM_PROT_NONE;
else
maxprot = VM_PROT_EXECUTE;
if (fp->f_flag & FREAD) {
maxprot |= VM_PROT_READ;
} else if (prot & PROT_READ) {
error = EACCES;
goto done;
}
/*
* If we are sharing potential changes (either via
* MAP_SHARED or via the implicit sharing of character
* device mappings), and we are trying to get write
* permission although we opened it without asking
* for it, bail out.
*/
if ((flags & MAP_SHARED) != 0) {
if ((fp->f_flag & FWRITE) != 0) {
maxprot |= VM_PROT_WRITE;
} else if ((prot & PROT_WRITE) != 0) {
error = EACCES;
goto done;
}
} else if (vp->v_type != VCHR || (fp->f_flag & FWRITE) != 0) {
maxprot |= VM_PROT_WRITE;
cap_maxprot |= VM_PROT_WRITE;
}
handle = (void *)vp;
handle_type = OBJT_VNODE;
}
map:
td->td_fpop = fp;
maxprot &= cap_maxprot;
error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
flags, handle_type, handle, pos);
td->td_fpop = NULL;
#ifdef HWPMC_HOOKS
/* inform hwpmc(4) if an executable is being mapped */
if (error == 0 && handle_type == OBJT_VNODE &&
(prot & PROT_EXEC)) {
pkm.pm_file = handle;
pkm.pm_address = (uintptr_t) addr;
PMC_CALL_HOOK(td, PMC_FN_MMAP, (void *) &pkm);
}
#endif
if (error == 0)
td->td_retval[0] = (register_t) (addr + pageoff);
done:
if (fp)
fdrop(fp, td);
return (error);
}
int
freebsd6_mmap(struct thread *td, struct freebsd6_mmap_args *uap)
{
struct mmap_args oargs;
oargs.addr = uap->addr;
oargs.len = uap->len;
oargs.prot = uap->prot;
oargs.flags = uap->flags;
oargs.fd = uap->fd;
oargs.pos = uap->pos;
- return (mmap(td, &oargs));
+ return (sys_mmap(td, &oargs));
}
#ifdef COMPAT_43
#ifndef _SYS_SYSPROTO_H_
struct ommap_args {
caddr_t addr;
int len;
int prot;
int flags;
int fd;
long pos;
};
#endif
int
ommap(td, uap)
struct thread *td;
struct ommap_args *uap;
{
struct mmap_args nargs;
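/*
 * Translate the historic 3-bit 4.3BSD protection value into modern
 * PROT_* flags; the table is indexed by the old prot argument.
 */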
static const char cvtbsdprot[8] = {
0,
PROT_EXEC,
PROT_WRITE,
PROT_EXEC | PROT_WRITE,
PROT_READ,
PROT_EXEC | PROT_READ,
PROT_WRITE | PROT_READ,
PROT_EXEC | PROT_WRITE | PROT_READ,
};
#define OMAP_ANON 0x0002
#define OMAP_COPY 0x0020
#define OMAP_SHARED 0x0010
#define OMAP_FIXED 0x0100
nargs.addr = uap->addr;
nargs.len = uap->len;
nargs.prot = cvtbsdprot[uap->prot & 0x7];
nargs.flags = 0;
if (uap->flags & OMAP_ANON)
nargs.flags |= MAP_ANON;
if (uap->flags & OMAP_COPY)
nargs.flags |= MAP_COPY;
if (uap->flags & OMAP_SHARED)
nargs.flags |= MAP_SHARED;
else
nargs.flags |= MAP_PRIVATE;
if (uap->flags & OMAP_FIXED)
nargs.flags |= MAP_FIXED;
nargs.fd = uap->fd;
nargs.pos = uap->pos;
- return (mmap(td, &nargs));
+ return (sys_mmap(td, &nargs));
}
#endif /* COMPAT_43 */
#ifndef _SYS_SYSPROTO_H_
struct msync_args {
void *addr;
size_t len;
int flags;
};
#endif
/*
* MPSAFE
*/
int
-msync(td, uap)
+sys_msync(td, uap)
struct thread *td;
struct msync_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
int flags;
vm_map_t map;
int rv;
addr = (vm_offset_t) uap->addr;
size = uap->len;
flags = uap->flags;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
return (EINVAL);
map = &td->td_proc->p_vmspace->vm_map;
/*
* Clean the pages and interpret the return value.
*/
rv = vm_map_sync(map, addr, addr + size, (flags & MS_ASYNC) == 0,
(flags & MS_INVALIDATE) != 0);
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
return (EINVAL); /* Sun returns ENOMEM? */
case KERN_INVALID_ARGUMENT:
return (EBUSY);
default:
return (EINVAL);
}
}
#ifndef _SYS_SYSPROTO_H_
struct munmap_args {
void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
-munmap(td, uap)
+sys_munmap(td, uap)
struct thread *td;
struct munmap_args *uap;
{
#ifdef HWPMC_HOOKS
struct pmckern_map_out pkm;
vm_map_entry_t entry;
#endif
vm_offset_t addr;
vm_size_t size, pageoff;
vm_map_t map;
addr = (vm_offset_t) uap->addr;
size = uap->len;
if (size == 0)
return (EINVAL);
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap...
*/
map = &td->td_proc->p_vmspace->vm_map;
if (addr < vm_map_min(map) || addr + size > vm_map_max(map))
return (EINVAL);
vm_map_lock(map);
#ifdef HWPMC_HOOKS
/*
* Inform hwpmc if the address range being unmapped contains
* an executable region.
*/
pkm.pm_address = (uintptr_t) NULL;
if (vm_map_lookup_entry(map, addr, &entry)) {
for (;
entry != &map->header && entry->start < addr + size;
entry = entry->next) {
if (vm_map_check_protection(map, entry->start,
entry->end, VM_PROT_EXECUTE) == TRUE) {
pkm.pm_address = (uintptr_t) addr;
pkm.pm_size = (size_t) size;
break;
}
}
}
#endif
vm_map_delete(map, addr, addr + size);
#ifdef HWPMC_HOOKS
/* downgrade the lock to prevent a LOR with the pmc-sx lock */
vm_map_lock_downgrade(map);
if (pkm.pm_address != (uintptr_t) NULL)
PMC_CALL_HOOK(td, PMC_FN_MUNMAP, (void *) &pkm);
vm_map_unlock_read(map);
#else
vm_map_unlock(map);
#endif
/* vm_map_delete returns nothing but KERN_SUCCESS anyway */
return (0);
}
#ifndef _SYS_SYSPROTO_H_
struct mprotect_args {
const void *addr;
size_t len;
int prot;
};
#endif
/*
* MPSAFE
*/
int
-mprotect(td, uap)
+sys_mprotect(td, uap)
struct thread *td;
struct mprotect_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_prot_t prot;
addr = (vm_offset_t) uap->addr;
size = uap->len;
prot = uap->prot & VM_PROT_ALL;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_protect(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, prot, FALSE)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
case KERN_RESOURCE_SHORTAGE:
return (ENOMEM);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct minherit_args {
void *addr;
size_t len;
int inherit;
};
#endif
/*
* MPSAFE
*/
int
-minherit(td, uap)
+sys_minherit(td, uap)
struct thread *td;
struct minherit_args *uap;
{
vm_offset_t addr;
vm_size_t size, pageoff;
vm_inherit_t inherit;
addr = (vm_offset_t)uap->addr;
size = uap->len;
inherit = uap->inherit;
pageoff = (addr & PAGE_MASK);
addr -= pageoff;
size += pageoff;
size = (vm_size_t) round_page(size);
if (addr + size < addr)
return (EINVAL);
switch (vm_map_inherit(&td->td_proc->p_vmspace->vm_map, addr,
addr + size, inherit)) {
case KERN_SUCCESS:
return (0);
case KERN_PROTECTION_FAILURE:
return (EACCES);
}
return (EINVAL);
}
#ifndef _SYS_SYSPROTO_H_
struct madvise_args {
void *addr;
size_t len;
int behav;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-madvise(td, uap)
+sys_madvise(td, uap)
struct thread *td;
struct madvise_args *uap;
{
vm_offset_t start, end;
vm_map_t map;
struct proc *p;
int error;
/*
* Check for our special case, advising the swap pager we are
* "immortal."
*/
if (uap->behav == MADV_PROTECT) {
error = priv_check(td, PRIV_VM_MADV_PROTECT);
if (error == 0) {
p = td->td_proc;
PROC_LOCK(p);
p->p_flag |= P_PROTECTED;
PROC_UNLOCK(p);
}
return (error);
}
/*
* Check for illegal behavior
*/
if (uap->behav < 0 || uap->behav > MADV_CORE)
return (EINVAL);
/*
* Check for illegal addresses. Watch out for address wrap... Note
* that VM_*_ADDRESS are not constants due to casts (argh).
*/
map = &td->td_proc->p_vmspace->vm_map;
if ((vm_offset_t)uap->addr < vm_map_min(map) ||
(vm_offset_t)uap->addr + uap->len > vm_map_max(map))
return (EINVAL);
if (((vm_offset_t) uap->addr + uap->len) < (vm_offset_t) uap->addr)
return (EINVAL);
/*
* Since this routine is only advisory, we default to conservative
* behavior.
*/
start = trunc_page((vm_offset_t) uap->addr);
end = round_page((vm_offset_t) uap->addr + uap->len);
if (vm_map_madvise(map, start, end, uap->behav))
return (EINVAL);
return (0);
}
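/*
 * Illustrative sketch, not part of the original change: ordinary
 * advice is passed straight to vm_map_madvise(), e.g.
 *
 *	(void)madvise(p, len, MADV_DONTNEED);
 *
 * MADV_PROTECT never reaches vm_map_madvise(): it requires the
 * PRIV_VM_MADV_PROTECT privilege and only sets P_PROTECTED on the
 * calling process, which exempts it from the out-of-swap process
 * killer.
 */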
#ifndef _SYS_SYSPROTO_H_
struct mincore_args {
const void *addr;
size_t len;
char *vec;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-mincore(td, uap)
+sys_mincore(td, uap)
struct thread *td;
struct mincore_args *uap;
{
vm_offset_t addr, first_addr;
vm_offset_t end, cend;
pmap_t pmap;
vm_map_t map;
char *vec;
int error = 0;
int vecindex, lastvecindex;
vm_map_entry_t current;
vm_map_entry_t entry;
vm_object_t object;
vm_paddr_t locked_pa;
vm_page_t m;
vm_pindex_t pindex;
int mincoreinfo;
unsigned int timestamp;
boolean_t locked;
/*
* Make sure that the addresses presented are valid for user
* mode.
*/
first_addr = addr = trunc_page((vm_offset_t) uap->addr);
end = addr + (vm_size_t)round_page(uap->len);
map = &td->td_proc->p_vmspace->vm_map;
if (end > vm_map_max(map) || end < addr)
return (ENOMEM);
/*
* Address of byte vector
*/
vec = uap->vec;
pmap = vmspace_pmap(td->td_proc->p_vmspace);
vm_map_lock_read(map);
RestartScan:
timestamp = map->timestamp;
if (!vm_map_lookup_entry(map, addr, &entry)) {
vm_map_unlock_read(map);
return (ENOMEM);
}
/*
* Do this on a map entry basis so that if the pages are not
* in the current process's address space, we can easily look
* up the pages elsewhere.
*/
lastvecindex = -1;
for (current = entry;
(current != &map->header) && (current->start < end);
current = current->next) {
/*
* check for contiguity
*/
if (current->end < end &&
(entry->next == &map->header ||
current->next->start > current->end)) {
vm_map_unlock_read(map);
return (ENOMEM);
}
/*
* ignore submaps (for now) or null objects
*/
if ((current->eflags & MAP_ENTRY_IS_SUB_MAP) ||
current->object.vm_object == NULL)
continue;
/*
* limit this scan to the current map entry and the
* limits for the mincore call
*/
if (addr < current->start)
addr = current->start;
cend = current->end;
if (cend > end)
cend = end;
/*
* scan this entry one page at a time
*/
while (addr < cend) {
/*
* Check pmap first, it is likely faster, also
* it can provide info as to whether we are the
* one referencing or modifying the page.
*/
object = NULL;
locked_pa = 0;
retry:
m = NULL;
mincoreinfo = pmap_mincore(pmap, addr, &locked_pa);
if (locked_pa != 0) {
/*
* The page is mapped by this process but not
* both accessed and modified. It is also
* managed. Acquire the object lock so that
* other mappings might be examined.
*/
m = PHYS_TO_VM_PAGE(locked_pa);
if (m->object != object) {
if (object != NULL)
VM_OBJECT_UNLOCK(object);
object = m->object;
locked = VM_OBJECT_TRYLOCK(object);
vm_page_unlock(m);
if (!locked) {
VM_OBJECT_LOCK(object);
vm_page_lock(m);
goto retry;
}
} else
vm_page_unlock(m);
KASSERT(m->valid == VM_PAGE_BITS_ALL,
("mincore: page %p is mapped but invalid",
m));
} else if (mincoreinfo == 0) {
/*
* The page is not mapped by this process. If
* the object implements managed pages, then
* determine if the page is resident so that
* the mappings might be examined.
*/
if (current->object.vm_object != object) {
if (object != NULL)
VM_OBJECT_UNLOCK(object);
object = current->object.vm_object;
VM_OBJECT_LOCK(object);
}
if (object->type == OBJT_DEFAULT ||
object->type == OBJT_SWAP ||
object->type == OBJT_VNODE) {
pindex = OFF_TO_IDX(current->offset +
(addr - current->start));
m = vm_page_lookup(object, pindex);
if (m != NULL && m->valid == 0)
m = NULL;
if (m != NULL)
mincoreinfo = MINCORE_INCORE;
}
}
if (m != NULL) {
/* Examine other mappings to the page. */
if (m->dirty == 0 && pmap_is_modified(m))
vm_page_dirty(m);
if (m->dirty != 0)
mincoreinfo |= MINCORE_MODIFIED_OTHER;
/*
* The first test for PGA_REFERENCED is an
* optimization. The second test is
* required because a concurrent pmap
* operation could clear the last reference
* and set PGA_REFERENCED before the call to
* pmap_is_referenced().
*/
if ((m->aflags & PGA_REFERENCED) != 0 ||
pmap_is_referenced(m) ||
(m->aflags & PGA_REFERENCED) != 0)
mincoreinfo |= MINCORE_REFERENCED_OTHER;
}
if (object != NULL)
VM_OBJECT_UNLOCK(object);
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* calculate index into user supplied byte vector
*/
vecindex = OFF_TO_IDX(addr - first_addr);
/*
* If we have skipped map entries, we need to make sure that
* the byte vector is zeroed for those skipped entries.
*/
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* Pass the page information to the user
*/
error = subyte(vec + vecindex, mincoreinfo);
if (error) {
error = EFAULT;
goto done2;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
lastvecindex = vecindex;
addr += PAGE_SIZE;
}
}
/*
* subyte may page fault. In case it needs to modify
* the map, we release the lock.
*/
vm_map_unlock_read(map);
/*
* Zero the last entries in the byte vector.
*/
vecindex = OFF_TO_IDX(end - first_addr);
while ((lastvecindex + 1) < vecindex) {
error = subyte(vec + lastvecindex, 0);
if (error) {
error = EFAULT;
goto done2;
}
++lastvecindex;
}
/*
* If the map has changed, due to the subyte, the previous
* output may be invalid.
*/
vm_map_lock_read(map);
if (timestamp != map->timestamp)
goto RestartScan;
vm_map_unlock_read(map);
done2:
return (error);
}
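/*
 * Illustrative sketch, not part of the original change: the vector
 * filled in with subyte() above holds one status byte per page, so a
 * caller sizes it accordingly:
 *
 *	size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
 *	char *vec = malloc(npages);
 *
 *	if (mincore(p, len, vec) == 0 && (vec[0] & MINCORE_INCORE))
 *		printf("first page is resident\n");
 */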
#ifndef _SYS_SYSPROTO_H_
struct mlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
-mlock(td, uap)
+sys_mlock(td, uap)
struct thread *td;
struct mlock_args *uap;
{
struct proc *proc;
vm_offset_t addr, end, last, start;
vm_size_t npages, size;
unsigned long nsize;
int error;
error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
addr = (vm_offset_t)uap->addr;
size = uap->len;
last = addr + size;
start = trunc_page(addr);
end = round_page(last);
if (last < addr || end < addr)
return (EINVAL);
npages = atop(end - start);
if (npages > vm_page_max_wired)
return (ENOMEM);
proc = td->td_proc;
PROC_LOCK(proc);
nsize = ptoa(npages +
pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map)));
if (nsize > lim_cur(proc, RLIMIT_MEMLOCK)) {
PROC_UNLOCK(proc);
return (ENOMEM);
}
PROC_UNLOCK(proc);
if (npages + cnt.v_wire_count > vm_page_max_wired)
return (EAGAIN);
#ifdef RACCT
PROC_LOCK(proc);
error = racct_set(proc, RACCT_MEMLOCK, nsize);
PROC_UNLOCK(proc);
if (error != 0)
return (ENOMEM);
#endif
error = vm_map_wire(&proc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
if (error != KERN_SUCCESS) {
PROC_LOCK(proc);
racct_set(proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(vm_map_pmap(&proc->p_vmspace->vm_map))));
PROC_UNLOCK(proc);
}
#endif
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
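/*
 * Illustrative sketch, not part of the original change: a caller
 * wiring a buffer for predictable latency does
 *
 *	if (mlock(buf, buflen) == -1)
 *		err(1, "mlock");
 *
 * The request must pass the PRIV_VM_MLOCK check above, stay within
 * RLIMIT_MEMLOCK and vm_page_max_wired, and is rounded out to whole
 * pages before vm_map_wire() is called.
 */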
#ifndef _SYS_SYSPROTO_H_
struct mlockall_args {
int how;
};
#endif
/*
* MPSAFE
*/
int
-mlockall(td, uap)
+sys_mlockall(td, uap)
struct thread *td;
struct mlockall_args *uap;
{
vm_map_t map;
int error;
map = &td->td_proc->p_vmspace->vm_map;
error = 0;
if ((uap->how == 0) || ((uap->how & ~(MCL_CURRENT|MCL_FUTURE)) != 0))
return (EINVAL);
#if 0
/*
* If wiring all pages in the process would cause it to exceed
* a hard resource limit, return ENOMEM.
*/
PROC_LOCK(td->td_proc);
if (map->size > lim_cur(td->td_proc, RLIMIT_MEMLOCK)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
PROC_UNLOCK(td->td_proc);
#else
error = priv_check(td, PRIV_VM_MLOCK);
if (error)
return (error);
#endif
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_set(td->td_proc, RACCT_MEMLOCK, map->size);
PROC_UNLOCK(td->td_proc);
if (error != 0)
return (ENOMEM);
#endif
if (uap->how & MCL_FUTURE) {
vm_map_lock(map);
vm_map_modflags(map, MAP_WIREFUTURE, 0);
vm_map_unlock(map);
error = 0;
}
if (uap->how & MCL_CURRENT) {
/*
* P1003.1-2001 mandates that all currently mapped pages
* will be memory resident and locked (wired) upon return
* from mlockall(). vm_map_wire() will wire pages, by
* calling vm_fault_wire() for each page in the region.
*/
error = vm_map_wire(map, vm_map_min(map), vm_map_max(map),
VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
error = (error == KERN_SUCCESS ? 0 : EAGAIN);
}
#ifdef RACCT
if (error != KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_MEMLOCK,
ptoa(pmap_wired_count(vm_map_pmap(&td->td_proc->p_vmspace->vm_map))));
PROC_UNLOCK(td->td_proc);
}
#endif
return (error);
}
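/*
 * Illustrative sketch, not part of the original change: a realtime
 * process usually wires everything it has and everything it will map,
 *
 *	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
 *		err(1, "mlockall");
 *
 * MCL_FUTURE only sets MAP_WIREFUTURE on the map here; the wiring of
 * later mappings is performed by vm_mmap() when it sees that flag.
 */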
#ifndef _SYS_SYSPROTO_H_
struct munlockall_args {
register_t dummy;
};
#endif
/*
* MPSAFE
*/
int
-munlockall(td, uap)
+sys_munlockall(td, uap)
struct thread *td;
struct munlockall_args *uap;
{
vm_map_t map;
int error;
map = &td->td_proc->p_vmspace->vm_map;
error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
/* Clear the MAP_WIREFUTURE flag from this vm_map. */
vm_map_lock(map);
vm_map_modflags(map, 0, MAP_WIREFUTURE);
vm_map_unlock(map);
/* Forcibly unwire all pages. */
error = vm_map_unwire(map, vm_map_min(map), vm_map_max(map),
VM_MAP_WIRE_USER|VM_MAP_WIRE_HOLESOK);
#ifdef RACCT
if (error == KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_set(td->td_proc, RACCT_MEMLOCK, 0);
PROC_UNLOCK(td->td_proc);
}
#endif
return (error);
}
#ifndef _SYS_SYSPROTO_H_
struct munlock_args {
const void *addr;
size_t len;
};
#endif
/*
* MPSAFE
*/
int
-munlock(td, uap)
+sys_munlock(td, uap)
struct thread *td;
struct munlock_args *uap;
{
vm_offset_t addr, end, last, start;
vm_size_t size;
int error;
error = priv_check(td, PRIV_VM_MUNLOCK);
if (error)
return (error);
addr = (vm_offset_t)uap->addr;
size = uap->len;
last = addr + size;
start = trunc_page(addr);
end = round_page(last);
if (last < addr || end < addr)
return (EINVAL);
error = vm_map_unwire(&td->td_proc->p_vmspace->vm_map, start, end,
VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
#ifdef RACCT
if (error == KERN_SUCCESS) {
PROC_LOCK(td->td_proc);
racct_sub(td->td_proc, RACCT_MEMLOCK, ptoa(end - start));
PROC_UNLOCK(td->td_proc);
}
#endif
return (error == KERN_SUCCESS ? 0 : ENOMEM);
}
/*
* vm_mmap_vnode()
*
* MPSAFE
*
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on vnodes.
*/
int
vm_mmap_vnode(struct thread *td, vm_size_t objsize,
vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
{
struct vattr va;
vm_object_t obj;
vm_offset_t foff;
struct mount *mp;
struct ucred *cred;
int error, flags;
int vfslocked;
mp = vp->v_mount;
cred = td->td_ucred;
vfslocked = VFS_LOCK_GIANT(mp);
if ((error = vget(vp, LK_SHARED, td)) != 0) {
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
foff = *foffp;
flags = *flagsp;
obj = vp->v_object;
if (vp->v_type == VREG) {
/*
* Get the proper underlying object
*/
if (obj == NULL) {
error = EINVAL;
goto done;
}
if (obj->handle != vp) {
vput(vp);
vp = (struct vnode*)obj->handle;
vget(vp, LK_SHARED, td);
}
} else if (vp->v_type == VCHR) {
error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
vp->v_rdev, foffp, objp);
if (error == 0)
goto mark_atime;
goto done;
} else {
error = EINVAL;
goto done;
}
if ((error = VOP_GETATTR(vp, &va, cred)))
goto done;
#ifdef MAC
error = mac_vnode_check_mmap(cred, vp, prot, flags);
if (error != 0)
goto done;
#endif
if ((flags & MAP_SHARED) != 0) {
if ((va.va_flags & (SF_SNAPSHOT|IMMUTABLE|APPEND)) != 0) {
if (prot & PROT_WRITE) {
error = EPERM;
goto done;
}
*maxprotp &= ~VM_PROT_WRITE;
}
}
/*
* If it is a regular file without any references
* we do not need to sync it.
* Adjust object size to be the size of the actual file.
*/
objsize = round_page(va.va_size);
if (va.va_nlink == 0)
flags |= MAP_NOSYNC;
obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, td->td_ucred);
if (obj == NULL) {
error = ENOMEM;
goto done;
}
*objp = obj;
*flagsp = flags;
mark_atime:
vfs_mark_atime(vp, cred);
done:
vput(vp);
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
* vm_mmap_cdev()
*
* MPSAFE
*
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on cdevs.
*/
int
vm_mmap_cdev(struct thread *td, vm_size_t objsize,
vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
struct cdev *cdev, vm_ooffset_t *foff, vm_object_t *objp)
{
vm_object_t obj;
struct cdevsw *dsw;
int error, flags, ref;
flags = *flagsp;
dsw = dev_refthread(cdev, &ref);
if (dsw == NULL)
return (ENXIO);
if (dsw->d_flags & D_MMAP_ANON) {
dev_relthread(cdev, ref);
*maxprotp = VM_PROT_ALL;
*flagsp |= MAP_ANON;
return (0);
}
/*
* cdevs do not provide private mappings of any kind.
*/
if ((*maxprotp & VM_PROT_WRITE) == 0 &&
(prot & PROT_WRITE) != 0) {
dev_relthread(cdev, ref);
return (EACCES);
}
if (flags & (MAP_PRIVATE|MAP_COPY)) {
dev_relthread(cdev, ref);
return (EINVAL);
}
/*
* Force device mappings to be shared.
*/
flags |= MAP_SHARED;
#ifdef MAC_XXX
error = mac_cdev_check_mmap(td->td_ucred, cdev, prot);
if (error != 0) {
dev_relthread(cdev, ref);
return (error);
}
#endif
/*
* First, try d_mmap_single(). If that is not implemented
* (returns ENODEV), fall back to using the device pager.
* Note that d_mmap_single() must return a reference to the
* object (it needs to bump the reference count of the object
* it returns somehow).
*
* XXX assumes VM_PROT_* == PROT_*
*/
error = dsw->d_mmap_single(cdev, foff, objsize, objp, (int)prot);
dev_relthread(cdev, ref);
if (error != ENODEV)
return (error);
obj = vm_pager_allocate(OBJT_DEVICE, cdev, objsize, prot, *foff,
td->td_ucred);
if (obj == NULL)
return (EINVAL);
*objp = obj;
*flagsp = flags;
return (0);
}
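/*
 * Illustrative sketch with a hypothetical driver, not part of the
 * original change: a character device that simply wants anonymous
 * memory behind its mappings advertises D_MMAP_ANON in its cdevsw,
 *
 *	static struct cdevsw foo_cdevsw = {
 *		.d_version = D_VERSION,
 *		.d_flags = D_MMAP_ANON,
 *		.d_name = "foo",
 *	};
 *
 * and the D_MMAP_ANON branch above then turns the request into an
 * ordinary MAP_ANON mapping with VM_PROT_ALL.
 */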
/*
* vm_mmap_shm()
*
* MPSAFE
*
* Helper function for vm_mmap. Perform sanity check specific for mmap
* operations on shm file descriptors.
*/
int
vm_mmap_shm(struct thread *td, vm_size_t objsize,
vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
struct shmfd *shmfd, vm_ooffset_t foff, vm_object_t *objp)
{
int error;
if ((*flagsp & MAP_SHARED) != 0 &&
(*maxprotp & VM_PROT_WRITE) == 0 &&
(prot & PROT_WRITE) != 0)
return (EACCES);
#ifdef MAC
error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, *flagsp);
if (error != 0)
return (error);
#endif
error = shm_mmap(shmfd, objsize, foff, objp);
if (error)
return (error);
return (0);
}
/*
* vm_mmap()
*
* MPSAFE
*
* Internal version of mmap. Currently used by mmap, exec, and sys5
* shared memory. Handle is either a vnode pointer or NULL for MAP_ANON.
*/
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
vm_prot_t maxprot, int flags,
objtype_t handle_type, void *handle,
vm_ooffset_t foff)
{
boolean_t fitit;
vm_object_t object = NULL;
int rv = KERN_SUCCESS;
int docow, error;
struct thread *td = curthread;
if (size == 0)
return (0);
size = round_page(size);
PROC_LOCK(td->td_proc);
if (td->td_proc->p_vmspace->vm_map.size + size >
lim_cur(td->td_proc, RLIMIT_VMEM)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
if (racct_set(td->td_proc, RACCT_VMEM,
td->td_proc->p_vmspace->vm_map.size + size)) {
PROC_UNLOCK(td->td_proc);
return (ENOMEM);
}
PROC_UNLOCK(td->td_proc);
/*
* We currently can only deal with page aligned file offsets.
* The check is here rather than in the syscall because the
* kernel calls this function internally for other mmapping
* operations (such as in exec) and non-aligned offsets will
* cause pmap inconsistencies...so we want to be sure to
* disallow this in all cases.
*/
if (foff & PAGE_MASK)
return (EINVAL);
if ((flags & MAP_FIXED) == 0) {
fitit = TRUE;
*addr = round_page(*addr);
} else {
if (*addr != trunc_page(*addr))
return (EINVAL);
fitit = FALSE;
}
/*
* Lookup/allocate object.
*/
switch (handle_type) {
case OBJT_DEVICE:
error = vm_mmap_cdev(td, size, prot, &maxprot, &flags,
handle, &foff, &object);
break;
case OBJT_VNODE:
error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
handle, &foff, &object);
break;
case OBJT_SWAP:
error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
handle, foff, &object);
break;
case OBJT_DEFAULT:
if (handle == NULL) {
error = 0;
break;
}
/* FALLTHROUGH */
default:
error = EINVAL;
break;
}
if (error)
return (error);
if (flags & MAP_ANON) {
object = NULL;
docow = 0;
/*
* Unnamed anonymous regions always start at 0.
*/
if (handle == 0)
foff = 0;
} else if (flags & MAP_PREFAULT_READ)
docow = MAP_PREFAULT;
else
docow = MAP_PREFAULT_PARTIAL;
if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
docow |= MAP_COPY_ON_WRITE;
if (flags & MAP_NOSYNC)
docow |= MAP_DISABLE_SYNCER;
if (flags & MAP_NOCORE)
docow |= MAP_DISABLE_COREDUMP;
if (flags & MAP_STACK)
rv = vm_map_stack(map, *addr, size, prot, maxprot,
docow | MAP_STACK_GROWS_DOWN);
else if (fitit)
rv = vm_map_find(map, object, foff, addr, size,
object != NULL && object->type == OBJT_DEVICE ?
VMFS_ALIGNED_SPACE : VMFS_ANY_SPACE, prot, maxprot, docow);
else
rv = vm_map_fixed(map, object, foff, *addr, size,
prot, maxprot, docow);
if (rv != KERN_SUCCESS) {
/*
* Lose the object reference. Will destroy the
* object if it's an unnamed anonymous mapping
* or named anonymous without other references.
*/
vm_object_deallocate(object);
} else if (flags & MAP_SHARED) {
/*
* Shared memory is also shared with children.
*/
rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
if (rv != KERN_SUCCESS)
(void) vm_map_remove(map, *addr, *addr + size);
}
/*
* If the process has requested that all future mappings
* be wired, then heed this.
*/
if ((rv == KERN_SUCCESS) && (map->flags & MAP_WIREFUTURE))
vm_map_wire(map, *addr, *addr + size,
VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
return (vm_mmap_to_errno(rv));
}
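/*
 * Illustrative trace, not part of the original change: for a typical
 * private file mapping, mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0),
 * the code above ends up with fitit == TRUE, an OBJT_VNODE-backed
 * object from vm_mmap_vnode(), and
 *
 *	docow = MAP_PREFAULT_PARTIAL | MAP_COPY_ON_WRITE;
 *
 * so placement is chosen by vm_map_find() with VMFS_ANY_SPACE and the
 * pages are copy-on-write with respect to the file.
 */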
int
vm_mmap_to_errno(int rv)
{
switch (rv) {
case KERN_SUCCESS:
return (0);
case KERN_INVALID_ADDRESS:
case KERN_NO_SPACE:
return (ENOMEM);
case KERN_PROTECTION_FAILURE:
return (EACCES);
default:
return (EINVAL);
}
}
Index: head/sys/vm/vm_unix.c
===================================================================
--- head/sys/vm/vm_unix.c (revision 225616)
+++ head/sys/vm/vm_unix.c (revision 225617)
@@ -1,207 +1,207 @@
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_unix.c 1.1 89/11/07$
*
* @(#)vm_unix.c 8.1 (Berkeley) 6/11/93
*/
/*
* Traditional sbrk/grow interface to VM
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sysproto.h>
#include <sys/systm.h>
#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#ifndef _SYS_SYSPROTO_H_
struct obreak_args {
char *nsize;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-obreak(td, uap)
+sys_obreak(td, uap)
struct thread *td;
struct obreak_args *uap;
{
struct vmspace *vm = td->td_proc->p_vmspace;
vm_offset_t new, old, base;
rlim_t datalim, vmemlim;
int rv;
int error = 0;
boolean_t do_map_wirefuture;
PROC_LOCK(td->td_proc);
datalim = lim_cur(td->td_proc, RLIMIT_DATA);
vmemlim = lim_cur(td->td_proc, RLIMIT_VMEM);
PROC_UNLOCK(td->td_proc);
do_map_wirefuture = FALSE;
new = round_page((vm_offset_t)uap->nsize);
vm_map_lock(&vm->vm_map);
base = round_page((vm_offset_t) vm->vm_daddr);
old = base + ctob(vm->vm_dsize);
if (new > base) {
/*
* Check the resource limit, but allow a process to reduce
* its usage, even if it remains over the limit.
*/
if (new - base > datalim && new > old) {
error = ENOMEM;
goto done;
}
if (new > vm_map_max(&vm->vm_map)) {
error = ENOMEM;
goto done;
}
} else if (new < base) {
/*
* This is simply an invalid value. If someone wants to
* do fancy address space manipulations, mmap and munmap
* can do most of what the user would want.
*/
error = EINVAL;
goto done;
}
if (new > old) {
if (vm->vm_map.size + (new - old) > vmemlim) {
error = ENOMEM;
goto done;
}
#ifdef RACCT
PROC_LOCK(td->td_proc);
error = racct_set(td->td_proc, RACCT_DATA, new - base);
if (error != 0) {
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
error = racct_set(td->td_proc, RACCT_VMEM,
vm->vm_map.size + (new - old));
if (error != 0) {
racct_set_force(td->td_proc, RACCT_DATA, old - base);
PROC_UNLOCK(td->td_proc);
error = ENOMEM;
goto done;
}
PROC_UNLOCK(td->td_proc);
#endif
rv = vm_map_insert(&vm->vm_map, NULL, 0, old, new,
VM_PROT_RW, VM_PROT_ALL, 0);
if (rv != KERN_SUCCESS) {
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_set_force(td->td_proc, RACCT_DATA, old - base);
racct_set_force(td->td_proc, RACCT_VMEM, vm->vm_map.size);
PROC_UNLOCK(td->td_proc);
#endif
error = ENOMEM;
goto done;
}
vm->vm_dsize += btoc(new - old);
/*
* Handle the MAP_WIREFUTURE case for legacy applications,
* by marking the newly mapped range of pages as wired.
* We are not required to perform a corresponding
* vm_map_unwire() before vm_map_delete() below, as
* it will forcibly unwire the pages in the range.
*
* XXX If the pages cannot be wired, no error is returned.
*/
if ((vm->vm_map.flags & MAP_WIREFUTURE) == MAP_WIREFUTURE) {
if (bootverbose)
printf("obreak: MAP_WIREFUTURE set\n");
do_map_wirefuture = TRUE;
}
} else if (new < old) {
rv = vm_map_delete(&vm->vm_map, new, old);
if (rv != KERN_SUCCESS) {
error = ENOMEM;
goto done;
}
vm->vm_dsize -= btoc(old - new);
#ifdef RACCT
PROC_LOCK(td->td_proc);
racct_set_force(td->td_proc, RACCT_DATA, new - base);
racct_set_force(td->td_proc, RACCT_VMEM, vm->vm_map.size);
PROC_UNLOCK(td->td_proc);
#endif
}
done:
vm_map_unlock(&vm->vm_map);
if (do_map_wirefuture)
(void) vm_map_wire(&vm->vm_map, old, new,
VM_MAP_WIRE_USER|VM_MAP_WIRE_NOHOLES);
return (error);
}
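/*
 * Illustrative sketch, not part of the original change: this is the
 * kernel side of the traditional break(2) interface, so a libc
 * sbrk(incr) call amounts to roughly
 *
 *	old = current break;
 *	break(old + incr);
 *
 * with the data segment limit (RLIMIT_DATA) and vm_dsize accounting
 * handled here rather than in userland.
 */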
#ifndef _SYS_SYSPROTO_H_
struct ovadvise_args {
int anom;
};
#endif
/*
* MPSAFE
*/
/* ARGSUSED */
int
-ovadvise(td, uap)
+sys_ovadvise(td, uap)
struct thread *td;
struct ovadvise_args *uap;
{
/* START_GIANT_OPTIONAL */
/* END_GIANT_OPTIONAL */
return (EINVAL);
}
